]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
[conduit] parsers length works so full progress report should work as well
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Core.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17 {-# LANGUAGE TypeFamilies #-}
18
19 module Gargantext.Core.Text.Corpus.Parsers.Date
20 {-(parse, parseRaw, dateSplit, Year, Month, Day)-}
21 where
22
23 import Data.Aeson (toJSON, Value)
24 import Data.HashMap.Strict as HM hiding (map)
25 import Data.Text (Text, unpack, splitOn)
26 import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
27 import Data.Time.Clock (UTCTime(..), getCurrentTime)
28 import Data.Time.LocalTime (utc)
29 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
30 import Duckling.Api (analyze)
31 import Duckling.Core (makeLocale, Dimension(Time))
32 import Duckling.Types (Seal(..))
33 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime), Options(..))
34 import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
35 import Gargantext.Core (Lang(FR,EN))
36 import Gargantext.Prelude
37 import qualified Data.Aeson as Json
38 import qualified Data.HashSet as HashSet
39 import qualified Duckling.Core as DC
40
41 ------------------------------------------------------------------------
42 -- | Parse date to Ints
43 -- TODO add hours, minutes and seconds
44 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
45 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
46 dateSplit l (Just txt) = do
47 utcTime <- parse l txt
48 let (y, m, d) = split' utcTime
49 pure (Just utcTime, (Just y, Just m,Just d))
50
51 split' :: UTCTime -> (Year, Month, Day)
52 split' utcTime = (fromIntegral y, m, d)
53 where
54 (UTCTime day _) = utcTime
55 (y,m,d) = toGregorian day
56
57 type Year = Int
58 type Month = Int
59 type Day = Int
60 ------------------------------------------------------------------------
61
62 -- | Date Parser
63 -- Parses dates mentions in full text given the language.
64 -- >>> parseDate FR (pack "10 avril 1979 à 19H")
65 -- 1979-04-10 19:00:00 UTC
66 -- >>> parseDate EN (pack "April 10 1979")
67 -- 1979-04-10 00:00:00 UTC
68 parse :: Lang -> Text -> IO UTCTime
69 parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
70
71 type DateFormat = Text
72 type DateDefault = Text
73
74 parseDate' :: DateFormat
75 -> DateDefault
76 -> Lang
77 -> Text
78 -> IO UTCTime
79 parseDate' format def lang s = do
80 dateStr' <- parseRaw lang s
81 if dateStr' == ""
82 then getCurrentTime
83 else do
84 let dateStr = unpack
85 $ maybe def identity
86 $ head
87 $ splitOn "." dateStr'
88 pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
89
90
91 -- TODO add Paris at Duckling.Locale Region datatype
92 -- | To get Homogeinity of the languages
93 -- TODO : put this in a more generic place in the source code
94 parserLang :: Lang -> DC.Lang
95 parserLang FR = DC.FR
96 parserLang EN = DC.EN
97 parserLang _ = panic "not implemented"
98
99 -- | Final Date parser API
100 -- IO can be avoided here:
101 -- currentContext :: Lang -> IO Context
102 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
103 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
104
105 parseRaw :: Lang -> Text -> IO Text
106 parseRaw lang text = do -- case result
107 maybeResult <- extractValue <$> getTimeValue
108 <$> parseDateWithDuckling lang text (Options True)
109 case maybeResult of
110 Just result -> pure result
111 Nothing -> do
112 printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang)
113 text
114 pure ""
115
116 getTimeValue :: [ResolvedToken] -> Maybe Value
117 getTimeValue rt = case head rt of
118 Nothing -> do
119 Nothing
120 Just x -> case rval x of
121 RVal Time t -> Just $ toJSON t
122 _ -> do
123 Nothing
124
125 extractValue :: Maybe Value -> Maybe Text
126 extractValue (Just (Json.Object object)) =
127 case HM.lookup "value" object of
128 Just (Json.String date) -> Just date
129 _ -> Nothing
130 extractValue _ = Nothing
131
132 -- | Current Time in DucklingTime format
133 -- TODO : get local Time in a more generic way
134 utcToDucklingTime :: UTCTime -> DucklingTime
135 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
136
137 -- | Local Context which depends on Lang and Time
138 localContext :: Lang -> DucklingTime -> Context
139 localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
140
141 -- | Date parser with Duckling
142 parseDateWithDuckling :: Lang -> Text -> Options -> IO [ResolvedToken]
143 parseDateWithDuckling lang input options = do
144 contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
145 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
146 -- TODO check/test Options False or True
147 pure $ analyze input contxt options $ HashSet.fromList [(Seal Time)]
148
149
150