2 Module : Gargantext.Core.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 According to the language of the text, parseDateRaw returns date as Text:
13 import Gargantext.Core.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
17 {-# LANGUAGE TypeFamilies #-}
19 module Gargantext.Core.Text.Corpus.Parsers.Date
20 {-(parse, parseRaw, dateSplit, Year, Month, Day)-}
23 import System.Environment (getEnv)
24 import Data.Aeson (toJSON, Value)
25 import Data.Either (Either(..))
26 import Data.HashMap.Strict as HM hiding (map)
27 import Data.Text (Text, unpack, splitOn, replace)
28 import Data.Time (defaultTimeLocale, toGregorian, iso8601DateFormat, parseTimeM)
29 import Data.Time.Clock (UTCTime(..), getCurrentTime)
30 import Data.Time.LocalTime (utc)
31 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
32 import Duckling.Api (analyze)
33 import Duckling.Core (makeLocale, Dimension(Time))
34 import Duckling.Types (Seal(..))
35 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime), Options(..))
36 import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
37 import Gargantext.Core (Lang(FR,EN))
38 import Gargantext.Core.Types (DebugMode(..), withDebugMode)
39 import Gargantext.Prelude
40 import qualified Control.Exception as CE
41 import qualified Data.Aeson as Json
42 import qualified Data.HashSet as HashSet
43 import qualified Duckling.Core as DC
45 ------------------------------------------------------------------------
46 -- | Parse date to Ints
47 -- TODO add hours, minutes and seconds
48 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
49 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
50 dateSplit l (Just txt) = do
51 utcTime <- parse l txt
52 let (y, m, d) = split' utcTime
53 pure (Just utcTime, (Just y, Just m,Just d))
55 split' :: UTCTime -> (Year, Month, Day)
56 split' utcTime = (fromIntegral y, m, d)
58 (UTCTime day _) = utcTime
59 (y,m,d) = toGregorian day
64 ------------------------------------------------------------------------
67 -- Parses dates mentions in full text given the language.
68 -- >>> parse FR (pack "10 avril 1900 à 19H")
69 -- 1900-04-10 19:00:00 UTC
70 -- >>> parse EN (pack "April 10 1900")
71 -- 1900-04-10 00:00:00 UTC
72 parse :: Lang -> Text -> IO UTCTime
74 dateStr' <- parseRawSafe lang s
75 case dateFlow dateStr' of
76 DateFlowSuccess ok -> pure ok
77 _ -> withDebugMode (DebugMode True)
78 "[G.C.T.P.T.Date parse]" (lang,s)
83 defaultDate = "0-0-0T0:0:0"
85 type DateFormat = Text
86 type DateDefault = Text
89 data DateFlow = DucklingSuccess { ds_result :: Text }
90 | DucklingFailure { df_result :: Text }
91 | ReadFailure1 { rf1_result :: Text }
92 | ReadFailure2 { rf2_result :: Text }
93 | DateFlowSuccess { success :: UTCTime }
98 dateFlow :: DateFlow -> DateFlow
99 dateFlow (DucklingSuccess res) = case (head $ splitOn "." res) of
100 Nothing -> dateFlow (ReadFailure1 res)
101 Just re -> case readDate res of
102 Nothing -> dateFlow (ReadFailure1 re)
103 Just ok -> DateFlowSuccess ok
104 dateFlow (DucklingFailure txt) = case readDate $ replace " " "T" txt of
105 Nothing -> dateFlow (ReadFailure1 txt)
106 Just ok -> DateFlowSuccess ok
107 dateFlow (ReadFailure1 txt) = case readDate txt of
108 Nothing -> DateFlowFailure
109 Just ok -> DateFlowSuccess ok
110 dateFlow _ = DateFlowFailure
113 readDate :: Text -> Maybe UTCTime
115 let format = cs $ iso8601DateFormat (Just "%H:%M:%S")
116 parseTimeM True defaultTimeLocale (unpack format) (cs txt)
119 -- TODO add Paris at Duckling.Locale Region datatype
120 -- | To get Homogeinity of the languages
121 -- TODO : put this in a more generic place in the source code
122 parserLang :: Lang -> DC.Lang
123 parserLang FR = DC.FR
124 parserLang EN = DC.EN
125 parserLang lang = panic $ "[G.C.T.C.P.Date] Lang not implemented" <> (cs $ show lang)
127 -- | Final Date parser API
128 -- IO can be avoided here:
129 -- currentContext :: Lang -> IO Context
130 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
131 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
134 parseRawSafe :: Lang -> Text -> IO DateFlow
135 parseRawSafe lang text = do
136 triedParseRaw <- tryParseRaw lang text
137 dateStr' <- case triedParseRaw of
138 Left (CE.SomeException err) -> do
139 envLang <- getEnv "LANG"
140 printDebug "[G.C.T.C.P.Date] Exception: " (err, envLang, lang, text)
141 pure $ DucklingFailure text
142 Right res -> pure $ DucklingSuccess res
145 tryParseRaw :: CE.Exception e => Lang -> Text -> IO (Either e Text)
146 tryParseRaw lang text = CE.try (parseRaw lang text)
148 parseRaw :: Lang -> Text -> IO Text
149 parseRaw lang text = do -- case result
150 maybeResult <- extractValue <$> getTimeValue
151 <$> parseDateWithDuckling lang text (Options True)
153 Just result -> pure result
155 printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang)
159 getTimeValue :: [ResolvedToken] -> Maybe Value
160 getTimeValue rt = case head rt of
163 Just x -> case rval x of
164 RVal Time t -> Just $ toJSON t
168 extractValue :: Maybe Value -> Maybe Text
169 extractValue (Just (Json.Object object)) =
170 case HM.lookup "value" object of
171 Just (Json.String date) -> Just date
173 extractValue _ = Nothing
175 -- | Current Time in DucklingTime format
176 -- TODO : get local Time in a more generic way
177 utcToDucklingTime :: UTCTime -> DucklingTime
178 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
180 -- | Local Context which depends on Lang and Time
181 localContext :: Lang -> DucklingTime -> Context
182 localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
184 -- | Date parser with Duckling
185 parseDateWithDuckling :: Lang -> Text -> Options -> IO [ResolvedToken]
186 parseDateWithDuckling lang input options = do
187 contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
188 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
189 -- TODO check/test Options False or True
190 pure $ analyze input contxt options $ HashSet.fromList [(Seal Time)]