]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
[ngrams] some more fixes and refactorings
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Core.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17 {-# LANGUAGE TypeFamilies #-}
18
19 module Gargantext.Core.Text.Corpus.Parsers.Date
20 {-(parse, parseRaw, dateSplit, Year, Month, Day)-}
21 where
22
23 import System.Environment (getEnv)
24 import Data.Aeson (toJSON, Value)
25 import Data.Either (Either(..))
26 import Data.HashMap.Strict as HM hiding (map)
27 import Data.Text (Text, unpack, splitOn, replace)
28 import Data.Time (defaultTimeLocale, iso8601DateFormat, parseTimeM, toGregorian)
29 import qualified Data.Time.Calendar as DTC
30 import Data.Time.Clock (UTCTime(..), getCurrentTime)
31 import Data.Time.Clock ( secondsToDiffTime)
32 import Data.Time.LocalTime (utc)
33 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
34 import Duckling.Api (analyze)
35 import Duckling.Core (makeLocale, Dimension(Time))
36 import Duckling.Types (Seal(..))
37 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime), Options(..))
38 import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
39 import Gargantext.Core (Lang(FR,EN))
40 import Gargantext.Core.Types (DebugMode(..), withDebugMode)
41 import Gargantext.Prelude
42 --import qualified Control.Exception as CE
43 import qualified Data.Aeson as Json
44 import qualified Data.HashSet as HashSet
45 import qualified Duckling.Core as DC
46
47 ------------------------------------------------------------------------
48 -- | Parse date to Ints
49 -- TODO add hours, minutes and seconds
50 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
51 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
52 dateSplit l (Just txt) = do
53 utcTime <- parse l txt
54 let (y, m, d) = split' utcTime
55 pure (Just utcTime, (Just y, Just m,Just d))
56
57 split' :: UTCTime -> (Year, Month, Day)
58 split' utcTime = (fromIntegral y, m, d)
59 where
60 (UTCTime day _) = utcTime
61 (y,m,d) = toGregorian day
62
63 type Year = Int
64 type Month = Int
65 type Day = Int
66 ------------------------------------------------------------------------
67
68 -- | Date Parser
69 -- Parses dates mentions in full text given the language.
70 -- >>> parse FR (pack "1 avril 1900 à 19H")
71 -- 1900-04-01 19:00:00 UTC
72 -- >>> parse EN (pack "April 1 1900")
73 -- 1900-04-01 00:00:00 UTC
74 parse :: Lang -> Text -> IO UTCTime
75 parse lang s = do
76 --printDebug "Date: " s
77 dateStr' <- pure $ dateFlow (DucklingFailure s) -- parseRawSafe lang s
78 --printDebug "Date': " dateStr'
79 case dateFlow dateStr' of
80 DateFlowSuccess ok -> pure ok
81 _ -> withDebugMode (DebugMode True)
82 "[G.C.T.P.T.Date parse]" (lang,s)
83 $ getCurrentTime
84
85
86 defaultDate :: Text
87 defaultDate = "0-0-0T0:0:0"
88
89 type DateFormat = Text
90 type DateDefault = Text
91
92
93 data DateFlow = DucklingSuccess { ds_result :: Text }
94 | DucklingFailure { df_result :: Text }
95 | ReadFailure1 { rf1_result :: Text }
96 | ReadFailure2 { rf2_result :: Text }
97 | DateFlowSuccess { success :: UTCTime }
98 | DateFlowFailure
99 deriving Show
100
101 --{-
102 dateFlow :: DateFlow -> DateFlow
103 dateFlow (DucklingSuccess res) = case (head $ splitOn "." res) of
104 Nothing -> dateFlow (ReadFailure1 res)
105 Just re -> case readDate res of
106 Nothing -> dateFlow (ReadFailure1 re)
107 Just ok -> DateFlowSuccess ok
108 dateFlow (DucklingFailure txt) = case readDate $ replace " " "T" txt of
109 Nothing -> dateFlow (ReadFailure1 txt)
110 Just ok -> DateFlowSuccess ok
111 dateFlow (ReadFailure1 txt) = case readDate txt of
112 Nothing -> dateFlow $ ReadFailure2 txt
113 Just ok -> DateFlowSuccess ok
114 dateFlow (ReadFailure2 txt) = case readDate $ replace " " "" txt <> "-01-01T00:00:00" of
115 Nothing -> DateFlowFailure
116 Just ok -> DateFlowSuccess ok
117 dateFlow _ = DateFlowFailure
118 --}
119
120 readDate :: Text -> Maybe UTCTime
121 readDate txt = do
122 let format = cs $ iso8601DateFormat (Just "%H:%M:%S")
123 parseTimeM True defaultTimeLocale (unpack format) (cs txt)
124
125
126 -- TODO add Paris at Duckling.Locale Region datatype
127 -- | To get Homogeinity of the languages
128 -- TODO : put this in a more generic place in the source code
129 parserLang :: Lang -> DC.Lang
130 parserLang FR = DC.FR
131 parserLang EN = DC.EN
132 parserLang lang = panic $ "[G.C.T.C.P.Date] Lang not implemented" <> (cs $ show lang)
133
134 -- | Final Date parser API
135 -- IO can be avoided here:
136 -- currentContext :: Lang -> IO Context
137 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
138 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
139
140
141 parseRawSafe :: Lang -> Text -> IO DateFlow
142 parseRawSafe lang text = do
143 let triedParseRaw = parseRaw lang text
144 dateStr' <- case triedParseRaw of
145 --Left (CE.SomeException err) -> do
146 Left err -> do
147 envLang <- getEnv "LANG"
148 printDebug "[G.C.T.C.P.Date] Exception: " (err, envLang, lang, text)
149 pure $ DucklingFailure text
150 Right res -> pure $ DucklingSuccess res
151 pure dateStr'
152
153 --tryParseRaw :: CE.Exception e => Lang -> Text -> IO (Either e Text)
154 --tryParseRaw lang text = CE.try (parseRaw lang text)
155
156 parseRaw :: Lang -> Text -> Either Text Text
157 parseRaw lang text = do -- case result
158 let maybeResult = extractValue $ getTimeValue
159 $ parseDateWithDuckling lang text (Options True)
160 case maybeResult of
161 Just result -> Right result
162 Nothing -> do
163 -- printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang) text
164 Left $ "[G.C.T.C.P.D.parseRaw ERROR] " <> (cs . show) lang <> " :: " <> text
165
166 getTimeValue :: [ResolvedToken] -> Maybe Value
167 getTimeValue rt = case head rt of
168 Nothing -> do
169 Nothing
170 Just x -> case rval x of
171 RVal Time t -> Just $ toJSON t
172 _ -> do
173 Nothing
174
175 extractValue :: Maybe Value -> Maybe Text
176 extractValue (Just (Json.Object object)) =
177 case HM.lookup "value" object of
178 Just (Json.String date) -> Just date
179 _ -> Nothing
180 extractValue _ = Nothing
181
182 -- | Current Time in DucklingTime format
183 -- TODO : get local Time in a more generic way
184 utcToDucklingTime :: UTCTime -> DucklingTime
185 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
186
187 -- | Local Context which depends on Lang and Time
188 localContext :: Lang -> DucklingTime -> Context
189 localContext lang dt = Context { referenceTime = dt
190 , locale = makeLocale (parserLang lang) Nothing }
191
192 defaultDay :: DTC.Day
193 defaultDay = DTC.fromGregorian 1 1 1
194
195 defaultUTCTime :: UTCTime
196 defaultUTCTime = UTCTime { utctDay = defaultDay
197 , utctDayTime = secondsToDiffTime 0 }
198
199 -- | Date parser with Duckling
200 parseDateWithDuckling :: Lang -> Text -> Options -> [ResolvedToken]
201 parseDateWithDuckling lang input options = do
202 let contxt = localContext lang $ utcToDucklingTime defaultUTCTime
203 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
204 -- TODO check/test Options False or True
205 analyze input contxt options $ HashSet.fromList [(Seal Time)]
206