]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
Merge branch 'dev' into 97-dev-istex-search
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Core.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17 {-# LANGUAGE TypeFamilies #-}
18
19 module Gargantext.Core.Text.Corpus.Parsers.Date
20 {-(parse, parseRaw, dateSplit, Year, Month, Day)-}
21 where
22
23 import System.Environment (getEnv)
24 import Data.Aeson (toJSON, Value)
25 import Data.Either (Either(..))
26 import Data.HashMap.Strict as HM hiding (map)
27 import Data.Text (Text, unpack, splitOn, replace)
28 import Data.Time (defaultTimeLocale, iso8601DateFormat, parseTimeM, toGregorian)
29 import qualified Data.Time.Calendar as DTC
30 import Data.Time.Clock (UTCTime(..), getCurrentTime)
31 import Data.Time.Clock ( secondsToDiffTime)
32 import Data.Time.LocalTime (utc)
33 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
34 import Duckling.Api (analyze)
35 import Duckling.Core (makeLocale, Dimension(Time))
36 import Duckling.Types (Seal(..))
37 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime), Options(..))
38 import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
39 import Gargantext.Core (Lang(FR,EN))
40 import Gargantext.Core.Types (DebugMode(..), withDebugMode)
41 import Gargantext.Prelude
42 --import qualified Control.Exception as CE
43 import qualified Data.Aeson as Json
44 import qualified Data.HashSet as HashSet
45 import qualified Duckling.Core as DC
46
47 ------------------------------------------------------------------------
48 -- | Parse date to Ints
49 -- TODO add hours, minutes and seconds
50 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
51 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
52 dateSplit l (Just txt) = do
53 utcTime <- parse l txt
54 let (y, m, d) = split' utcTime
55 pure (Just utcTime, (Just y, Just m,Just d))
56
57 split' :: UTCTime -> (Year, Month, Day)
58 split' utcTime = (fromIntegral y, m, d)
59 where
60 (UTCTime day _) = utcTime
61 (y,m,d) = toGregorian day
62
63 type Year = Int
64 type Month = Int
65 type Day = Int
66 ------------------------------------------------------------------------
67
68 -- | Date Parser
69 -- Parses dates mentions in full text given the language.
70 -- >>> parse FR (pack "10 avril 1900 à 19H")
71 -- 1900-04-10 19:00:00 UTC
72 -- >>> parse EN (pack "April 10 1900")
73 -- 1900-04-10 00:00:00 UTC
74 parse :: Lang -> Text -> IO UTCTime
75 parse lang s = do
76 dateStr' <- parseRawSafe lang s
77 case dateFlow dateStr' of
78 DateFlowSuccess ok -> pure ok
79 _ -> withDebugMode (DebugMode True)
80 "[G.C.T.P.T.Date parse]" (lang,s)
81 $ getCurrentTime
82
83
84 defaultDate :: Text
85 defaultDate = "0-0-0T0:0:0"
86
87 type DateFormat = Text
88 type DateDefault = Text
89
90
91 data DateFlow = DucklingSuccess { ds_result :: Text }
92 | DucklingFailure { df_result :: Text }
93 | ReadFailure1 { rf1_result :: Text }
94 | ReadFailure2 { rf2_result :: Text }
95 | DateFlowSuccess { success :: UTCTime }
96 | DateFlowFailure
97 deriving Show
98
99 --{-
100 dateFlow :: DateFlow -> DateFlow
101 dateFlow (DucklingSuccess res) = case (head $ splitOn "." res) of
102 Nothing -> dateFlow (ReadFailure1 res)
103 Just re -> case readDate res of
104 Nothing -> dateFlow (ReadFailure1 re)
105 Just ok -> DateFlowSuccess ok
106 dateFlow (DucklingFailure txt) = case readDate $ replace " " "T" txt of
107 Nothing -> dateFlow (ReadFailure1 txt)
108 Just ok -> DateFlowSuccess ok
109 dateFlow (ReadFailure1 txt) = case readDate txt of
110 Nothing -> dateFlow $ ReadFailure2 txt
111 Just ok -> DateFlowSuccess ok
112 dateFlow (ReadFailure2 txt) = case readDate $ replace " " "" txt <> "-01-01T00:00:00" of
113 Nothing -> DateFlowFailure
114 Just ok -> DateFlowSuccess ok
115 dateFlow _ = DateFlowFailure
116 --}
117
118 readDate :: Text -> Maybe UTCTime
119 readDate txt = do
120 let format = cs $ iso8601DateFormat (Just "%H:%M:%S")
121 parseTimeM True defaultTimeLocale (unpack format) (cs txt)
122
123
124 -- TODO add Paris at Duckling.Locale Region datatype
125 -- | To get Homogeinity of the languages
126 -- TODO : put this in a more generic place in the source code
127 parserLang :: Lang -> DC.Lang
128 parserLang FR = DC.FR
129 parserLang EN = DC.EN
130 parserLang lang = panic $ "[G.C.T.C.P.Date] Lang not implemented" <> (cs $ show lang)
131
132 -- | Final Date parser API
133 -- IO can be avoided here:
134 -- currentContext :: Lang -> IO Context
135 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
136 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
137
138
139 parseRawSafe :: Lang -> Text -> IO DateFlow
140 parseRawSafe lang text = do
141 let triedParseRaw = parseRaw lang text
142 dateStr' <- case triedParseRaw of
143 --Left (CE.SomeException err) -> do
144 Left err -> do
145 envLang <- getEnv "LANG"
146 printDebug "[G.C.T.C.P.Date] Exception: " (err, envLang, lang, text)
147 pure $ DucklingFailure text
148 Right res -> pure $ DucklingSuccess res
149 pure dateStr'
150
151 --tryParseRaw :: CE.Exception e => Lang -> Text -> IO (Either e Text)
152 --tryParseRaw lang text = CE.try (parseRaw lang text)
153
154 parseRaw :: Lang -> Text -> Either Text Text
155 parseRaw lang text = do -- case result
156 let maybeResult = extractValue $ getTimeValue
157 $ parseDateWithDuckling lang text (Options True)
158 case maybeResult of
159 Just result -> Right result
160 Nothing -> do
161 -- printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang) text
162 Left $ "[G.C.T.C.P.D.parseRaw ERROR] " <> (cs . show) lang <> " :: " <> text
163
164 getTimeValue :: [ResolvedToken] -> Maybe Value
165 getTimeValue rt = case head rt of
166 Nothing -> do
167 Nothing
168 Just x -> case rval x of
169 RVal Time t -> Just $ toJSON t
170 _ -> do
171 Nothing
172
173 extractValue :: Maybe Value -> Maybe Text
174 extractValue (Just (Json.Object object)) =
175 case HM.lookup "value" object of
176 Just (Json.String date) -> Just date
177 _ -> Nothing
178 extractValue _ = Nothing
179
180 -- | Current Time in DucklingTime format
181 -- TODO : get local Time in a more generic way
182 utcToDucklingTime :: UTCTime -> DucklingTime
183 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
184
185 -- | Local Context which depends on Lang and Time
186 localContext :: Lang -> DucklingTime -> Context
187 localContext lang dt = Context { referenceTime = dt
188 , locale = makeLocale (parserLang lang) Nothing }
189
190 defaultDay :: DTC.Day
191 defaultDay = DTC.fromGregorian 1 1 1
192
193 defaultUTCTime :: UTCTime
194 defaultUTCTime = UTCTime { utctDay = defaultDay
195 , utctDayTime = secondsToDiffTime 0 }
196
197 -- | Date parser with Duckling
198 parseDateWithDuckling :: Lang -> Text -> Options -> [ResolvedToken]
199 parseDateWithDuckling lang input options = do
200 let contxt = localContext lang $ utcToDucklingTime defaultUTCTime
201 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
202 -- TODO check/test Options False or True
203 analyze input contxt options $ HashSet.fromList [(Seal Time)]
204