]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
[MERGE]
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Core.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17 {-# LANGUAGE TypeFamilies #-}
18
19 module Gargantext.Core.Text.Corpus.Parsers.Date
20 {-(parse, parseRaw, dateSplit, Year, Month, Day)-}
21 where
22
23 --import qualified Control.Exception as CE
24 import Data.Aeson (toJSON, Value)
25 import Data.Either (Either(..))
26 import Data.HashMap.Strict as HM hiding (map)
27 import Data.Maybe (fromMaybe)
28 import Data.Text (Text, unpack, splitOn, replace)
29 import Data.Time (defaultTimeLocale, iso8601DateFormat, parseTimeM, toGregorian)
30 import Data.Time.Clock ( secondsToDiffTime)
31 import Data.Time.Clock (UTCTime(..), getCurrentTime)
32 import Data.Time.LocalTime (utc)
33 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
34 import Duckling.Api (analyze)
35 import Duckling.Core (makeLocale, Dimension(Time))
36 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime), Options(..))
37 import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
38 import Duckling.Types (Seal(..))
39 import Gargantext.Core (Lang(FR,EN))
40 import Gargantext.Core.Types (DebugMode(..), withDebugMode)
41 import Gargantext.Prelude
42 import System.Environment (getEnv)
43 import qualified Data.Aeson as Json
44 import qualified Data.HashSet as HashSet
45 import qualified Data.Time.Calendar as DTC
46 import qualified Duckling.Core as DC
47 import qualified Data.List as List
48 ------------------------------------------------------------------------
49 -- | Parse date to Ints
50 -- TODO add hours, minutes and seconds
51 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
52 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
53 dateSplit l (Just txt) = do
54 utcTime <- parse l txt
55 let (y, m, d) = split' utcTime
56 pure (Just utcTime, (Just y, Just m,Just d))
57
58 split' :: UTCTime -> (Year, Month, Day)
59 split' utcTime = (fromIntegral y, m, d)
60 where
61 (UTCTime day _) = utcTime
62 (y,m,d) = toGregorian day
63
64 type Year = Int
65 type Month = Int
66 type Day = Int
67 ------------------------------------------------------------------------
68
69 -- | Date Parser
70 -- Parses dates mentions in full text given the language.
71 -- >>> parse FR (pack "1 avril 1900 à 19H")
72 -- 1900-04-01 19:00:00 UTC
73 -- >>> parse EN (pack "April 1 1900")
74 -- 1900-04-01 00:00:00 UTC
75 parse :: Lang -> Text -> IO UTCTime
76 parse lang s = do
77 -- printDebug "Date: " s
78 let result = dateFlow (DucklingFailure s)
79 --printDebug "Date': " dateStr'
80 case result of
81 DateFlowSuccess ok -> pure ok
82 DateFlowFailure -> (withDebugMode (DebugMode True)
83 "[G.C.T.P.T.Date parse]" (lang,s)
84 $ getCurrentTime)
85 _ -> panic "[G.C.T.C.Parsers.Date] parse: Should not happen"
86
87 defaultDate :: Text
88 defaultDate = "0-0-0T0:0:0"
89
90 type DateFormat = Text
91 type DateDefault = Text
92
93
94 data DateFlow = DucklingSuccess { ds_result :: Text }
95 | DucklingFailure { df_result :: Text }
96 | ReadFailure1 { rf1_result :: Text }
97 | ReadFailure2 { rf2_result :: Text }
98 | DateFlowSuccess { success :: UTCTime }
99 | DateFlowFailure
100 deriving Show
101
102 --{-
103 dateFlow :: DateFlow -> DateFlow
104 dateFlow (DucklingSuccess res) = case (head $ splitOn "." res) of
105 Nothing -> dateFlow (ReadFailure1 res)
106 Just re -> case readDate res of
107 Nothing -> dateFlow (ReadFailure1 re)
108 Just ok -> DateFlowSuccess ok
109 --dateFlow (DucklingFailure txt) = case readDate $ replace " " "T" txt of
110 dateFlow (DucklingFailure txt) = case readDate (fromMaybe "" $ headMay $ List.filter (/= "") $ splitOn " " txt) of
111 Nothing -> dateFlow (ReadFailure1 txt)
112 Just ok -> DateFlowSuccess ok
113 dateFlow (ReadFailure1 txt) = case readDate txt of
114 Nothing -> dateFlow $ ReadFailure2 txt
115 Just ok -> DateFlowSuccess ok
116 dateFlow (ReadFailure2 txt) = case readDate $ replace " " "" txt <> "-01-01" of
117 Nothing -> DateFlowFailure
118 Just ok -> DateFlowSuccess ok
119 dateFlow _ = DateFlowFailure
120 --}
121
122 readDate :: Text -> Maybe UTCTime
123 readDate txt = do
124 --let format = cs $ iso8601DateFormat (Just "%F %H:%M:%S")
125 let format = cs $ iso8601DateFormat Nothing
126 parseTimeM True defaultTimeLocale (unpack format) (cs txt)
127
128
129 -- TODO add Paris at Duckling.Locale Region datatype
130 -- | To get Homogeinity of the languages
131 -- TODO : put this in a more generic place in the source code
132 parserLang :: Lang -> DC.Lang
133 parserLang FR = DC.FR
134 parserLang EN = DC.EN
135 parserLang lang = panic $ "[G.C.T.C.P.Date] Lang not implemented" <> (cs $ show lang)
136
137 -- | Final Date parser API
138 -- IO can be avoided here:
139 -- currentContext :: Lang -> IO Context
140 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
141 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
142
143
144 parseRawSafe :: Lang -> Text -> IO DateFlow
145 parseRawSafe lang text = do
146 let triedParseRaw = parseRaw lang text
147 dateStr' <- case triedParseRaw of
148 --Left (CE.SomeException err) -> do
149 Left err -> do
150 envLang <- getEnv "LANG"
151 printDebug "[G.C.T.C.P.Date] Exception: " (err, envLang, lang, text)
152 pure $ DucklingFailure text
153 Right res -> pure $ DucklingSuccess res
154 pure dateStr'
155
156 --tryParseRaw :: CE.Exception e => Lang -> Text -> IO (Either e Text)
157 --tryParseRaw lang text = CE.try (parseRaw lang text)
158
159 parseRaw :: Lang -> Text -> Either Text Text
160 parseRaw lang text = do -- case result
161 let maybeResult = extractValue $ getTimeValue
162 $ parseDateWithDuckling lang text (Options True)
163 case maybeResult of
164 Just result -> Right result
165 Nothing -> do
166 -- printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang) text
167 Left $ "[G.C.T.C.P.D.parseRaw ERROR] " <> (cs . show) lang <> " :: " <> text
168
169 getTimeValue :: [ResolvedToken] -> Maybe Value
170 getTimeValue rt = case head rt of
171 Nothing -> do
172 Nothing
173 Just x -> case rval x of
174 RVal Time t -> Just $ toJSON t
175 _ -> do
176 Nothing
177
178 extractValue :: Maybe Value -> Maybe Text
179 extractValue (Just (Json.Object object)) =
180 case HM.lookup "value" object of
181 Just (Json.String date) -> Just date
182 _ -> Nothing
183 extractValue _ = Nothing
184
185 -- | Current Time in DucklingTime format
186 -- TODO : get local Time in a more generic way
187 utcToDucklingTime :: UTCTime -> DucklingTime
188 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
189
190 -- | Local Context which depends on Lang and Time
191 localContext :: Lang -> DucklingTime -> Context
192 localContext lang dt = Context { referenceTime = dt
193 , locale = makeLocale (parserLang lang) Nothing }
194
195 defaultDay :: DTC.Day
196 defaultDay = DTC.fromGregorian 1 1 1
197
198 defaultUTCTime :: UTCTime
199 defaultUTCTime = UTCTime { utctDay = defaultDay
200 , utctDayTime = secondsToDiffTime 0 }
201
202 -- | Date parser with Duckling
203 parseDateWithDuckling :: Lang -> Text -> Options -> [ResolvedToken]
204 parseDateWithDuckling lang input options = do
205 let contxt = localContext lang $ utcToDucklingTime defaultUTCTime
206 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
207 -- TODO check/test Options False or True
208 analyze input contxt options $ HashSet.fromList [(Seal Time)]
209