]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
[FIX] Date Parser for WOS
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Core.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17 {-# LANGUAGE TypeFamilies #-}
18
19 module Gargantext.Core.Text.Corpus.Parsers.Date
20 {-(parse, parseRaw, dateSplit, Year, Month, Day)-}
21 where
22
23 import System.Environment (getEnv)
24 import Data.Aeson (toJSON, Value)
25 import Data.Either (Either(..))
26 import Data.HashMap.Strict as HM hiding (map)
27 import Data.Text (Text, unpack, splitOn, replace)
28 import Data.Time (defaultTimeLocale, toGregorian, iso8601DateFormat, parseTimeM)
29 import Data.Time.Clock (UTCTime(..), getCurrentTime)
30 import Data.Time.LocalTime (utc)
31 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
32 import Duckling.Api (analyze)
33 import Duckling.Core (makeLocale, Dimension(Time))
34 import Duckling.Types (Seal(..))
35 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime), Options(..))
36 import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
37 import Gargantext.Core (Lang(FR,EN))
38 import Gargantext.Core.Types (DebugMode(..), withDebugMode)
39 import Gargantext.Prelude
40 import qualified Control.Exception as CE
41 import qualified Data.Aeson as Json
42 import qualified Data.HashSet as HashSet
43 import qualified Duckling.Core as DC
44
45 ------------------------------------------------------------------------
46 -- | Parse date to Ints
47 -- TODO add hours, minutes and seconds
48 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
49 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
50 dateSplit l (Just txt) = do
51 utcTime <- parse l txt
52 let (y, m, d) = split' utcTime
53 pure (Just utcTime, (Just y, Just m,Just d))
54
55 split' :: UTCTime -> (Year, Month, Day)
56 split' utcTime = (fromIntegral y, m, d)
57 where
58 (UTCTime day _) = utcTime
59 (y,m,d) = toGregorian day
60
61 type Year = Int
62 type Month = Int
63 type Day = Int
64 ------------------------------------------------------------------------
65
66 -- | Date Parser
67 -- Parses dates mentions in full text given the language.
68 -- >>> parse FR (pack "10 avril 1900 à 19H")
69 -- 1900-04-10 19:00:00 UTC
70 -- >>> parse EN (pack "April 10 1900")
71 -- 1900-04-10 00:00:00 UTC
72 parse :: Lang -> Text -> IO UTCTime
73 parse lang s = do
74 dateStr' <- parseRawSafe lang s
75 case dateFlow dateStr' of
76 DateFlowSuccess ok -> pure ok
77 _ -> withDebugMode (DebugMode True)
78 "[G.C.T.P.T.Date parse]" (lang,s)
79 $ getCurrentTime
80
81
82 defaultDate :: Text
83 defaultDate = "0-0-0T0:0:0"
84
85 type DateFormat = Text
86 type DateDefault = Text
87
88
89 data DateFlow = DucklingSuccess { ds_result :: Text }
90 | DucklingFailure { df_result :: Text }
91 | ReadFailure1 { rf1_result :: Text }
92 | ReadFailure2 { rf2_result :: Text }
93 | DateFlowSuccess { success :: UTCTime }
94 | DateFlowFailure
95 deriving Show
96
97 --{-
98 dateFlow :: DateFlow -> DateFlow
99 dateFlow (DucklingSuccess res) = case (head $ splitOn "." res) of
100 Nothing -> dateFlow (ReadFailure1 res)
101 Just re -> case readDate res of
102 Nothing -> dateFlow (ReadFailure1 re)
103 Just ok -> DateFlowSuccess ok
104 dateFlow (DucklingFailure txt) = case readDate $ replace " " "T" txt of
105 Nothing -> dateFlow (ReadFailure1 txt)
106 Just ok -> DateFlowSuccess ok
107 dateFlow (ReadFailure1 txt) = case readDate txt of
108 Nothing -> dateFlow $ ReadFailure2 txt
109 Just ok -> DateFlowSuccess ok
110 dateFlow (ReadFailure2 txt) = case readDate $ replace " " "" txt <> "-01-01T00:00:00" of
111 Nothing -> DateFlowFailure
112 Just ok -> DateFlowSuccess ok
113 dateFlow _ = DateFlowFailure
114 --}
115
116 readDate :: Text -> Maybe UTCTime
117 readDate txt = do
118 let format = cs $ iso8601DateFormat (Just "%H:%M:%S")
119 parseTimeM True defaultTimeLocale (unpack format) (cs txt)
120
121
122 -- TODO add Paris at Duckling.Locale Region datatype
123 -- | To get Homogeinity of the languages
124 -- TODO : put this in a more generic place in the source code
125 parserLang :: Lang -> DC.Lang
126 parserLang FR = DC.FR
127 parserLang EN = DC.EN
128 parserLang lang = panic $ "[G.C.T.C.P.Date] Lang not implemented" <> (cs $ show lang)
129
130 -- | Final Date parser API
131 -- IO can be avoided here:
132 -- currentContext :: Lang -> IO Context
133 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
134 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
135
136
137 parseRawSafe :: Lang -> Text -> IO DateFlow
138 parseRawSafe lang text = do
139 triedParseRaw <- tryParseRaw lang text
140 dateStr' <- case triedParseRaw of
141 Left (CE.SomeException err) -> do
142 envLang <- getEnv "LANG"
143 printDebug "[G.C.T.C.P.Date] Exception: " (err, envLang, lang, text)
144 pure $ DucklingFailure text
145 Right res -> pure $ DucklingSuccess res
146 pure dateStr'
147
148 tryParseRaw :: CE.Exception e => Lang -> Text -> IO (Either e Text)
149 tryParseRaw lang text = CE.try (parseRaw lang text)
150
151 parseRaw :: Lang -> Text -> IO Text
152 parseRaw lang text = do -- case result
153 maybeResult <- extractValue <$> getTimeValue
154 <$> parseDateWithDuckling lang text (Options True)
155 case maybeResult of
156 Just result -> pure result
157 Nothing -> do
158 printDebug ("[G.C.T.C.P.D.parseRaw] ERROR " <> (cs . show) lang)
159 text
160 pure ""
161
162 getTimeValue :: [ResolvedToken] -> Maybe Value
163 getTimeValue rt = case head rt of
164 Nothing -> do
165 Nothing
166 Just x -> case rval x of
167 RVal Time t -> Just $ toJSON t
168 _ -> do
169 Nothing
170
171 extractValue :: Maybe Value -> Maybe Text
172 extractValue (Just (Json.Object object)) =
173 case HM.lookup "value" object of
174 Just (Json.String date) -> Just date
175 _ -> Nothing
176 extractValue _ = Nothing
177
178 -- | Current Time in DucklingTime format
179 -- TODO : get local Time in a more generic way
180 utcToDucklingTime :: UTCTime -> DucklingTime
181 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
182
183 -- | Local Context which depends on Lang and Time
184 localContext :: Lang -> DucklingTime -> Context
185 localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
186
187 -- | Date parser with Duckling
188 parseDateWithDuckling :: Lang -> Text -> Options -> IO [ResolvedToken]
189 parseDateWithDuckling lang input options = do
190 contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
191 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
192 -- TODO check/test Options False or True
193 pure $ analyze input contxt options $ HashSet.fromList [(Seal Time)]
194