]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers/Date.hs
[REFACT] HyperData
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17
18 module Gargantext.Text.Corpus.Parsers.Date (parse, parseRaw, dateSplit, Year, Month, Day) where
19
20 import Data.HashMap.Strict as HM hiding (map)
21 import Data.Text (Text, unpack, splitOn, pack)
22 import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
23 import Data.Time.Clock (UTCTime(..), getCurrentTime)
24 import Data.Time.LocalTime (utc)
25 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
26 import Duckling.Api (analyze)
27 import Duckling.Core (makeLocale, Some(This), Dimension(Time))
28 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime))
29 import Duckling.Types (ResolvedToken)
30 import Duckling.Types (jsonValue)
31 import Gargantext.Core (Lang(FR,EN))
32 import Gargantext.Prelude
33 import qualified Data.Aeson as Json
34 import qualified Data.HashSet as HashSet
35 import qualified Duckling.Core as DC
36
37 ------------------------------------------------------------------------
38 -- | Parse date to Ints
39 -- TODO add hours, minutes and seconds
40 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
41 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
42 dateSplit l (Just txt) = do
43 utcTime <- parse l txt
44 let (y, m, d) = split' utcTime
45 pure (Just utcTime, (Just y, Just m,Just d))
46
47 split' :: UTCTime -> (Year, Month, Day)
48 split' utcTime = (fromIntegral y, m, d)
49 where
50 (UTCTime day _) = utcTime
51 (y,m,d) = toGregorian day
52
53 type Year = Int
54 type Month = Int
55 type Day = Int
56 ------------------------------------------------------------------------
57
58 -- | Date Parser
59 -- Parses dates mentions in full text given the language.
60 -- >>> parseDate FR (pack "10 avril 1979 à 19H")
61 -- 1979-04-10 19:00:00 UTC
62 -- >>> parseDate EN (pack "April 10 1979")
63 -- 1979-04-10 00:00:00 UTC
64 parse :: Lang -> Text -> IO UTCTime
65 parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
66
67 type DateFormat = Text
68 type DateDefault = Text
69
70 parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
71 parseDate' format def lang s = do
72 dateStr' <- parseRaw lang s
73 let dateStr = unpack $ maybe def identity
74 $ head $ splitOn "." dateStr'
75 pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
76
77
78 -- TODO add Paris at Duckling.Locale Region datatype
79 -- | To get Homogeinity of the languages
80 -- TODO : put this in a more generic place in the source code
81 parserLang :: Lang -> DC.Lang
82 parserLang FR = DC.FR
83 parserLang EN = DC.EN
84 parserLang _ = panic "not implemented"
85
86 -- | Final Date parser API
87 -- IO can be avoided here:
88 -- currentContext :: Lang -> IO Context
89 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
90 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
91
92 -- TODO error handling
93 parseRaw :: Lang -> Text -> IO (Text)
94 parseRaw lang text = do
95 maybeJson <- map jsonValue <$> parseDateWithDuckling lang text
96 case headMay maybeJson of
97 Just (Json.Object object) -> case HM.lookup "value" object of
98 Just (Json.String date) -> pure date
99 Just _ -> panic "ParseRaw ERROR: should be a json String"
100 Nothing -> panic $ "ParseRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
101
102 _ -> panic $ "ParseRaw ERROR: type error" <> (pack . show) lang <> " " <> text
103
104
105 -- | Current Time in DucklingTime format
106 -- TODO : get local Time in a more generic way
107 utcToDucklingTime :: UTCTime -> DucklingTime
108 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
109
110 -- | Local Context which depends on Lang and Time
111 localContext :: Lang -> DucklingTime -> Context
112 localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
113
114 -- | Date parser with Duckling
115 parseDateWithDuckling :: Lang -> Text -> IO [ResolvedToken]
116 parseDateWithDuckling lang input = do
117 contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
118 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
119 pure $ analyze input contxt $ HashSet.fromList [(This Time)]
120