]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Corpus/Parsers/Date.hs
[version] bump to 0.0.1.4.2
[gargantext.git] / src / Gargantext / Text / Corpus / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17 {-# LANGUAGE FlexibleContexts #-}
18 {-# LANGUAGE NoImplicitPrelude #-}
19 {-# LANGUAGE OverloadedStrings #-}
20
21 module Gargantext.Text.Corpus.Parsers.Date (parse, parseRaw, dateSplit, Year, Month, Day) where
22
23 import Data.HashMap.Strict as HM hiding (map)
24 import Data.Text (Text, unpack, splitOn, pack)
25 import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
26 import Data.Time.Clock (UTCTime(..), getCurrentTime)
27 import Data.Time.LocalTime (utc)
28 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
29 import Duckling.Api (analyze)
30 import Duckling.Core (makeLocale, Some(This), Dimension(Time))
31 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime))
32 import Duckling.Types (ResolvedToken)
33 import Duckling.Types (jsonValue)
34 import Gargantext.Core (Lang(FR,EN))
35 import Gargantext.Prelude
36 import qualified Data.Aeson as Json
37 import qualified Data.HashSet as HashSet
38 import qualified Duckling.Core as DC
39
40 ------------------------------------------------------------------------
41 -- | Parse date to Ints
42 -- TODO add hours, minutes and seconds
43 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
44 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
45 dateSplit l (Just txt) = do
46 utcTime <- parse l txt
47 let (y, m, d) = split' utcTime
48 pure (Just utcTime, (Just y, Just m,Just d))
49
50 split' :: UTCTime -> (Year, Month, Day)
51 split' utcTime = (fromIntegral y, m, d)
52 where
53 (UTCTime day _) = utcTime
54 (y,m,d) = toGregorian day
55
56 type Year = Int
57 type Month = Int
58 type Day = Int
59 ------------------------------------------------------------------------
60
61 -- | Date Parser
62 -- Parses dates mentions in full text given the language.
63 -- >>> parseDate FR (pack "10 avril 1979 à 19H")
64 -- 1979-04-10 19:00:00 UTC
65 -- >>> parseDate EN (pack "April 10 1979")
66 -- 1979-04-10 00:00:00 UTC
67 parse :: Lang -> Text -> IO UTCTime
68 parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
69
70 type DateFormat = Text
71 type DateDefault = Text
72
73 parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
74 parseDate' format def lang s = do
75 dateStr' <- parseRaw lang s
76 let dateStr = unpack $ maybe def identity
77 $ head $ splitOn "." dateStr'
78 pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
79
80
81 -- TODO add Paris at Duckling.Locale Region datatype
82 -- | To get Homogeinity of the languages
83 -- TODO : put this in a more generic place in the source code
84 parserLang :: Lang -> DC.Lang
85 parserLang FR = DC.FR
86 parserLang EN = DC.EN
87 parserLang _ = panic "not implemented"
88
89 -- | Final Date parser API
90 -- IO can be avoided here:
91 -- currentContext :: Lang -> IO Context
92 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
93 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
94
95 -- TODO error handling
96 parseRaw :: Lang -> Text -> IO (Text)
97 parseRaw lang text = do
98 maybeJson <- map jsonValue <$> parseDateWithDuckling lang text
99 case headMay maybeJson of
100 Just (Json.Object object) -> case HM.lookup "value" object of
101 Just (Json.String date) -> pure date
102 Just _ -> panic "ParseRaw ERROR: should be a json String"
103 Nothing -> panic $ "ParseRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
104
105 _ -> panic $ "ParseRaw ERROR: type error" <> (pack . show) lang <> " " <> text
106
107
108 -- | Current Time in DucklingTime format
109 -- TODO : get local Time in a more generic way
110 utcToDucklingTime :: UTCTime -> DucklingTime
111 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
112
113 -- | Local Context which depends on Lang and Time
114 localContext :: Lang -> DucklingTime -> Context
115 localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
116
117 -- | Date parser with Duckling
118 parseDateWithDuckling :: Lang -> Text -> IO [ResolvedToken]
119 parseDateWithDuckling lang input = do
120 contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
121 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
122 pure $ analyze input contxt $ HashSet.fromList [(This Time)]
123