]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
[TEXTFLOW] enriched ngrams connected
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Core.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17 {-# LANGUAGE TypeFamilies #-}
18
19 module Gargantext.Core.Text.Corpus.Parsers.Date {-(parse, parseRaw, dateSplit, Year, Month, Day)-} where
20
21 import Data.Aeson (toJSON, Value)
22 import Data.HashMap.Strict as HM hiding (map)
23 import Data.Text (Text, unpack, splitOn, pack)
24 import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
25 import Data.Time.Clock (UTCTime(..), getCurrentTime)
26 import Data.Time.LocalTime (utc)
27 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
28 import Duckling.Api (analyze)
29 import Duckling.Core (makeLocale, Some(This), Dimension(Time))
30 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime), Options(..))
31 import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
32 import Gargantext.Core (Lang(FR,EN))
33 import Gargantext.Prelude
34 import qualified Data.Aeson as Json
35 import qualified Data.HashSet as HashSet
36 import qualified Duckling.Core as DC
37
38 ------------------------------------------------------------------------
39 -- | Parse date to Ints
40 -- TODO add hours, minutes and seconds
41 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
42 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
43 dateSplit l (Just txt) = do
44 utcTime <- parse l txt
45 let (y, m, d) = split' utcTime
46 pure (Just utcTime, (Just y, Just m,Just d))
47
48 split' :: UTCTime -> (Year, Month, Day)
49 split' utcTime = (fromIntegral y, m, d)
50 where
51 (UTCTime day _) = utcTime
52 (y,m,d) = toGregorian day
53
54 type Year = Int
55 type Month = Int
56 type Day = Int
57 ------------------------------------------------------------------------
58
59 -- | Date Parser
60 -- Parses dates mentions in full text given the language.
61 -- >>> parseDate FR (pack "10 avril 1979 à 19H")
62 -- 1979-04-10 19:00:00 UTC
63 -- >>> parseDate EN (pack "April 10 1979")
64 -- 1979-04-10 00:00:00 UTC
65 parse :: Lang -> Text -> IO UTCTime
66 parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
67
68 type DateFormat = Text
69 type DateDefault = Text
70
71 parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
72 parseDate' format def lang s = do
73 dateStr' <- parseRaw lang s
74 let dateStr = unpack $ maybe def identity
75 $ head $ splitOn "." dateStr'
76 pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
77
78
79 -- TODO add Paris at Duckling.Locale Region datatype
80 -- | To get Homogeinity of the languages
81 -- TODO : put this in a more generic place in the source code
82 parserLang :: Lang -> DC.Lang
83 parserLang FR = DC.FR
84 parserLang EN = DC.EN
85 parserLang _ = panic "not implemented"
86
87 -- | Final Date parser API
88 -- IO can be avoided here:
89 -- currentContext :: Lang -> IO Context
90 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
91 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
92
93 -- TODO error handling
94 parseRaw :: Lang -> Text -> IO Text
95 parseRaw lang text = do -- case result
96 maybeResult <- extractValue <$> getTimeValue <$> parseDateWithDuckling lang text (Options True)
97 case maybeResult of
98 Just result -> pure result
99 Nothing -> panic $ "[G.C.T.C.P.D.parseRaw] ERROR" <> (pack . show) lang <> " " <> text
100
101
102 getTimeValue :: [ResolvedToken] -> Value
103 getTimeValue rt = case head rt of
104 Nothing -> panic "error"
105 Just x -> case rval x of
106 RVal Time t -> toJSON t
107 _ -> panic "error2"
108
109 extractValue :: Value -> Maybe Text
110 extractValue (Json.Object object) =
111 case HM.lookup "value" object of
112 Just (Json.String date) -> Just date
113 _ -> Nothing
114 extractValue _ = Nothing
115
116 -- | Current Time in DucklingTime format
117 -- TODO : get local Time in a more generic way
118 utcToDucklingTime :: UTCTime -> DucklingTime
119 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
120
121 -- | Local Context which depends on Lang and Time
122 localContext :: Lang -> DucklingTime -> Context
123 localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
124
125 -- | Date parser with Duckling
126 parseDateWithDuckling :: Lang -> Text -> Options -> IO [ResolvedToken]
127 parseDateWithDuckling lang input options = do
128 contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
129 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
130 -- TODO check/test Options False or True
131 pure $ analyze input contxt options $ HashSet.fromList [(This Time)]
132
133
134