]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Corpus/Parsers/Date.hs
Merge branch '70-dev-searx-parser' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell...
[gargantext.git] / src / Gargantext / Core / Text / Corpus / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Core.Text.Corpus.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Core.Text.Corpus.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17 {-# LANGUAGE TypeFamilies #-}
18
19 module Gargantext.Core.Text.Corpus.Parsers.Date {-(parse, parseRaw, dateSplit, Year, Month, Day)-} where
20
21 import Data.Aeson (toJSON, Value)
22 import Data.HashMap.Strict as HM hiding (map)
23 import Data.Text (Text, unpack, splitOn, pack)
24 import Data.Time (parseTimeOrError, defaultTimeLocale, toGregorian)
25 import Data.Time.Clock (UTCTime(..), getCurrentTime)
26 import Data.Time.LocalTime (utc)
27 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
28 import Duckling.Api (analyze)
29 import Duckling.Core (makeLocale, Dimension(Time))
30 import Duckling.Types (Seal(..))
31 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime), Options(..))
32 import Duckling.Types (ResolvedToken(..), ResolvedVal(..))
33 import Gargantext.Core (Lang(FR,EN))
34 import Gargantext.Prelude
35 import qualified Data.Aeson as Json
36 import qualified Data.HashSet as HashSet
37 import qualified Duckling.Core as DC
38
39 ------------------------------------------------------------------------
40 -- | Parse date to Ints
41 -- TODO add hours, minutes and seconds
42 dateSplit :: Lang -> Maybe Text -> IO (Maybe UTCTime, (Maybe Year, Maybe Month, Maybe Day))
43 dateSplit _ Nothing = pure (Nothing, (Nothing, Nothing, Nothing))
44 dateSplit l (Just txt) = do
45 utcTime <- parse l txt
46 let (y, m, d) = split' utcTime
47 pure (Just utcTime, (Just y, Just m,Just d))
48
49 split' :: UTCTime -> (Year, Month, Day)
50 split' utcTime = (fromIntegral y, m, d)
51 where
52 (UTCTime day _) = utcTime
53 (y,m,d) = toGregorian day
54
55 type Year = Int
56 type Month = Int
57 type Day = Int
58 ------------------------------------------------------------------------
59
60 -- | Date Parser
61 -- Parses dates mentions in full text given the language.
62 -- >>> parseDate FR (pack "10 avril 1979 à 19H")
63 -- 1979-04-10 19:00:00 UTC
64 -- >>> parseDate EN (pack "April 10 1979")
65 -- 1979-04-10 00:00:00 UTC
66 parse :: Lang -> Text -> IO UTCTime
67 parse lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
68
69 type DateFormat = Text
70 type DateDefault = Text
71
72 parseDate' :: DateFormat -> DateDefault -> Lang -> Text -> IO UTCTime
73 parseDate' format def lang s = do
74 dateStr' <- parseRaw lang s
75 let dateStr = unpack $ maybe def identity
76 $ head $ splitOn "." dateStr'
77 pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
78
79
80 -- TODO add Paris at Duckling.Locale Region datatype
81 -- | To get Homogeinity of the languages
82 -- TODO : put this in a more generic place in the source code
83 parserLang :: Lang -> DC.Lang
84 parserLang FR = DC.FR
85 parserLang EN = DC.EN
86 parserLang _ = panic "not implemented"
87
88 -- | Final Date parser API
89 -- IO can be avoided here:
90 -- currentContext :: Lang -> IO Context
91 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
92 -- parseRaw :: Context -> Text -> SomeErrorHandling Text
93
94 -- TODO error handling
95 parseRaw :: Lang -> Text -> IO Text
96 parseRaw lang text = do -- case result
97 maybeResult <- extractValue <$> getTimeValue <$> parseDateWithDuckling lang text (Options True)
98 case maybeResult of
99 Just result -> pure result
100 Nothing -> panic $ "[G.C.T.C.P.D.parseRaw] ERROR" <> (pack . show) lang <> " " <> text
101
102
103 getTimeValue :: [ResolvedToken] -> Value
104 getTimeValue rt = case head rt of
105 Nothing -> panic "error"
106 Just x -> case rval x of
107 RVal Time t -> toJSON t
108 _ -> panic "error2"
109
110 extractValue :: Value -> Maybe Text
111 extractValue (Json.Object object) =
112 case HM.lookup "value" object of
113 Just (Json.String date) -> Just date
114 _ -> Nothing
115 extractValue _ = Nothing
116
117 -- | Current Time in DucklingTime format
118 -- TODO : get local Time in a more generic way
119 utcToDucklingTime :: UTCTime -> DucklingTime
120 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
121
122 -- | Local Context which depends on Lang and Time
123 localContext :: Lang -> DucklingTime -> Context
124 localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
125
126 -- | Date parser with Duckling
127 parseDateWithDuckling :: Lang -> Text -> Options -> IO [ResolvedToken]
128 parseDateWithDuckling lang input options = do
129 contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
130 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
131 -- TODO check/test Options False or True
132 pure $ analyze input contxt options $ HashSet.fromList [(Seal Time)]
133
134
135