]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Parsers/Date.hs
[FEAT] Parsers RIS && Presse.
[gargantext.git] / src / Gargantext / Text / Parsers / Date.hs
1 {-|
2 Module : Gargantext.Text.Parsers.Date
3 Description : Some utils to parse dates
4 Copyright : (c) CNRS 2017-present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 According to the language of the text, parseDateRaw returns date as Text:
11
12 TODO : Add some tests
13 import Gargantext.Text.Parsers.Date as DGP
14 DGP.parseDateRaw DGP.FR "12 avril 2010" == "2010-04-12T00:00:00.000+00:00"
15 -}
16
17 {-# LANGUAGE FlexibleContexts #-}
18 {-# LANGUAGE NoImplicitPrelude #-}
19 {-# LANGUAGE OverloadedStrings #-}
20
21 module Gargantext.Text.Parsers.Date (parseDate, parseDateRaw, parseGregorian, wrapDST) where
22
23 import Data.HashMap.Strict as HM hiding (map)
24 import Data.Text (Text, unpack, splitOn, pack)
25 import Data.Time (parseTimeOrError, defaultTimeLocale)
26 import Data.Time.Clock (UTCTime, getCurrentTime)
27 import Data.Time.LocalTime (utc)
28 import Data.Time.LocalTime.TimeZone.Series (zonedTimeToZoneSeriesTime)
29 import Duckling.Api (analyze)
30 import Duckling.Core (makeLocale, Some(This), Dimension(Time))
31 import Duckling.Resolve (fromUTC, Context(Context, referenceTime, locale), DucklingTime(DucklingTime))
32 import Duckling.Types (ResolvedToken)
33 import Duckling.Types (jsonValue)
34 import Gargantext.Core (Lang(FR,EN))
35 import Gargantext.Prelude
36 import qualified Data.Aeson as Json
37 import qualified Data.HashSet as HashSet
38 import qualified Duckling.Core as DC
39
40 -- | Unused import (to parse Date Format, keeping it for maybe next steps)
41 import Control.Monad ((=<<))
42 import Data.Either (Either)
43 import Data.Fixed (Fixed (MkFixed))
44 import Data.Foldable (length)
45 import Data.String (String)
46 import Data.Time (ZonedTime(..), LocalTime(..), TimeZone(..), TimeOfDay(..))
47 import Data.Time.Calendar (Day, fromGregorian)
48 import Duckling.Debug as DB
49 import Duckling.Engine (parseAndResolve)
50 import Duckling.Rules (rulesFor)
51 import Prelude (toInteger, div, otherwise, (++))
52 import Text.Parsec.Error (ParseError)
53 import Text.Parsec.Prim (Stream, ParsecT)
54 import Text.Parsec.String (Parser)
55 import Text.ParserCombinators.Parsec (many1, noneOf, anyChar, char, oneOf)
56 import Text.XML.HXT.DOM.Util (decimalStringToInt)
57 import qualified Text.ParserCombinators.Parsec (parse)
58
59 ------------------------------------------------------------------------
60 -- | Date Parser
61 -- Parses dates mentions in full text given the language.
62 -- >>> parseDate FR (pack "10 avril 1979 à 19H")
63 -- 1979-04-10 19:00:00 UTC
64 -- >>> parseDate EN (pack "April 10 1979")
65 -- 1979-04-10 00:00:00 UTC
66 parseDate :: Lang -> Text -> IO UTCTime
67 parseDate lang s = parseDate' "%Y-%m-%dT%T" "0-0-0T0:0:0" lang s
68
69 type DateFormat = Text
70 type DateNull = Text
71
72 parseDate' :: DateFormat -> DateNull -> Lang -> Text -> IO UTCTime
73 parseDate' format def lang s = do
74 dateStr' <- parseDateRaw lang s
75 let dateStr = unpack $ maybe def identity
76 $ head $ splitOn "." dateStr'
77 pure $ parseTimeOrError True defaultTimeLocale (unpack format) dateStr
78
79
80 -- TODO add Paris at Duckling.Locale Region datatype
81 -- | To get Homogeinity of the languages
82 -- TODO : put this in a more generic place in the source code
83 parserLang :: Lang -> DC.Lang
84 parserLang FR = DC.FR
85 parserLang EN = DC.EN
86 -- parserLang _ = panic "not implemented"
87
88 -- | Final Date parser API
89 -- IO can be avoided here:
90 -- currentContext :: Lang -> IO Context
91 -- currentContext lang = localContext lang <$> utcToDucklingTime <$> getCurrentTime
92 -- parseDateRaw :: Context -> Text -> SomeErrorHandling Text
93
94 -- TODO error handling
95 parseDateRaw :: Lang -> Text -> IO (Text)
96 parseDateRaw lang text = do
97 maybeJson <- map jsonValue <$> parseDateWithDuckling lang text
98 case headMay maybeJson of
99 Just (Json.Object object) -> case HM.lookup "value" object of
100 Just (Json.String date) -> pure date
101 Just _ -> panic "ParseDateRaw ERROR: should be a json String"
102 Nothing -> panic $ "ParseDateRaw ERROR: no date found" <> (pack . show) lang <> " " <> text
103
104 _ -> panic $ "ParseDateRaw ERROR: type error" <> (pack . show) lang <> " " <> text
105
106
107 -- | Current Time in DucklingTime format
108 -- TODO : get local Time in a more generic way
109 utcToDucklingTime :: UTCTime -> DucklingTime
110 utcToDucklingTime time = DucklingTime . zonedTimeToZoneSeriesTime $ fromUTC time utc
111
112 -- | Local Context which depends on Lang and Time
113 localContext :: Lang -> DucklingTime -> Context
114 localContext lang dt = Context {referenceTime = dt, locale = makeLocale (parserLang lang) Nothing}
115
116 -- | Date parser with Duckling
117 parseDateWithDuckling :: Lang -> Text -> IO [ResolvedToken]
118 parseDateWithDuckling lang input = do
119 contxt <- localContext lang <$> utcToDucklingTime <$> getCurrentTime
120 --pure $ parseAndResolve (rulesFor (locale ctx) (HashSet.fromList [(This Time)])) input ctx
121 pure $ analyze input contxt $ HashSet.fromList [(This Time)]
122
123 -- | Permit to transform a String to an Int in a monadic context
124 wrapDST :: Monad m => String -> m Int
125 wrapDST = return . decimalStringToInt
126
127 -- | Generic parser which take at least one element not given in argument
128 many1NoneOf :: Stream s m Char => [Char] -> ParsecT s u m [Char]
129 many1NoneOf = (many1 . noneOf)
130
131 --getMultiplicator :: Int -> Int
132 getMultiplicator a
133 | 0 >= a = 1
134 | otherwise = 10 * (getMultiplicator $ div a 10)
135
136 -- | Parser for date format y-m-d
137 parseGregorian :: Parser Day
138 parseGregorian = do
139 y <- wrapDST =<< many1NoneOf ['-']
140 _ <- char '-'
141 m <- wrapDST =<< many1NoneOf ['-']
142 _ <- char '-'
143 d <- wrapDST =<< many1NoneOf ['T']
144 _ <- char 'T'
145 return $ fromGregorian (toInteger y) m d
146
147 ---- | Parser for time format h:m:s
148 parseTimeOfDay :: Parser TimeOfDay
149 parseTimeOfDay = do
150 h <- wrapDST =<< many1NoneOf [':']
151 _ <- char ':'
152 m <- wrapDST =<< many1NoneOf [':']
153 _ <- char ':'
154 r <- many1NoneOf ['.']
155 _ <- char '.'
156 dec <- many1NoneOf ['+', '-']
157 let (nb, l) = (decimalStringToInt $ r ++ dec, length dec)
158 seconds = nb * 10^(12-l)
159 return $ TimeOfDay h m (MkFixed . toInteger $ seconds)
160
161
162 -- | Parser for timezone format +hh:mm
163 parseTimeZone :: Parser TimeZone
164 parseTimeZone = do
165 sign <- oneOf ['+', '-']
166 h <- wrapDST =<< many1NoneOf [':']
167 _ <- char ':'
168 m <- wrapDST =<< (many1 $ anyChar)
169 let timeInMinute = if sign == '+' then h * 60 + m else -h * 60 - m
170 in return $ TimeZone timeInMinute False "CET"
171
172 ---- | Parser which use parseGregorian, parseTimeOfDay and parseTimeZone to create a ZonedTime
173 parseZonedTime :: Parser ZonedTime
174 parseZonedTime= do
175 d <- parseGregorian
176 tod <- parseTimeOfDay
177 tz <- parseTimeZone
178 return $ ZonedTime (LocalTime d (tod)) tz
179
180 ---- | Opposite of toRFC3339
181 fromRFC3339 :: Text -> Either ParseError ZonedTime
182 fromRFC3339 t = Text.ParserCombinators.Parsec.parse parseZonedTime "ERROR: Couldn't parse zoned time." input
183 where input = unpack t