]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Utils/JohnSnowNLP.hs
Add Arxiv tests
[gargantext.git] / src / Gargantext / Utils / JohnSnowNLP.hs
1 {-|
2 Module : Gargantext.Utils.JohnSnow
3 Description : John Snow NLP API connexion
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Utils.JohnSnowNLP where
15
16 import Control.Concurrent (threadDelay)
17 import Control.Lens
18 import Data.Aeson (encode, ToJSON, toJSON, FromJSON, parseJSON, Value(..), (.:), (.:?))
19 import Data.Aeson.TH (deriveJSON)
20 import Data.Aeson.Types (prependFailure, typeMismatch)
21 import Data.Map.Strict (Map)
22 import Data.Maybe (fromMaybe)
23 import Data.Text hiding (map, group, filter, concat, zip)
24 import Gargantext.Core (Lang(..))
25 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
26 import Gargantext.Core.Types (POS(..))
27 import Gargantext.Core.Utils.Prefix (unPrefix)
28 import Gargantext.Prelude
29 import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
30 import qualified Data.List.Safe as LS
31 import qualified Data.Map.Strict as Map
32
33
34 data JSSpell = JSPOS Lang | JSLemma Lang
35 deriving (Show)
36
37 instance ToJSON JSSpell where
38 toJSON (JSPOS All) = "pos"
39 toJSON (JSPOS DE) = "de.pos"
40 toJSON (JSPOS EL) = "el.pos"
41 toJSON (JSPOS EN) = "en.pos"
42 toJSON (JSPOS ES) = "es.pos"
43 toJSON (JSPOS FR) = "fr.pos"
44 toJSON (JSPOS IT) = "it.pos"
45 toJSON (JSPOS PL) = "pl.pos"
46 toJSON (JSPOS PT) = "pt.pos"
47 toJSON (JSPOS RU) = "ru.pos"
48 toJSON (JSPOS UK) = "uk.pos"
49 toJSON (JSPOS ZH) = "zh.pos"
50
51 toJSON (JSLemma All) = "lemma"
52 toJSON (JSLemma DE) = "de.lemma"
53 toJSON (JSLemma EL) = "el.lemma"
54 toJSON (JSLemma EN) = "en.lemma"
55 toJSON (JSLemma ES) = "es.lemma"
56 toJSON (JSLemma FR) = "fr.lemma"
57 toJSON (JSLemma IT) = "it.lemma"
58 toJSON (JSLemma PL) = "pl.lemma"
59 toJSON (JSLemma PT) = "pt.lemma"
60 toJSON (JSLemma RU) = "ru.lemma"
61 toJSON (JSLemma UK) = "uk.lemma"
62 toJSON (JSLemma ZH) = "zh.lemma"
63
64 instance FromJSON JSSpell where
65 parseJSON (String "de.pos") = pure $ JSPOS DE
66 parseJSON (String "en.pos") = pure $ JSPOS EN
67 parseJSON (String "el.pos") = pure $ JSPOS EL
68 parseJSON (String "es.pos") = pure $ JSPOS ES
69 parseJSON (String "fr.pos") = pure $ JSPOS FR
70 parseJSON (String "it.pos") = pure $ JSPOS IT
71 parseJSON (String "pl.pos") = pure $ JSPOS PL
72 parseJSON (String "pt.pos") = pure $ JSPOS PT
73 parseJSON (String "ru.pos") = pure $ JSPOS RU
74 parseJSON (String "uk.pos") = pure $ JSPOS UK
75 parseJSON (String "zh.pos") = pure $ JSPOS ZH
76 parseJSON (String "pos") = pure $ JSPOS All
77
78 parseJSON (String "de.lemma") = pure $ JSLemma DE
79 parseJSON (String "en.lemma") = pure $ JSLemma EN
80 parseJSON (String "el.lemma") = pure $ JSLemma EL
81 parseJSON (String "es.lemma") = pure $ JSLemma ES
82 parseJSON (String "fr.lemma") = pure $ JSLemma FR
83 parseJSON (String "it.lemma") = pure $ JSLemma IT
84 parseJSON (String "pl.lemma") = pure $ JSLemma PL
85 parseJSON (String "pt.lemma") = pure $ JSLemma PT
86 parseJSON (String "ru.lemma") = pure $ JSLemma RU
87 parseJSON (String "uk.lemma") = pure $ JSLemma UK
88 parseJSON (String "zh.lemma") = pure $ JSLemma ZH
89 parseJSON (String "lemma") = pure $ JSLemma All
90 parseJSON s =
91 prependFailure "parsing spell failed, "
92 (typeMismatch "Spell" s)
93
94 data JSRequest =
95 JSRequest { _jsRequest_data :: !Text
96 , _jsRequest_format :: !Text
97 , _jsRequest_grouping :: !(Maybe Text)
98 , _jsRequest_spell :: !JSSpell }
99 deriving (Show)
100
101 -- "spell" options:
102 -- https://nlu.johnsnowlabs.com/docs/en/spellbook
103
104 deriveJSON (unPrefix "_jsRequest_") ''JSRequest
105
106 -- | JohnSnow NLP works via asynchronous tasks: send a query and get a
107 -- task in response. One must poll for task status and then get it's
108 -- result.
109 data JSAsyncTask =
110 JSAsyncTask { _jsAsyncTask_uuid :: !Text }
111 deriving (Show)
112
113 deriveJSON (unPrefix "_jsAsyncTask_") ''JSAsyncTask
114
115 -- | Task status.
116 data JSAsyncTaskStatus =
117 JSAsyncTaskStatus { _jsAsyncTaskStatus_code :: !Text
118 , _jsAsyncTaskStatus_message :: !(Maybe Text) }
119 deriving (Show)
120
121 taskReady :: JSAsyncTaskStatus -> Bool
122 taskReady (JSAsyncTaskStatus { .. }) = _jsAsyncTaskStatus_code == "success"
123
124 --deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus
125 instance FromJSON JSAsyncTaskStatus where
126 parseJSON (Object v) = do
127 status <- v .: "status"
128 code <- status .: "code"
129 message <- status .:? "message"
130 pure $ JSAsyncTaskStatus { _jsAsyncTaskStatus_code = code
131 , _jsAsyncTaskStatus_message = message }
132 parseJSON s =
133 prependFailure "parsing status failed"
134 (typeMismatch "status" s)
135
136 -- | Response for our query. The `Maybe` types are here because we
137 -- combine 2 types of responses into one: `pos` and `lemma`.
138 data JSAsyncTaskResponse =
139 JSAsyncTaskResponse { _jsAsyncTaskResponse_index :: Map Text Int
140 , _jsAsyncTaskResponse_document :: Map Text Text
141 , _jsAsyncTaskResponse_sentence :: Map Text [Text]
142 , _jsAsyncTaskResponse_lem :: Maybe (Map Text [Text])
143 , _jsAsyncTaskResponse_pos :: Maybe (Map Text [POS])
144 , _jsAsyncTaskResponse_token :: Map Text [Text] }
145 deriving (Show)
146
147 deriveJSON (unPrefix "_jsAsyncTaskResponse_") ''JSAsyncTaskResponse
148 makeLenses ''JSAsyncTaskResponse
149
150 -- | We need to combine 2 responses: `pos` and `lemma` spells.
151 jsAsyncTaskResponseToSentences :: JSAsyncTaskResponse -> JSAsyncTaskResponse -> PosSentences
152 jsAsyncTaskResponseToSentences jsPos jsLemma =
153 PosSentences { _sentences }
154 where
155 _sentences = Map.elems $ Map.mapWithKey mapSentence (jsPos ^. jsAsyncTaskResponse_sentence)
156 mapSentence idx sentence = Sentence { _sentenceIndex = sIndex
157 , _sentenceTokens = sTokens }
158 where
159 sIndex = Map.findWithDefault (-1) idx (jsPos ^. jsAsyncTaskResponse_index)
160 lemmas = fromMaybe [] $
161 if Just sentence == Map.lookup idx (jsLemma ^. jsAsyncTaskResponse_sentence) then
162 Map.lookup idx $ fromMaybe Map.empty (jsLemma ^. jsAsyncTaskResponse_lem)
163 else
164 Nothing
165 sTokens = imap mapPosToken $ zip (Map.findWithDefault [] idx $ fromMaybe Map.empty (jsPos ^. jsAsyncTaskResponse_pos))
166 (Map.findWithDefault [] idx (jsPos ^. jsAsyncTaskResponse_token))
167 mapPosToken idx' (pos, token) = Token { _tokenIndex = -1
168 , _tokenWord = token
169 , _tokenOriginalText = ""
170 , _tokenLemma = fromMaybe "" $ (LS.!!) lemmas idx'
171 , _tokenCharacterOffsetBegin = -1
172 , _tokenCharacterOffsetEnd = -1
173 , _tokenPos = Just pos
174 , _tokenNer = Nothing
175 , _tokenBefore = Nothing
176 , _tokenAfter = Nothing }
177
178 -----------------------------------------------------
179
180 jsRequest :: Text -> JSSpell -> IO JSAsyncTask
181 jsRequest t s = do
182 url <- parseRequest $ "POST http://localhost:5000/api/results"
183 let jsReq = JSRequest { _jsRequest_data = t
184 , _jsRequest_format = "text"
185 , _jsRequest_grouping = Nothing
186 , _jsRequest_spell = s }
187 let request = setRequestBodyLBS (encode jsReq) url
188 task <- httpJSON request :: IO (Response JSAsyncTask)
189 pure $ getResponseBody task
190
191 jsTaskStatus :: JSAsyncTask -> IO JSAsyncTaskStatus
192 jsTaskStatus (JSAsyncTask uuid) = do
193 url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid <> "/status"
194 status <- httpJSON url
195 pure $ getResponseBody status
196
197 jsTaskResponse :: JSAsyncTask -> IO JSAsyncTaskResponse
198 jsTaskResponse (JSAsyncTask uuid) = do
199 url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid
200 result <- httpJSON url
201 pure $ getResponseBody result
202
203 waitForJsTask :: JSAsyncTask -> IO JSAsyncTaskResponse
204 waitForJsTask jsTask = wait' 0
205 where
206 wait' :: Int -> IO JSAsyncTaskResponse
207 wait' counter = do
208 status <- jsTaskStatus jsTask
209 if taskReady status then
210 jsTaskResponse jsTask
211 else
212 if counter > 60 then
213 panic "[waitForJsTask] waited for 1 minute and still no answer from JohnSnow NLP"
214 else do
215 -- printDebug "[waitForJsTask] task not ready, waiting" counter
216 _ <- threadDelay $ 1000000*1
217 wait' $ counter + 1
218
219 getPosTagAndLems :: Lang -> Text -> IO PosSentences
220 getPosTagAndLems l t = do
221 jsPosTask <- jsRequest t (JSPOS l)
222 jsLemmaTask <- jsRequest t (JSLemma l)
223
224 -- wait for both tasks
225 jsPos <- waitForJsTask jsPosTask
226 jsLemma <- waitForJsTask jsLemmaTask
227
228 pure $ jsAsyncTaskResponseToSentences jsPos jsLemma
229
230 nlp :: Lang -> Text -> IO PosSentences
231 nlp = getPosTagAndLems