]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Utils/JohnSnowNLP.hs
[NLP] add support for arbitrary languages in INI file
[gargantext.git] / src / Gargantext / Utils / JohnSnowNLP.hs
1 {-|
2 Module : Gargantext.Utils.JohnSnowNLP
3 Description : John Snow NLP API connexion
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Utils.JohnSnowNLP where
15
16 import Control.Concurrent (threadDelay)
17 import Control.Lens
18 import Data.Aeson (encode, ToJSON, toJSON, FromJSON, parseJSON, Value(..), (.:), (.:?))
19 import Data.Aeson.Types (prependFailure, typeMismatch)
20 import Data.Aeson.TH (deriveJSON)
21 import qualified Data.List.Safe as LS
22 import Data.Map.Strict (Map)
23 import qualified Data.Map.Strict as Map
24 import Data.Maybe (fromMaybe)
25 import Data.Text hiding (map, group, filter, concat, zip)
26 import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
27
28 import Gargantext.Prelude
29 import Gargantext.Core (Lang(..))
30 import Gargantext.Core.Types (POS(..))
31 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
32 import Gargantext.Core.Utils.Prefix (unPrefix)
33
34
-- | A JohnSnow NLP "spell": which analysis to run, and for which language.
-- The JSON instances in this module encode it as a string such as
-- @"en.pos"@ or @"fr.lemma"@.
data JSSpell
  = JSPOS Lang    -- ^ the @pos@ spell for the given language
  | JSLemma Lang  -- ^ the @lemma@ spell for the given language
  deriving (Show)
37
-- | Encode a spell as the JSON string @"<lang>.pos"@ / @"<lang>.lemma"@.
-- The 'All' language maps to the bare spell name (@"pos"@ / @"lemma"@).
instance ToJSON JSSpell where
  toJSON spell = String $ case spell of
    -- part-of-speech spells
    JSPOS EN    -> "en.pos"
    JSPOS FR    -> "fr.pos"
    JSPOS DE    -> "de.pos"
    JSPOS ES    -> "es.pos"
    JSPOS IT    -> "it.pos"
    JSPOS PL    -> "pl.pos"
    JSPOS CN    -> "cn.pos"
    JSPOS All   -> "pos"
    -- lemma spells
    JSLemma EN  -> "en.lemma"
    JSLemma FR  -> "fr.lemma"
    JSLemma DE  -> "de.lemma"
    JSLemma ES  -> "es.lemma"
    JSLemma IT  -> "it.lemma"
    JSLemma PL  -> "pl.lemma"
    JSLemma CN  -> "cn.lemma"
    JSLemma All -> "lemma"
56
-- | Decode a spell from its JSON string form (the inverse of the 'ToJSON'
-- instance). Any other JSON value, including a string that is not one of
-- the names listed below, is rejected with a failure prefixed by
-- @"parsing spell failed, "@.
instance FromJSON JSSpell where
  parseJSON value = case value of
    String name
      | Just spell <- Map.lookup name knownSpells -> pure spell
    _ -> prependFailure "parsing spell failed, "
           (typeMismatch "Spell" value)
    where
      -- Every accepted spell name together with the value it decodes to.
      knownSpells = Map.fromList
        [ ("en.pos",   JSPOS EN)
        , ("fr.pos",   JSPOS FR)
        , ("de.pos",   JSPOS DE)
        , ("es.pos",   JSPOS ES)
        , ("it.pos",   JSPOS IT)
        , ("pl.pos",   JSPOS PL)
        , ("cn.pos",   JSPOS CN)
        , ("pos",      JSPOS All)
        , ("en.lemma", JSLemma EN)
        , ("fr.lemma", JSLemma FR)
        , ("de.lemma", JSLemma DE)
        , ("es.lemma", JSLemma ES)
        , ("it.lemma", JSLemma IT)
        , ("pl.lemma", JSLemma PL)
        , ("cn.lemma", JSLemma CN)
        , ("lemma",    JSLemma All)
        ]
77
-- | Body of the request sent to the JohnSnow NLP API (built by 'jsRequest').
data JSRequest = JSRequest
  { _jsRequest_data     :: !Text          -- ^ the text to analyse
  , _jsRequest_format   :: !Text          -- ^ 'jsRequest' always sends @"text"@
  , _jsRequest_grouping :: !(Maybe Text)  -- ^ 'jsRequest' always sends 'Nothing'
  , _jsRequest_spell    :: !JSSpell       -- ^ which spell to run
  }
  deriving (Show)

-- "spell" options:
-- https://nlu.johnsnowlabs.com/docs/en/spellbook

-- JSON instances via Template Haskell; 'unPrefix' presumably strips the
-- @_jsRequest_@ prefix from the field names to obtain the JSON keys
-- (TODO confirm against "Gargantext.Core.Utils.Prefix").
deriveJSON (unPrefix "_jsRequest_") ''JSRequest
89
-- | Handle on an asynchronous JohnSnow NLP task.
--
-- JohnSnow NLP works via asynchronous tasks: sending a query returns a task,
-- which must then be polled for its status before its result can be fetched.
data JSAsyncTask = JSAsyncTask
  { _jsAsyncTask_uuid :: !Text  -- ^ identifier used to build the status and result URLs
  }
  deriving (Show)

-- JSON instances via Template Haskell, with the @_jsAsyncTask_@ prefix
-- handed to 'unPrefix'.
deriveJSON (unPrefix "_jsAsyncTask_") ''JSAsyncTask
98
-- | Status of an asynchronous task, as reported by the status endpoint.
data JSAsyncTaskStatus = JSAsyncTaskStatus
  { _jsAsyncTaskStatus_code    :: !Text          -- ^ status code; @"success"@ means ready (see 'taskReady')
  , _jsAsyncTaskStatus_message :: !(Maybe Text)  -- ^ optional message accompanying the code
  }
  deriving (Show)
104
-- | 'True' exactly when the status code equals @"success"@.
taskReady :: JSAsyncTaskStatus -> Bool
taskReady status = _jsAsyncTaskStatus_code status == "success"
107
-- Hand-written instance: the fields are read from a nested @"status"@
-- object, which is presumably why the derived instance below is disabled.
--deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus

-- | Decode a status reply of the shape
-- @{ "status": { "code": ..., "message": ... } }@, where @"message"@ is
-- optional. Any non-object value is rejected with a failure prefixed by
-- @"parsing status failed, "@.
instance FromJSON JSAsyncTaskStatus where
  parseJSON (Object v) = do
    status  <- v .: "status"
    code    <- status .: "code"
    message <- status .:? "message"
    pure $ JSAsyncTaskStatus { _jsAsyncTaskStatus_code    = code
                             , _jsAsyncTaskStatus_message = message }
  parseJSON s =
    -- The trailing ", " keeps the prefix from running into the
    -- 'typeMismatch' text, matching the 'JSSpell' instance's message style.
    prependFailure "parsing status failed, "
      (typeMismatch "status" s)
119
-- | Result of a finished task. One type serves both the @pos@ and the
-- @lemma@ spell, which is why the spell-specific fields are 'Maybe'.
-- Every field is a map keyed by 'Text'; 'jsAsyncTaskResponseToSentences'
-- looks the same key up across the maps to assemble one sentence.
data JSAsyncTaskResponse = JSAsyncTaskResponse
  { _jsAsyncTaskResponse_index    :: Map Text Int             -- ^ sentence index per key
  , _jsAsyncTaskResponse_document :: Map Text Text
  , _jsAsyncTaskResponse_sentence :: Map Text [Text]          -- ^ sentence per key
  , _jsAsyncTaskResponse_lem      :: Maybe (Map Text [Text])  -- ^ lemmas per key (@lemma@ spell)
  , _jsAsyncTaskResponse_pos      :: Maybe (Map Text [POS])   -- ^ POS tags per key (@pos@ spell)
  , _jsAsyncTaskResponse_token    :: Map Text [Text]          -- ^ tokens per key
  }
  deriving (Show)

-- JSON instances and lenses, both generated with Template Haskell.
deriveJSON (unPrefix "_jsAsyncTaskResponse_") ''JSAsyncTaskResponse
makeLenses ''JSAsyncTaskResponse
133
-- | Merge the responses of the @pos@ and @lemma@ spells into 'PosSentences'.
--
-- Sentences, their indices, POS tags and tokens all come from the first
-- (@pos@) response. Lemmas come from the second (@lemma@) response, and only
-- when it stores the very same sentence under the same key; otherwise every
-- token of that sentence gets an empty lemma. A token whose position has no
-- matching lemma also gets an empty lemma.
jsAsyncTaskResponseToSentences :: JSAsyncTaskResponse -> JSAsyncTaskResponse -> PosSentences
jsAsyncTaskResponseToSentences jsPos jsLemma =
  PosSentences { _sentences = Map.elems (Map.mapWithKey toSentence posSentences) }
  where
    -- Lookups that do not depend on the sentence key are done once.
    posSentences   = jsPos ^. jsAsyncTaskResponse_sentence
    posIndices     = jsPos ^. jsAsyncTaskResponse_index
    posTokens      = jsPos ^. jsAsyncTaskResponse_token
    posTags        = fromMaybe Map.empty (jsPos ^. jsAsyncTaskResponse_pos)
    lemmaSentences = jsLemma ^. jsAsyncTaskResponse_sentence
    lemmaLists     = fromMaybe Map.empty (jsLemma ^. jsAsyncTaskResponse_lem)

    toSentence key sentence =
      Sentence { _sentenceIndex  = Map.findWithDefault (-1) key posIndices
               , _sentenceTokens = imap toToken (zip tags tokens) }
      where
        tags   = Map.findWithDefault [] key posTags
        tokens = Map.findWithDefault [] key posTokens
        -- Only trust the lemmas if both responses agree on the sentence.
        lemmas =
          if Map.lookup key lemmaSentences == Just sentence
            then Map.findWithDefault [] key lemmaLists
            else []
        -- Fields the JohnSnow responses do not provide are filled with
        -- placeholders (-1, "" or Nothing).
        toToken position (pos, word) =
          Token { _tokenIndex                = -1
                , _tokenWord                 = word
                , _tokenOriginalText         = ""
                , _tokenLemma                = fromMaybe "" (lemmas LS.!! position)
                , _tokenCharacterOffsetBegin = -1
                , _tokenCharacterOffsetEnd   = -1
                , _tokenPos                  = Just pos
                , _tokenNer                  = Nothing
                , _tokenBefore               = Nothing
                , _tokenAfter                = Nothing }
161
162 -----------------------------------------------------
163
-- | Submit a text to the JohnSnow NLP service for the given spell.
--
-- POSTs a JSON-encoded 'JSRequest' to @http://localhost:5000/api/results@
-- and returns the task handle decoded from the response body.
jsRequest :: Text -> JSSpell -> IO JSAsyncTask
jsRequest t s = do
  endpoint <- parseRequest "POST http://localhost:5000/api/results"
  let payload = JSRequest { _jsRequest_data     = t
                          , _jsRequest_format   = "text"
                          , _jsRequest_grouping = Nothing
                          , _jsRequest_spell    = s }
  response <- httpJSON (setRequestBodyLBS (encode payload) endpoint) :: IO (Response JSAsyncTask)
  pure (getResponseBody response)
174
-- | Fetch the current status of a task from
-- @http://localhost:5000/api/results/<uuid>/status@.
jsTaskStatus :: JSAsyncTask -> IO JSAsyncTaskStatus
jsTaskStatus (JSAsyncTask taskId) = do
  let statusUrl = "GET http://localhost:5000/api/results/" <> taskId <> "/status"
  request  <- parseRequest (unpack statusUrl)
  response <- httpJSON request
  pure (getResponseBody response)
180
-- | Fetch the result of a task from
-- @http://localhost:5000/api/results/<uuid>@. This function does not check
-- that the task has finished; 'waitForJsTask' does that before calling it.
jsTaskResponse :: JSAsyncTask -> IO JSAsyncTaskResponse
jsTaskResponse (JSAsyncTask taskId) = do
  let resultUrl = "GET http://localhost:5000/api/results/" <> taskId
  request  <- parseRequest (unpack resultUrl)
  response <- httpJSON request
  pure (getResponseBody response)
186
-- | Poll a task once per second until it is ready, then fetch its result.
--
-- Gives up with 'panic' when the task is still not ready after the retry
-- counter has exceeded 60.
waitForJsTask :: JSAsyncTask -> IO JSAsyncTaskResponse
waitForJsTask jsTask = poll 0
  where
    poll :: Int -> IO JSAsyncTaskResponse
    poll attempt = do
      status <- jsTaskStatus jsTask
      case (taskReady status, attempt > 60) of
        (True, _) ->
          jsTaskResponse jsTask
        (False, True) ->
          panic "[waitForJsTask] waited for 1 minute and still no answer from JohnSnow NLP"
        (False, False) -> do
          -- printDebug "[waitForJsTask] task not ready, waiting" attempt
          threadDelay (1000000 * 1)  -- microseconds, i.e. one second
          poll (attempt + 1)
202
-- | Run both the @pos@ and the @lemma@ spell on a text and combine the two
-- results into 'PosSentences'.
getPosTagAndLems :: Lang -> Text -> IO PosSentences
getPosTagAndLems l t = do
  -- Submit both tasks before waiting on either of them.
  posTask   <- jsRequest t (JSPOS l)
  lemmaTask <- jsRequest t (JSLemma l)

  posResult   <- waitForJsTask posTask
  lemmaResult <- waitForJsTask lemmaTask

  pure (jsAsyncTaskResponseToSentences posResult lemmaResult)
213
-- | Entry point of this module: an alias for 'getPosTagAndLems'.
nlp :: Lang -> Text -> IO PosSentences
nlp lang txt = getPosTagAndLems lang txt