]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Utils/JohnSnowNLP.hs
Doc/Comments
[gargantext.git] / src / Gargantext / Utils / JohnSnowNLP.hs
1 {-|
2 Module : Gargantext.Utils.JohnSnow
3 Description : John Snow NLP API connexion
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE TemplateHaskell #-}
13
14 module Gargantext.Utils.JohnSnowNLP where
15
16 import Control.Concurrent (threadDelay)
17 import Control.Lens
18 import Data.Aeson (encode, ToJSON, toJSON, FromJSON, parseJSON, Value(..), (.:), (.:?))
19 import Data.Aeson.Types (prependFailure, typeMismatch)
20 import Data.Aeson.TH (deriveJSON)
21 import qualified Data.List.Safe as LS
22 import Data.Map (Map)
23 import qualified Data.Map as Map
24 import Data.Maybe (fromMaybe)
25 import Data.Text hiding (map, group, filter, concat, zip)
26 import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
27
28 import Gargantext.Prelude
29 import Gargantext.Core (Lang(..))
30 import Gargantext.Core.Types (POS(..))
31 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
32 import Gargantext.Core.Utils.Prefix (unPrefix)
33
34
35 data JSSpell = JSPOS Lang | JSLemma Lang
36 deriving (Show)
37
38 instance ToJSON JSSpell where
39 toJSON (JSPOS EN) = "en.pos"
40 toJSON (JSPOS FR) = "fr.pos"
41 toJSON (JSPOS All) = "pos"
42 toJSON (JSLemma EN) = "en.lemma"
43 toJSON (JSLemma FR) = "fr.lemma"
44 toJSON (JSLemma All) = "lemma"
45
46 instance FromJSON JSSpell where
47 parseJSON (String "en.pos") = pure $ JSPOS EN
48 parseJSON (String "fr.pos") = pure $ JSPOS FR
49 parseJSON (String "pos") = pure $ JSPOS All
50 parseJSON (String "en.lemma") = pure $ JSLemma EN
51 parseJSON (String "fr.lemma") = pure $ JSLemma FR
52 parseJSON (String "lemma") = pure $ JSLemma All
53 parseJSON s =
54 prependFailure "parsing spell failed, "
55 (typeMismatch "Spell" s)
56
57 data JSRequest =
58 JSRequest { _jsRequest_data :: !Text
59 , _jsRequest_format :: !Text
60 , _jsRequest_grouping :: !(Maybe Text)
61 , _jsRequest_spell :: !JSSpell }
62 deriving (Show)
63
64 -- "spell" options:
65 -- https://nlu.johnsnowlabs.com/docs/en/spellbook
66
67 deriveJSON (unPrefix "_jsRequest_") ''JSRequest
68
69 -- | JohnSnow NLP works via asynchronous tasks: send a query and get a
70 -- task in response. One must poll for task status and then get it's
71 -- result.
72 data JSAsyncTask =
73 JSAsyncTask { _jsAsyncTask_uuid :: !Text }
74 deriving (Show)
75
76 deriveJSON (unPrefix "_jsAsyncTask_") ''JSAsyncTask
77
78 -- | Task status.
79 data JSAsyncTaskStatus =
80 JSAsyncTaskStatus { _jsAsyncTaskStatus_code :: !Text
81 , _jsAsyncTaskStatus_message :: !(Maybe Text) }
82 deriving (Show)
83
84 taskReady :: JSAsyncTaskStatus -> Bool
85 taskReady (JSAsyncTaskStatus { .. }) = _jsAsyncTaskStatus_code == "success"
86
87 --deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus
88 instance FromJSON JSAsyncTaskStatus where
89 parseJSON (Object v) = do
90 status <- v .: "status"
91 code <- status .: "code"
92 message <- status .:? "message"
93 pure $ JSAsyncTaskStatus { _jsAsyncTaskStatus_code = code
94 , _jsAsyncTaskStatus_message = message }
95 parseJSON s =
96 prependFailure "parsing status failed"
97 (typeMismatch "status" s)
98
99 -- | Response for our query. The `Maybe` types are here because we
100 -- combine 2 types of responses into one: `pos` and `lemma`.
101 data JSAsyncTaskResponse =
102 JSAsyncTaskResponse { _jsAsyncTaskResponse_index :: Map Text Int
103 , _jsAsyncTaskResponse_document :: Map Text Text
104 , _jsAsyncTaskResponse_sentence :: Map Text [Text]
105 , _jsAsyncTaskResponse_lem :: Maybe (Map Text [Text])
106 , _jsAsyncTaskResponse_pos :: Maybe (Map Text [POS])
107 , _jsAsyncTaskResponse_token :: Map Text [Text] }
108 deriving (Show)
109
110 deriveJSON (unPrefix "_jsAsyncTaskResponse_") ''JSAsyncTaskResponse
111 makeLenses ''JSAsyncTaskResponse
112
113 -- | We need to combine 2 responses: `pos` and `lemma` spells.
114 jsAsyncTaskResponseToSentences :: JSAsyncTaskResponse -> JSAsyncTaskResponse -> PosSentences
115 jsAsyncTaskResponseToSentences jsPos jsLemma =
116 PosSentences { _sentences }
117 where
118 _sentences = Map.elems $ Map.mapWithKey mapSentence (jsPos ^. jsAsyncTaskResponse_sentence)
119 mapSentence idx sentence = Sentence { _sentenceIndex = sIndex
120 , _sentenceTokens = sTokens }
121 where
122 sIndex = Map.findWithDefault (-1) idx (jsPos ^. jsAsyncTaskResponse_index)
123 lemmas = fromMaybe [] $
124 if Just sentence == Map.lookup idx (jsLemma ^. jsAsyncTaskResponse_sentence) then
125 Map.lookup idx $ fromMaybe Map.empty (jsLemma ^. jsAsyncTaskResponse_lem)
126 else
127 Nothing
128 sTokens = imap mapPosToken $ zip (Map.findWithDefault [] idx $ fromMaybe Map.empty (jsPos ^. jsAsyncTaskResponse_pos))
129 (Map.findWithDefault [] idx (jsPos ^. jsAsyncTaskResponse_token))
130 mapPosToken idx' (pos, token) = Token { _tokenIndex = -1
131 , _tokenWord = token
132 , _tokenOriginalText = ""
133 , _tokenLemma = fromMaybe "" $ (LS.!!) lemmas idx'
134 , _tokenCharacterOffsetBegin = -1
135 , _tokenCharacterOffsetEnd = -1
136 , _tokenPos = Just pos
137 , _tokenNer = Nothing
138 , _tokenBefore = Nothing
139 , _tokenAfter = Nothing }
140
141 -----------------------------------------------------
142
143 jsRequest :: Text -> JSSpell -> IO JSAsyncTask
144 jsRequest t s = do
145 url <- parseRequest $ "POST http://localhost:5000/api/results"
146 let jsReq = JSRequest { _jsRequest_data = t
147 , _jsRequest_format = "text"
148 , _jsRequest_grouping = Nothing
149 , _jsRequest_spell = s }
150 let request = setRequestBodyLBS (encode jsReq) url
151 task <- httpJSON request :: IO (Response JSAsyncTask)
152 pure $ getResponseBody task
153
154 jsTaskStatus :: JSAsyncTask -> IO JSAsyncTaskStatus
155 jsTaskStatus (JSAsyncTask uuid) = do
156 url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid <> "/status"
157 status <- httpJSON url
158 pure $ getResponseBody status
159
160 jsTaskResponse :: JSAsyncTask -> IO JSAsyncTaskResponse
161 jsTaskResponse (JSAsyncTask uuid) = do
162 url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid
163 result <- httpJSON url
164 pure $ getResponseBody result
165
166 waitForJsTask :: JSAsyncTask -> IO JSAsyncTaskResponse
167 waitForJsTask jsTask = wait' 0
168 where
169 wait' :: Int -> IO JSAsyncTaskResponse
170 wait' counter = do
171 status <- jsTaskStatus jsTask
172 if taskReady status then
173 jsTaskResponse jsTask
174 else
175 if counter > 60 then
176 panic "[waitForJsTask] waited for 1 minute and still no answer from JohnSnow NLP"
177 else do
178 printDebug "[waitForJsTask] task not ready, waiting" counter
179 _ <- threadDelay $ 1000000*1
180 wait' $ counter + 1
181
182 getPosTagAndLems :: Lang -> Text -> IO PosSentences
183 getPosTagAndLems l t = do
184 jsPosTask <- jsRequest t (JSPOS l)
185 jsLemmaTask <- jsRequest t (JSLemma l)
186
187 -- wait for both tasks
188 jsPos <- waitForJsTask jsPosTask
189 jsLemma <- waitForJsTask jsLemmaTask
190
191 pure $ jsAsyncTaskResponseToSentences jsPos jsLemma
192
193 nlp :: Lang -> Text -> IO PosSentences
194 nlp = getPosTagAndLems