src/Gargantext/Utils/JohnSnowNLP.hs

   1 {-|
   2 Module      : Gargantext.Utils.JohnSnow
   3 Description : John Snow NLP API connexion
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12 {-# LANGUAGE TemplateHaskell   #-}
  13
  14 module Gargantext.Utils.JohnSnowNLP where
  15
  16 import Control.Concurrent (threadDelay)
  17 import Control.Lens
  18 import Data.Aeson (encode, ToJSON, toJSON, FromJSON, parseJSON, Value(..), (.:), (.:?))
  19 import Data.Aeson.Types (prependFailure, typeMismatch)
  20 import Data.Aeson.TH (deriveJSON)
  21 import qualified Data.List.Safe as LS
  22 import Data.Map.Strict (Map)
  23 import qualified Data.Map.Strict as Map
  24 import Data.Maybe (fromMaybe)
  25 import Data.Text hiding (map, group, filter, concat, zip)
  26 import Network.HTTP.Simple (parseRequest, httpJSON, setRequestBodyLBS, getResponseBody, Response)
  27
  28 import Gargantext.Prelude
  29 import Gargantext.Core (Lang(..))
  30 import Gargantext.Core.Types (POS(..))
  31 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
  32 import Gargantext.Core.Utils.Prefix (unPrefix)
  33
  34
  35 data JSSpell = JSPOS Lang | JSLemma Lang
  36   deriving (Show)
  37
  38 instance ToJSON JSSpell where
  39   toJSON (JSPOS EN)    = "en.pos"
  40   toJSON (JSPOS FR)    = "fr.pos"
  41   toJSON (JSPOS DE)    = "de.pos"
  42   toJSON (JSPOS ES)    = "es.pos"
  43   toJSON (JSPOS IT)    = "it.pos"
  44   toJSON (JSPOS PL)    = "pl.pos"
  45   toJSON (JSPOS CN)    = "cn.pos"
  46   toJSON (JSPOS All)   = "pos"
  47
  48   toJSON (JSLemma EN)  = "en.lemma"
  49   toJSON (JSLemma FR)  = "fr.lemma"
  50   toJSON (JSLemma DE)  = "de.lemma"
  51   toJSON (JSLemma ES)  = "es.lemma"
  52   toJSON (JSLemma IT)  = "it.lemma"
  53   toJSON (JSLemma PL)  = "pl.lemma"
  54   toJSON (JSLemma CN)  = "cn.lemma"
  55   toJSON (JSLemma All) = "lemma"
  56
  57 instance FromJSON JSSpell where
  58   parseJSON (String "en.pos")   = pure $ JSPOS EN
  59   parseJSON (String "fr.pos")   = pure $ JSPOS FR
  60   parseJSON (String "de.pos")   = pure $ JSPOS DE
  61   parseJSON (String "es.pos")   = pure $ JSPOS ES
  62   parseJSON (String "it.pos")   = pure $ JSPOS IT
  63   parseJSON (String "pl.pos")   = pure $ JSPOS PL
  64   parseJSON (String "cn.pos")   = pure $ JSPOS CN
  65   parseJSON (String "pos")      = pure $ JSPOS All
  66   parseJSON (String "en.lemma") = pure $ JSLemma EN
  67   parseJSON (String "fr.lemma") = pure $ JSLemma FR
  68   parseJSON (String "de.lemma") = pure $ JSLemma DE
  69   parseJSON (String "es.lemma") = pure $ JSLemma ES
  70   parseJSON (String "it.lemma") = pure $ JSLemma IT
  71   parseJSON (String "pl.lemma") = pure $ JSLemma PL
  72   parseJSON (String "cn.lemma") = pure $ JSLemma CN
  73   parseJSON (String "lemma")    = pure $ JSLemma All
  74   parseJSON s =
  75     prependFailure "parsing spell failed, "
  76     (typeMismatch "Spell" s)
  77
  78 data JSRequest =
  79   JSRequest { _jsRequest_data     :: !Text
  80             , _jsRequest_format   :: !Text
  81             , _jsRequest_grouping :: !(Maybe Text)
  82             , _jsRequest_spell    :: !JSSpell }
  83   deriving (Show)
  84
  85 -- "spell" options:
  86 -- https://nlu.johnsnowlabs.com/docs/en/spellbook
  87
  88 deriveJSON (unPrefix "_jsRequest_") ''JSRequest
  89
  90 -- | JohnSnow NLP works via asynchronous tasks: send a query and get a
  91 -- task in response. One must poll for task status and then get it's
  92 -- result.
  93 data JSAsyncTask =
  94   JSAsyncTask { _jsAsyncTask_uuid :: !Text }
  95   deriving (Show)
  96
  97 deriveJSON (unPrefix "_jsAsyncTask_") ''JSAsyncTask
  98
  99 -- | Task status.
 100 data JSAsyncTaskStatus =
 101   JSAsyncTaskStatus { _jsAsyncTaskStatus_code    :: !Text
 102                     , _jsAsyncTaskStatus_message :: !(Maybe Text) }
 103   deriving (Show)
 104
 105 taskReady :: JSAsyncTaskStatus -> Bool
 106 taskReady (JSAsyncTaskStatus { .. }) = _jsAsyncTaskStatus_code == "success"
 107
 108 --deriveJSON (unPrefix "_jsAsyncTaskStatus_") ''JSAsyncTaskStatus
 109 instance FromJSON JSAsyncTaskStatus where
 110   parseJSON (Object v) = do
 111     status <- v .: "status"
 112     code <- status .: "code"
 113     message <- status .:? "message"
 114     pure $ JSAsyncTaskStatus { _jsAsyncTaskStatus_code = code
 115                              , _jsAsyncTaskStatus_message = message }
 116   parseJSON s =
 117     prependFailure "parsing status failed"
 118     (typeMismatch "status" s)
 119
 120 -- | Response for our query. The `Maybe` types are here because we
 121 -- combine 2 types of responses into one: `pos` and `lemma`.
 122 data JSAsyncTaskResponse =
 123   JSAsyncTaskResponse { _jsAsyncTaskResponse_index    :: Map Text Int
 124                       , _jsAsyncTaskResponse_document :: Map Text Text
 125                       , _jsAsyncTaskResponse_sentence :: Map Text [Text]
 126                       , _jsAsyncTaskResponse_lem      :: Maybe (Map Text [Text])
 127                       , _jsAsyncTaskResponse_pos      :: Maybe (Map Text [POS])
 128                       , _jsAsyncTaskResponse_token    :: Map Text [Text] }
 129   deriving (Show)
 130
 131 deriveJSON (unPrefix "_jsAsyncTaskResponse_") ''JSAsyncTaskResponse
 132 makeLenses ''JSAsyncTaskResponse
 133
 134 -- | We need to combine 2 responses: `pos` and `lemma` spells.
 135 jsAsyncTaskResponseToSentences :: JSAsyncTaskResponse -> JSAsyncTaskResponse -> PosSentences
 136 jsAsyncTaskResponseToSentences jsPos jsLemma =
 137   PosSentences { _sentences }
 138   where
 139     _sentences = Map.elems $ Map.mapWithKey mapSentence (jsPos ^. jsAsyncTaskResponse_sentence)
 140     mapSentence idx sentence = Sentence { _sentenceIndex = sIndex
 141                                          , _sentenceTokens = sTokens }
 142       where
 143         sIndex = Map.findWithDefault (-1) idx (jsPos ^. jsAsyncTaskResponse_index)
 144         lemmas = fromMaybe [] $
 145           if Just sentence == Map.lookup idx (jsLemma ^. jsAsyncTaskResponse_sentence) then
 146             Map.lookup idx $ fromMaybe Map.empty (jsLemma ^. jsAsyncTaskResponse_lem)
 147           else
 148             Nothing
 149         sTokens = imap mapPosToken $ zip (Map.findWithDefault [] idx $ fromMaybe Map.empty (jsPos ^. jsAsyncTaskResponse_pos))
 150                                          (Map.findWithDefault [] idx (jsPos ^. jsAsyncTaskResponse_token))
 151         mapPosToken idx' (pos, token) = Token { _tokenIndex = -1
 152                                               , _tokenWord = token
 153                                               , _tokenOriginalText = ""
 154                                               , _tokenLemma = fromMaybe "" $ (LS.!!) lemmas idx'
 155                                               , _tokenCharacterOffsetBegin = -1
 156                                               , _tokenCharacterOffsetEnd = -1
 157                                               , _tokenPos = Just pos
 158                                               , _tokenNer = Nothing
 159                                               , _tokenBefore = Nothing
 160                                               , _tokenAfter = Nothing }
 161
 162 -----------------------------------------------------
 163
 164 jsRequest :: Text -> JSSpell -> IO JSAsyncTask
 165 jsRequest t s = do
 166   url <- parseRequest $ "POST http://localhost:5000/api/results"
 167   let jsReq = JSRequest { _jsRequest_data     = t
 168                         , _jsRequest_format   = "text"
 169                         , _jsRequest_grouping = Nothing
 170                         , _jsRequest_spell    = s }
 171   let request = setRequestBodyLBS (encode jsReq) url
 172   task <- httpJSON request :: IO (Response JSAsyncTask)
 173   pure $ getResponseBody task
 174
 175 jsTaskStatus :: JSAsyncTask -> IO JSAsyncTaskStatus
 176 jsTaskStatus (JSAsyncTask uuid) = do
 177   url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid <> "/status"
 178   status <- httpJSON url
 179   pure $ getResponseBody status
 180
 181 jsTaskResponse :: JSAsyncTask -> IO JSAsyncTaskResponse
 182 jsTaskResponse (JSAsyncTask uuid) = do
 183   url <- parseRequest $ unpack $ "GET http://localhost:5000/api/results/" <> uuid
 184   result <- httpJSON url
 185   pure $ getResponseBody result
 186
 187 waitForJsTask :: JSAsyncTask -> IO JSAsyncTaskResponse
 188 waitForJsTask jsTask = wait' 0
 189   where
 190     wait' :: Int -> IO JSAsyncTaskResponse
 191     wait' counter = do
 192       status <- jsTaskStatus jsTask
 193       if taskReady status then
 194         jsTaskResponse jsTask
 195       else
 196         if counter > 60 then
 197           panic "[waitForJsTask] waited for 1 minute and still no answer from JohnSnow NLP"
 198         else do
 199           -- printDebug "[waitForJsTask] task not ready, waiting" counter
 200           _ <- threadDelay $ 1000000*1
 201           wait' $ counter + 1
 202
 203 getPosTagAndLems :: Lang -> Text -> IO PosSentences
 204 getPosTagAndLems l t = do
 205   jsPosTask   <- jsRequest t (JSPOS   l)
 206   jsLemmaTask <- jsRequest t (JSLemma l)
 207
 208   -- wait for both tasks
 209   jsPos <- waitForJsTask jsPosTask
 210   jsLemma <- waitForJsTask jsLemmaTask
 211
 212   pure $ jsAsyncTaskResponseToSentences jsPos jsLemma
 213
 214 nlp :: Lang -> Text -> IO PosSentences
 215 nlp = getPosTagAndLems