]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/CoreNLP.hs
[PATH] Data.Gargantext -> Gargantext.
[gargantext.git] / src / Gargantext / Ngrams / CoreNLP.hs
1 {-# LANGUAGE DataKinds #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE TypeOperators #-}
4 {-# LANGUAGE TemplateHaskell #-}
5
6 module Gargantext.Ngrams.CoreNLP where
7
8 import Data.Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import GHC.Generics
11 import Data.Monoid ((<>))
12
13 import Gargantext.Types.Main (Language(..))
14 import Gargantext.Prelude
15 import Gargantext.Utils.Prefix (unPrefix)
16 import Data.Text (Text)
17
18 import qualified Data.ByteString.Char8 as S8
19 import qualified Data.Yaml as Yaml
20 import Network.HTTP.Simple
21
22
-- | One token of the CoreNLP response: the surface word, its lemma,
-- its character offsets in the input, and its POS / NER tags
-- (produced by the @pos@ and @ner@ annotators requested in 'corenlp').
-- Field order matters: 'token2text' pattern-matches positionally.
data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
-- JSON keys are the field names without the "_token" prefix.
$(deriveJSON (unPrefix "_token") ''Token)
35
-- | Project a token onto its surface word, POS tag and NER tag.
token2text :: Token -> (Text, Text, Text)
token2text tok = (_tokenWord tok, _tokenPos tok, _tokenNer tok)
38
39
-- | One sentence of the CoreNLP response: its index in the document
-- and its annotated tokens.
data Sentence = Sentence { _sentenceIndex  :: Int
                         , _sentenceTokens :: [Token]
                         } deriving (Show, Generic)

-- JSON keys are the field names without the "_sentence" prefix.
$(deriveJSON (unPrefix "_sentence") ''Sentence)
45
-- | Request properties for the CoreNLP server ("annotators" and
-- "outputFormat" JSON keys).
-- NOTE(review): currently unused — 'corenlp' builds its properties
-- string by hand; consider serializing this type instead.
data Properties = Properties { _propertiesAnnotators   :: Text
                             , _propertiesOutputFormat :: Text
                             } deriving (Show, Generic)

-- JSON keys are the field names without the "_properties" prefix.
$(deriveJSON (unPrefix "_properties") ''Properties)
51
-- | Top-level CoreNLP response: a list of annotated sentences.
-- The field name matches the JSON key exactly, so the generic
-- (derived-by-default) instances are sufficient here.
data Sentences = Sentences { sentences :: [Sentence] }
  deriving (Show, Generic)

instance ToJSON   Sentences
instance FromJSON Sentences
56
57
58 -- request =
59 -- "fr" : {
60 -- "tokenize.language" : "fr",
61 -- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
62 -- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
63 -- // dependency parser
64 -- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
65 -- "depparse.language" : "french",
66 -- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
67 -- "ssplit.newlineIsSentenceBreak": "always"
68 -- },
69 --
70
71
-- | Debugging helper: annotate an (English) text via 'corenlp' and
-- pretty-print the response as YAML on stdout.
--
-- Previously this duplicated the whole EN request pipeline of
-- 'corenlp' (same properties string, same URL, same decoding);
-- it now delegates so the request logic lives in one place.
corenlpPretty :: String -> IO ()
corenlpPretty txt = do
  annotated <- corenlp EN txt
  S8.putStrLn $ Yaml.encode annotated
82
-- | Annotate a text with a CoreNLP server expected on
-- @localhost:9000@: tokenization, sentence splitting, POS tagging
-- and NER. The properties string selects language-specific models.
corenlp :: Language -> String -> IO Sentences
corenlp lang txt = do
  url      <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  response <- httpJSON (setRequestBodyJSON txt url)
  pure (getResponseBody response :: Sentences)
  where
    -- Raw JSON passed in the query string; CoreNLP parses it server-side.
    properties = case lang of
      EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
      -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
      FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
93
-- | Annotate a text and project every token onto a pair of its
-- surface word and one chosen attribute.
--
-- Part Of Speech example:
-- > tokenWith _tokenPos EN "Hello world."
-- > == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named Entity Recognition example:
-- > tokenWith _tokenNer EN "Hello world of Peter."
-- > == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> String -> IO [[(Text, t)]]
tokenWith f lang s = do
  result <- corenlp lang s
  pure (pm (pm annotate) (pm _sentenceTokens (sentences result)))
  where
    -- Pair each token's word with the requested attribute.
    annotate t = (_tokenWord t, f t)
104
105
106