-- gargantext: src/Data/Gargantext/NLP/CoreNLP.hs
-- [FEAT] Ngrams extractor in English with tests: ok.
-- Needs factoring of the pattern matching.
1 {-# LANGUAGE DataKinds #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE TypeOperators #-}
4 {-# LANGUAGE TemplateHaskell #-}
5
6 module Data.Gargantext.NLP.CoreNLP where
7
8 import Data.Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import GHC.Generics
11
12 import Data.Gargantext.Prelude
13 import Data.Gargantext.Utils.Prefix (unPrefix)
14 import Data.Text (Text)
15
16 import qualified Data.ByteString.Char8 as S8
17 import qualified Data.Yaml as Yaml
18 import Network.HTTP.Simple
19
20
-- | One token as annotated by CoreNLP: the surface word, its lemma,
--   character offsets, part-of-speech tag and named-entity label.
--   Field names mirror the CoreNLP JSON keys, prefixed with @_token@;
--   'unPrefix' strips the prefix again for the derived JSON instances.
data Token = Token
  { _tokenIndex                :: Int
  , _tokenWord                 :: Text
  , _tokenOriginalText         :: Text
  , _tokenLemma                :: Text
  , _tokenCharacterOffsetBegin :: Int
  , _tokenCharacterOffsetEnd   :: Int
  , _tokenPos                  :: Text        -- part-of-speech tag
  , _tokenNer                  :: Text        -- named-entity label
  , _tokenBefore               :: Maybe Text
  , _tokenAfter                :: Maybe Text
  } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
33
-- | Keep only the word, its part-of-speech tag and its named-entity
--   label, dropping all positional and offset information.
token2text :: Token -> (Text, Text, Text)
token2text tok = (_tokenWord tok, _tokenPos tok, _tokenNer tok)
36
37
-- | One sentence of the annotated document: its position in the
--   document and the tokens it contains.
data Sentence = Sentence
  { _sentenceIndex  :: Int
  , _sentenceTokens :: [Token]
  } deriving (Show, Generic)

$(deriveJSON (unPrefix "_sentence") ''Sentence)
43
-- | The @properties@ object sent to the CoreNLP server: which
--   annotators to run and in which format to answer.
data Properties = Properties
  { _propertiesAnnotators   :: Text
  , _propertiesOutputFormat :: Text
  } deriving (Show, Generic)

$(deriveJSON (unPrefix "_properties") ''Properties)
49
-- | Top-level CoreNLP response: the list of annotated sentences.
--   Plain 'Generic' instances suffice here because the field name
--   matches the JSON key exactly (no prefix to strip).
data Sentences = Sentences
  { sentences :: [Sentence]
  } deriving (Show, Generic)

instance ToJSON Sentences
instance FromJSON Sentences
54
55
-- Example CoreNLP server configuration for French, kept for reference
-- (the requests below use the server's default English models):
-- "fr" : {
--     "tokenize.language" : "fr",
--     "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
--     "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
--     // dependency parser
--     "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
--     "depparse.language" : "french",
--     "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
--     "ssplit.newlineIsSentenceBreak": "always"
-- },
68
69
-- | Annotate a text with CoreNLP and pretty-print the result to stdout
--   as YAML.
--
--   Delegates the request to 'corenlp' instead of duplicating the
--   endpoint URL and request construction, so the server address and
--   annotator list live in a single place.
corenlpPretty :: String -> IO ()
corenlpPretty txt = do
  sents <- corenlp txt
  S8.putStrLn $ Yaml.encode sents
80
-- | Send a text to the CoreNLP server expected on
--   <http://localhost:9000> and decode its JSON answer into
--   'Sentences'.  Annotators requested: tokenize, ssplit, pos, ner.
corenlp :: String -> IO Sentences
corenlp txt = do
  endpoint <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  response <- httpJSON (setRequestBodyJSON txt endpoint)
  pure (getResponseBody response :: Sentences)
87
-- | Annotate a text with 'corenlp' and pair every token's surface word
--   with the field selected by @f@, one list per sentence.
--
-- Part-of-speech example:
--
-- > tokenWith _tokenPos "Hello world."
-- > == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named-entity-recognition example:
--
-- > tokenWith _tokenNer "Hello world of Peter."
-- > == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> String -> IO [[(Text, t)]]
tokenWith f s = extract <$> corenlp s
  where
    extract    = pm (pm annotate) . pm _sentenceTokens . sentences
    annotate t = (_tokenWord t, f t)
98
99
100