1 {-# LANGUAGE DataKinds #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE TypeOperators #-}
4 {-# LANGUAGE TemplateHaskell #-}
6 module Data.Gargantext.Ngrams.CoreNLP where
9 import Data.Aeson.TH (deriveJSON)
12 import Data.Gargantext.Prelude
13 import Data.Gargantext.Utils.Prefix (unPrefix)
14 import Data.Text (Text)
16 import qualified Data.ByteString.Char8 as S8
17 import qualified Data.Yaml as Yaml
18 import Network.HTTP.Simple
-- | One token of a CoreNLP JSON response.
--
-- Field order is significant: 'token2text' matches this constructor
-- positionally (ten arguments) and picks the 2nd, 7th and 8th fields, so
-- '_tokenWord', '_tokenPos' and '_tokenNer' must stay at those positions.
--
-- NOTE(review): '_tokenWord', '_tokenPos' and '_tokenNer' are required by
-- other definitions in this module; '_tokenLemma' is assumed from the
-- CoreNLP token layout (presumably a textual lemma) -- confirm against the
-- server output.
data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)

-- JSON instances are generated by Template Haskell; 'unPrefix' presumably
-- strips the "_token" prefix from the field names to obtain the JSON keys
-- (helper defined in Data.Gargantext.Utils.Prefix -- verify there).
$(deriveJSON (unPrefix "_token") ''Token)
-- | Project a 'Token' onto a triple built from its 2nd, 7th and 8th
-- constructor fields, in that order.
--
-- The match is positional over all ten constructor arguments, so it is
-- sensitive to the field order of 'Token'.
token2text :: Token -> (Text, Text, Text)
token2text tok =
  case tok of
    Token _ word _ _ _ _ pos ner _ _ -> (word, pos, ner)
-- | One sentence of a response: an index and the tokens it contains.
data Sentence = Sentence
  { _sentenceIndex  :: Int
  , _sentenceTokens :: [Token]
  }
  deriving (Show, Generic)

-- Template Haskell derives the JSON instances; 'unPrefix' presumably drops
-- the "_sentence" prefix from the field names (verify in
-- Data.Gargantext.Utils.Prefix).
deriveJSON (unPrefix "_sentence") ''Sentence
-- | Request properties: the annotator list and the output format, both kept
-- as plain text.
data Properties = Properties
  { _propertiesAnnotators   :: Text
  , _propertiesOutputFormat :: Text
  }
  deriving (Show, Generic)

-- Template Haskell derives the JSON instances; 'unPrefix' presumably drops
-- the "_properties" prefix from the field names (verify in
-- Data.Gargantext.Utils.Prefix).
deriveJSON (unPrefix "_properties") ''Properties
-- | Top-level wrapper of a decoded response: the list of its sentences.
data Sentences = Sentences
  { sentences :: [Sentence]
  }
  deriving (Show, Generic)

-- Both instances have empty bodies, so they use aeson's Generic-based
-- default implementations.
instance FromJSON Sentences
instance ToJSON Sentences
-- Example CoreNLP properties for French input. They are kept here for
-- reference only: the requests in this module set just "annotators" and
-- "outputFormat".
-- "tokenize.language" : "fr",
-- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
-- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
-- // dependency parser
-- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
-- "depparse.language" : "french",
-- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
-- "ssplit.newlineIsSentenceBreak": "always"
-- | Annotate @txt@ with the CoreNLP server and print the decoded result to
-- stdout, re-encoded as YAML.
--
-- The request itself is delegated to 'corenlp' instead of being rebuilt
-- here, so the URL and annotator list live in a single place. Any exception
-- raised by 'corenlp' propagates unchanged.
corenlpPretty :: String -> IO ()
corenlpPretty txt = do
  parsed <- corenlp txt
  S8.putStrLn $ Yaml.encode parsed
-- | Send @txt@ to the CoreNLP server and decode its JSON answer.
--
-- The request is a POST to the hard-coded @http://localhost:9000/@ with the
-- @tokenize,ssplit,pos,ner@ annotators and JSON output requested through the
-- @properties@ query parameter.
--
-- The body is attached with 'setRequestBodyJSON', i.e. @txt@ is sent
-- JSON-encoded (as a quoted string literal), not as raw text.
-- NOTE(review): confirm that the server is meant to receive the quoted form.
--
-- No error handling is done here: failures in 'parseRequest' or 'httpJSON'
-- (including a response body that does not decode as 'Sentences') surface as
-- exceptions in 'IO'.
corenlp :: String -> IO Sentences
corenlp txt = do
  baseRequest <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  let request = setRequestBodyJSON txt baseRequest
  response <- httpJSON request
  pure (getResponseBody response :: Sentences)
-- | Annotate a text with 'corenlp' and, for every sentence, pair each
-- token's word with the value @f@ extracts from that token.
--
-- The outer list has one entry per sentence, the inner list one entry per
-- token of that sentence.
--
-- Part Of Speech example
-- tokenWith _tokenPos "Hello world."
-- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named Entity Recognition example
-- tokenWith _tokenNer "Hello world of Peter."
-- == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> String -> IO [[(Text, t)]]
tokenWith f s = perSentence <$> corenlp s
  where
    -- One inner list per sentence of the decoded response.
    perSentence parsed = pm perToken (sentences parsed)
    -- Tag every token of a single sentence.
    perToken sentence = pm tagged (_sentenceTokens sentence)
    -- Pair the token's word with the caller-selected attribute.
    tagged t = (_tokenWord t, f t)