1 {-# LANGUAGE DataKinds #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE TypeOperators #-}
4 {-# LANGUAGE TemplateHaskell #-}
6 module Data.Gargantext.Ngrams.CoreNLP where
9 import Data.Aeson.TH (deriveJSON)
11 import Data.Monoid ((<>))
13 import Data.Gargantext.Types.Main (Language(..))
14 import Data.Gargantext.Prelude
15 import Data.Gargantext.Utils.Prefix (unPrefix)
16 import Data.Text (Text)
18 import qualified Data.ByteString.Char8 as S8
19 import qualified Data.Yaml as Yaml
20 import Network.HTTP.Simple
-- | One token as produced by the CoreNLP annotators.
--
-- Fix: the record had been truncated to 6 fields, but 'token2text' below
-- pattern-matches a 10-ary 'Token' constructor and 'tokenWith' uses
-- '_tokenWord' (with '_tokenPos' / '_tokenNer' shown in the usage
-- examples), so the four missing fields are restored here.  The field
-- order must stay in sync with the positional pattern in 'token2text'.
data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text  -- ^ surface form, used by 'tokenWith'
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text  -- NOTE(review): name assumed from CoreNLP's JSON "lemma" key — confirm
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text  -- ^ part-of-speech tag (see examples below)
                   , _tokenNer                  :: Text  -- ^ named-entity tag (see examples below)
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
-- Derive To/FromJSON with the "_token" prefix stripped, so field names
-- line up with the keys in the CoreNLP JSON response.
$(deriveJSON (unPrefix "_token") ''Token)
-- | Project a 'Token' onto its (word, part-of-speech, named-entity) triple.
token2text :: Token -> (Text, Text, Text)
token2text tok = (_tokenWord tok, _tokenPos tok, _tokenNer tok)
-- | One sentence of a CoreNLP response: its position in the document
-- plus its annotated tokens.
data Sentence = Sentence { _sentenceIndex :: Int
                         , _sentenceTokens :: [Token]
                         } deriving (Show, Generic)
-- JSON instances with the "_sentence" prefix stripped from field names.
$(deriveJSON (unPrefix "_sentence") ''Sentence)
-- | Request properties for the CoreNLP server: which annotators to run
-- and the wire format of the reply.
-- NOTE(review): not referenced by the code visible in this module —
-- 'corenlp' builds its properties string by hand; possibly used elsewhere.
data Properties = Properties { _propertiesAnnotators :: Text
                             , _propertiesOutputFormat :: Text
                             } deriving (Show, Generic)
-- JSON instances with the "_properties" prefix stripped from field names.
$(deriveJSON (unPrefix "_properties") ''Properties)
-- | Top-level shape of a CoreNLP JSON response: a list of sentences.
data Sentences = Sentences { sentences :: [Sentence]}
  deriving (Show, Generic)
-- Generic instances: the field name "sentences" matches the JSON key
-- directly, so no prefix stripping (deriveJSON/unPrefix) is needed here.
instance ToJSON Sentences
instance FromJSON Sentences
60 -- "tokenize.language" : "fr",
61 -- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
62 -- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
63 -- // dependency parser
64 -- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
65 -- "depparse.language" : "french",
66 -- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
67 -- "ssplit.newlineIsSentenceBreak": "always"
-- | Debug helper: send @txt@ to a CoreNLP server assumed to be
-- listening on localhost:9000 (tokenize\/ssplit\/pos\/ner pipeline,
-- JSON output), decode the reply as 'Sentences' and print it to
-- stdout rendered as YAML.
corenlpPretty :: String -> IO ()
corenlpPretty txt = do
  req <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  resp <- httpJSON (setRequestBodyJSON txt req)
  -- Re-encode the decoded structure as YAML purely for readability.
  S8.putStrLn (Yaml.encode (getResponseBody resp :: Sentences))
-- | Query the CoreNLP server (expected at localhost:9000) with @txt@
-- and decode the JSON reply into 'Sentences'.
--
-- The annotator pipeline depends on @lang@: French adds the French
-- POS-tagger and lexicalised-parser models on top of the default
-- tokenize\/ssplit\/pos\/ner pipeline.
--
-- Fix: the defining equation was missing — the type signature was
-- followed directly by a floating @do@ block, which does not parse.
corenlp :: Language -> String -> IO Sentences
corenlp lang txt = do
    let properties = case lang of
          EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
          -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
          FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
    url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
    let request = setRequestBodyJSON txt url
    response <- httpJSON request
    pure (getResponseBody response :: Sentences)
95 -- Part Of Speech example
96 -- parseWith _tokenPos "Hello world."
97 -- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
99 -- Named Entity Recognition example
100 -- parseWith _tokenNer "Hello world of Peter."
101 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
-- | Run 'corenlp' on @s@ and, for every token of every sentence, pair
-- the token's word with the result of the selector @f@ (e.g.
-- '_tokenPos' or '_tokenNer' — see the examples above).
tokenWith :: (Token -> t) -> Language -> String -> IO [[(Text, t)]]
tokenWith f lang s = do
  result <- corenlp lang s
  let tokss = pm _sentenceTokens (sentences result)
  pure (pm (pm (\t -> (_tokenWord t, f t))) tokss)