1 {-# LANGUAGE DataKinds #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE TypeOperators #-}
4 {-# LANGUAGE TemplateHaskell #-}
5 {-# LANGUAGE NoImplicitPrelude #-}
7 module Gargantext.Ngrams.CoreNLP where
10 import Data.Aeson.TH (deriveJSON)
12 import Data.Monoid ((<>))
13 import GHC.Show (Show(..))
15 import Gargantext.Types.Main (Language(..))
16 import Gargantext.Prelude
17 import Gargantext.Utils.Prefix (unPrefix)
18 import Data.Text (Text)
20 import qualified Data.ByteString.Char8 as S8
21 import qualified Data.Yaml as Yaml
22 import Network.HTTP.Simple
-- | A single token of a CoreNLP-annotated sentence, decoded from the JSON
-- reply of the server.
--
-- The constructor carries ten fields: 'token2text' pattern-matches on ten
-- positional arguments and 'tokenWith' reads '_tokenWord', so the word,
-- lemma, part-of-speech and named-entity fields are declared here in the
-- positions that 'token2text' expects (word 2nd, pos 7th, ner 8th).
data Token = Token
  { _tokenIndex                :: Int
  , _tokenWord                 :: Text
    -- ^ Read by 'tokenWith' and returned first by 'token2text'.
  , _tokenOriginalText         :: Text
  , _tokenLemma                :: Text
    -- NOTE(review): name and type of this fourth field are inferred from the
    -- "lemma" key of CoreNLP's JSON token objects; nothing in this module
    -- reads it. Confirm against the server output actually consumed.
  , _tokenCharacterOffsetBegin :: Int
  , _tokenCharacterOffsetEnd   :: Int
  , _tokenPos                  :: Text
    -- ^ Returned second by 'token2text'.
  , _tokenNer                  :: Text
    -- ^ Returned third by 'token2text'.
  , _tokenBefore               :: Maybe Text
  , _tokenAfter                :: Maybe Text
  } deriving (Show, Generic)

-- JSON instances are generated by Template Haskell. 'unPrefix' receives the
-- "_token" prefix (presumably stripped from the field names to obtain the
-- JSON keys; see "Gargantext.Utils.Prefix").
$(deriveJSON (unPrefix "_token") ''Token)
-- | Project a 'Token' onto a triple of three of its textual fields.
--
-- The triple is built from the second, seventh and eighth constructor
-- arguments (presumably the word, its part-of-speech tag and its
-- named-entity tag; confirm against the 'Token' declaration).
token2text :: Token -> (Text, Text, Text)
token2text tok = case tok of
  Token _ word _ _ _ _ posTag nerTag _ _ -> (word, posTag, nerTag)
-- | One sentence of an annotated document: an integer index together with
-- the list of 'Token's that make it up.
data Sentence = Sentence
  { _sentenceIndex  :: Int
  , _sentenceTokens :: [Token]
    -- ^ Consumed by 'tokenWith' to walk the tokens of each sentence.
  }
  deriving (Show, Generic)

-- JSON instances come from Template Haskell; 'unPrefix' is handed the
-- "_sentence" prefix (presumably removed from field names to form JSON keys).
$(deriveJSON (unPrefix "_sentence") ''Sentence)
-- | Record with an annotators field and an output-format field, both 'Text'.
--
-- The request functions visible in this module ('corenlpPretty', 'corenlp')
-- build their @properties@ query parameter from string literals and do not
-- use this type.
data Properties = Properties
  { _propertiesAnnotators   :: Text
  , _propertiesOutputFormat :: Text
  }
  deriving (Show, Generic)

-- JSON instances come from Template Haskell; 'unPrefix' is handed the
-- "_properties" prefix (presumably removed from field names to form JSON keys).
$(deriveJSON (unPrefix "_properties") ''Properties)
-- | Wrapper around a list of 'Sentence's, held in the single field
-- 'sentences'. This is the type the HTTP responses are decoded to in
-- 'corenlpPretty' and 'corenlp'.
data Sentences = Sentences
  { sentences :: [Sentence]
  }
  deriving (Show, Generic)

-- Both instances are declared without method bodies, so they rely on the
-- classes' default implementations (presumably aeson's Generic-based ones).
instance ToJSON Sentences

instance FromJSON Sentences
62 -- "tokenize.language" : "fr",
63 -- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
64 -- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
65 -- // dependency parser
66 -- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
67 -- "depparse.language" : "french",
68 -- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
69 -- "ssplit.newlineIsSentenceBreak": "always"
-- | Send @txt@ as a JSON request body to the server at @localhost:9000@,
-- decode the reply as 'Sentences' and print it to stdout as YAML.
--
-- The annotators and output format are fixed in the request's @properties@
-- query parameter.
corenlpPretty :: Text -> IO ()
corenlpPretty txt = do
  baseRequest <- parseRequest endpoint
  reply <- httpJSON (setRequestBodyJSON txt baseRequest)
  -- Debugging aids kept from the original, still disabled:
  -- putStrLn $ "The status code was: " ++
  --            show (getResponseStatusCode reply)
  -- print $ getResponseHeader "Content-Type" reply
  let parsed = getResponseBody reply :: Sentences
  S8.putStrLn (Yaml.encode parsed)
  where
    -- Method, URL and properties in the single string 'parseRequest' takes.
    endpoint = "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
-- | Send @txt@ as a JSON request body to the server at @localhost:9000@ and
-- return the reply decoded as 'Sentences'.
--
-- The @properties@ query parameter depends on @lang@: 'EN' uses the plain
-- annotator list, 'FR' additionally names the French parse and POS models and
-- sets the tokenizer language. Only 'EN' and 'FR' are matched; if 'Language'
-- has other constructors they are not handled here (confirm against
-- "Gargantext.Types.Main").
corenlp :: Language -> Text -> IO Sentences
corenlp lang txt = do
  let properties = case lang of
        EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  -- 'parseRequest' takes the HTTP method and the URL in one string.
  url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  let request = setRequestBodyJSON txt url
  response <- httpJSON request
  pure (getResponseBody response :: Sentences)
-- | Annotate @s@ through 'corenlp' and, for every sentence, pair each
-- token's '_tokenWord' with the result of applying @f@ to that token.
--
-- The outer list has one entry per sentence, the inner list one pair per
-- token.
--
-- The recorded examples below call the function @parseWith@ and pass no
-- 'Language' argument; 'tokenWith' takes the language as its second argument.
--
-- Part Of Speech example
-- parseWith _tokenPos "Hello world."
-- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named Entity Recognition example
-- parseWith _tokenNer "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith f lang s = do
  parsed <- corenlp lang s
  pure
    [ [ (_tokenWord tok, f tok) | tok <- _sentenceTokens sentence ]
    | sentence <- sentences parsed
    ]