2 Module : Gargantext.Ngrams.CoreNLP
3 Description : CoreNLP module
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE TypeOperators #-}
19 module Gargantext.Ngrams.CoreNLP where
import Data.Aeson.TH (deriveJSON)
import Data.Monoid ((<>))
import Data.Text (Text)
import GHC.Show (Show(..))
import Network.HTTP.Simple
import qualified Data.ByteString.Char8 as S8
import qualified Data.ByteString.Lazy as LBS
import qualified Data.String as Str
import qualified Data.Text.Encoding as TE
import qualified Data.Yaml as Yaml

import Gargantext.Prelude
import Gargantext.Types.Main (Language(..))
import Gargantext.Utils.Prefix (unPrefix)
-- | One token of a CoreNLP JSON response.
--
-- The record has ten fields, in the order relied upon by the positional
-- pattern in 'token2text': word is field 2, part-of-speech tag field 7 and
-- named-entity tag field 8.
--
-- NOTE(review): field names presumably map to CoreNLP's JSON keys once
-- @unPrefix "_token"@ has removed the prefix — confirm in
-- "Gargantext.Utils.Prefix".
data Token = Token
  { _tokenIndex                :: Int
    -- ^ Index of the token as reported by the server.
  , _tokenWord                 :: Text
    -- ^ Token text; this is the value 'tokenWith' pairs with its result.
  , _tokenOriginalText         :: Text
  , _tokenLemma                :: Text
    -- ^ NOTE(review): name and type assumed from CoreNLP's JSON token
    -- schema (the key between @originalText@ and @characterOffsetBegin@)
    -- — confirm.
  , _tokenCharacterOffsetBegin :: Int
  , _tokenCharacterOffsetEnd   :: Int
  , _tokenPos                  :: Text
    -- ^ Part-of-speech tag.
  , _tokenNer                  :: Text
    -- ^ Named-entity tag.
  , _tokenBefore               :: Maybe Text
  , _tokenAfter                :: Maybe Text
  } deriving (Show, Generic)

$(deriveJSON (unPrefix "_token") ''Token)
-- | Project a 'Token' onto the triple (second field, seventh field, eighth
-- field) of its constructor, returned in that order.
token2text :: Token -> (Text, Text, Text)
token2text tok = case tok of
  Token _ second _ _ _ _ seventh eighth _ _ -> (second, seventh, eighth)
-- | One sentence of a CoreNLP response: its index and its tokens.
--
-- NOTE(review): JSON keys are presumably the field names with the
-- @_sentence@ prefix removed by 'unPrefix' — confirm in
-- "Gargantext.Utils.Prefix".
data Sentence = Sentence
  { _sentenceIndex  :: Int
    -- ^ Index of the sentence as reported by the server.
  , _sentenceTokens :: [Token]
    -- ^ Tokens of the sentence.
  }
  deriving (Generic, Show)

$(deriveJSON (unPrefix "_sentence") ''Sentence)
-- | Typed counterpart of the @properties@ JSON object: an annotator list and
-- an output format, both as text.
--
-- NOTE(review): JSON keys are presumably the field names with the
-- @_properties@ prefix removed by 'unPrefix' — confirm in
-- "Gargantext.Utils.Prefix".
data Properties = Properties
  { _propertiesAnnotators   :: Text
  , _propertiesOutputFormat :: Text
  }
  deriving (Generic, Show)

$(deriveJSON (unPrefix "_properties") ''Properties)
-- | Top-level value decoded from a CoreNLP response: the list of sentences.
data Sentences = Sentences
  { _sentences :: [Sentence]
  }
  deriving (Generic, Show)

$(deriveJSON (unPrefix "_") ''Sentences)
73 -- "tokenize.language" : "fr",
74 -- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
75 -- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
76 -- // dependency parser
77 -- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
78 -- "depparse.language" : "french",
79 -- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
80 -- "ssplit.newlineIsSentenceBreak": "always"
-- | Annotate @txt@ with the English pipeline and print the decoded
-- 'Sentences' to stdout as YAML.
--
-- The request is delegated to 'corenlp' with 'EN', which builds the same URL
-- and properties this function previously duplicated inline.
corenlpPretty :: Text -> IO ()
corenlpPretty txt = do
  sentences <- corenlp EN txt
  S8.putStrLn $ Yaml.encode sentences
-- | Send @txt@ to the CoreNLP server at @http://localhost:9000@ and decode
-- the JSON answer into 'Sentences'.
--
-- The annotator configuration is chosen from @lang@ and passed in the
-- @properties@ query parameter. The text itself travels as the raw UTF-8
-- request body: encoding it as JSON would wrap it in double quotes (and
-- escape newlines and quotes), and the server would then tokenize those
-- extra characters as part of the document.
corenlp :: Language -> Text -> IO Sentences
corenlp lang txt = do
  -- NOTE(review): only EN and FR are handled here — confirm that 'Language'
  -- has no other constructors, otherwise this case is partial.
  let properties = case lang of
        EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  -- An explicit text/plain content type states the body's charset and marks
  -- it as plain text rather than form data.
  let contentType = [S8.pack "text/plain; charset=utf-8"]
      body        = LBS.fromStrict (TE.encodeUtf8 txt)
      request     = setRequestHeader (Str.fromString "Content-Type") contentType
                      (setRequestBodyLBS body url)
  response <- httpJSON request
  pure (getResponseBody response :: Sentences)
-- | Run 'corenlp' on @s@ and pair every token's word with @f@ applied to
-- that token, keeping one inner list per sentence.
--
-- The outputs below are the ones recorded in the original examples. Those
-- examples named the function @parseWith@ and omitted the language argument;
-- 'EN' is assumed here.
--
-- Part-of-speech example:
--
-- > tokenWith _tokenPos EN "Hello world."
-- > == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named-entity-recognition example:
--
-- > tokenWith _tokenNer EN "Hello world of Peter."
-- > == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith f lang s = do
  annotated <- corenlp lang s
  pure
    [ [ (_tokenWord tok, f tok) | tok <- _sentenceTokens sentence ]
    | sentence <- _sentences annotated
    ]