2 Module : Gargantext.Ngrams.CoreNLP
3 Description : CoreNLP module
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE TypeOperators #-}
19 module Gargantext.Ngrams.CoreNLP where
import Data.Aeson (FromJSON, ToJSON)
import Data.Aeson.TH (deriveJSON)
import Data.Monoid ((<>))
import Data.Text (Text)
import GHC.Show (Show(..))

import Gargantext.Prelude
import Gargantext.Types.Main (Language(..))
import Gargantext.Utils.Prefix (unPrefix)

import qualified Data.ByteString.Char8 as S8
import qualified Data.Yaml as Yaml
import Network.HTTP.Simple
-- | One token annotation as returned by the CoreNLP server.
--
-- NOTE(review): this view of the record appears truncated — 'token2text'
-- and 'tokenWith' below use ten constructor fields (including
-- '_tokenWord' and what look like part-of-speech and named-entity
-- fields) that are not visible here; confirm against the full file
-- before editing this declaration.
data Token = Token { _tokenIndex                :: Int        -- ^ position of the token within its sentence
                   , _tokenOriginalText         :: Text       -- ^ token text as it appeared in the input
                   , _tokenCharacterOffsetBegin :: Int        -- ^ start offset into the source text
                   , _tokenCharacterOffsetEnd   :: Int        -- ^ end offset into the source text
                   , _tokenBefore               :: Maybe Text -- ^ presumably the text preceding the token — TODO confirm
                   , _tokenAfter                :: Maybe Text -- ^ presumably the text following the token — TODO confirm
                   } deriving (Show, Generic)
-- JSON instances with the "_token" field prefix stripped, matching the
-- CoreNLP JSON key names.
$(deriveJSON (unPrefix "_token") ''Token)
-- | Project a 'Token' onto the triple (surface form, part-of-speech
-- tag, named-entity tag) by positional pattern match on the
-- constructor.
token2text :: Token -> (Text, Text, Text)
token2text (Token _ form _ _ _ _ posTag nerTag _ _) = (form, posTag, nerTag)
-- | One sentence of the CoreNLP response: its index within the
-- document and its token annotations, in order.
data Sentence  = Sentence { _sentenceIndex  :: Int      -- ^ index of the sentence in the analysed text
                          , _sentenceTokens :: [Token]  -- ^ the sentence's tokens, in order
                          } deriving (Show, Generic)

-- JSON instances with the "_sentence" field prefix stripped.
$(deriveJSON (unPrefix "_sentence") ''Sentence)
-- | CoreNLP request properties: which annotators to run and the
-- desired output format.
--
-- NOTE(review): not referenced by the visible code below — 'corenlp'
-- builds its properties JSON strings by hand instead.
data Properties = Properties { _propertiesAnnotators   :: Text  -- ^ comma-separated annotator list, e.g. "tokenize,ssplit,pos,ner"
                             , _propertiesOutputFormat :: Text  -- ^ e.g. "json"
                             } deriving (Show, Generic)

-- JSON instances with the "_properties" field prefix stripped.
$(deriveJSON (unPrefix "_properties") ''Properties)
-- | Top-level payload of a CoreNLP response: the list of annotated
-- sentences.
--
-- NOTE(review): unlike its neighbours these instances are
-- Generic-derived, so the JSON key is presumably the literal field
-- name @_sentences@ (underscore included) — confirm this matches the
-- server's output.
data Sentences = Sentences { _sentences :: [Sentence]}
  deriving (Show, Generic)
instance ToJSON Sentences
instance FromJSON Sentences
74 -- "tokenize.language" : "fr",
75 -- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
76 -- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
77 -- // dependency parser
78 -- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
79 -- "depparse.language" : "french",
80 -- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
81 -- "ssplit.newlineIsSentenceBreak": "always"
-- | Annotate @txt@ via a CoreNLP server expected at
-- @localhost:9000@ (annotators: tokenize, ssplit, pos, ner) and
-- pretty-print the decoded response to stdout, re-encoded as YAML.
corenlpPretty :: Text -> IO ()
corenlpPretty txt = do
  req <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  resp <- httpJSON (setRequestBodyJSON txt req)
  S8.putStrLn (Yaml.encode (getResponseBody resp :: Sentences))
-- | Annotate @txt@ via a CoreNLP server expected at @localhost:9000@
-- and decode the JSON response into 'Sentences'.
--
-- The annotator properties depend on the language: French supplies its
-- own tokenizer, POS-tagger and parser models, English relies on the
-- server defaults.
--
-- NOTE(review): the defining equation line was missing from the
-- presented source (extraction artifact); it is restored here from the
-- signature and the body's uses of @lang@ and @txt@.
corenlp :: Language -> Text -> IO Sentences
corenlp lang txt = do
  let properties = case lang of
        EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  let request = setRequestBodyJSON txt url
  response <- httpJSON request
  pure (getResponseBody response :: Sentences)
109 -- Part Of Speech example
-- tokenWith _tokenPos EN "Hello world."
-- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
113 -- Named Entity Recognition example
-- tokenWith _tokenNer EN "Hello world of Peter."
-- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
-- | Annotate @txt@ with CoreNLP and pair, per sentence, each token's
-- word with the result of applying @annotate@ to the full 'Token'
-- (e.g. pass '_tokenPos' for POS tags, '_tokenNer' for NER tags).
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith annotate lang txt = do
  Sentences sents <- corenlp lang txt
  pure [ [ (_tokenWord tok, annotate tok)
         | tok <- _sentenceTokens sent ]
       | sent <- sents ]