{-|
Module      : Gargantext.Ngrams.CoreNLP
Description : CoreNLP module
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
-}
{-# LANGUAGE DataKinds         #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeOperators     #-}
module Gargantext.Ngrams.CoreNLP where
import Data.Aeson.TH (deriveJSON)
import GHC.Generics (Generic)
import Data.Monoid ((<>))
import GHC.Show (Show(..))
import Data.String.Conversions (cs) -- converts Text to the lazy ByteString request body

import Gargantext.Types.Main (Language(..))
import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text)

import Network.HTTP.Simple
data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
-- | Extract the (word, part-of-speech tag, named-entity tag) triple of a
--   parsed 'Token'.
token2text :: Token -> (Text, Text, Text)
token2text (Token _ w _ _ _ _ p n _ _) = (w, p, n)
data Sentence = Sentence { _sentenceIndex  :: Int
                         , _sentenceTokens :: [Token]
                         } deriving (Show, Generic)

$(deriveJSON (unPrefix "_sentence") ''Sentence)
data Properties = Properties { _propertiesAnnotators   :: Text
                             , _propertiesOutputFormat :: Text
                             } deriving (Show, Generic)

$(deriveJSON (unPrefix "_properties") ''Properties)
data Sentences = Sentences { _sentences :: [Sentence] }
  deriving (Show, Generic)

$(deriveJSON (unPrefix "_") ''Sentences)
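
-- Abridged sample of the server answer that the derived instances above
-- decode into 'Sentences' (field set assumed from the CoreNLP JSON output
-- format):
--
-- > { "sentences": [
-- >     { "index": 0
-- >     , "tokens":
-- >       [ { "index": 1, "word": "Hello", "originalText": "Hello"
-- >         , "lemma": "hello"
-- >         , "characterOffsetBegin": 0, "characterOffsetEnd": 5
-- >         , "pos": "UH", "ner": "O", "before": "", "after": " " } ] } ] }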
71 -- "tokenize.language" : "fr",
72 -- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
73 -- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
74 -- // dependency parser
75 -- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
76 -- "depparse.language" : "french",
77 -- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
78 -- "ssplit.newlineIsSentenceBreak": "always"
-- | Query the CoreNLP server and decode its JSON answer. The properties
--   string selects the annotators to run for the given language.
corenlp :: Language -> Text -> IO Sentences
corenlp lang txt = do
  let properties = case lang of
        EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  let request = setRequestBodyLBS (cs txt) url
  response <- httpJSON request
  pure (getResponseBody response :: Sentences)
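
-- 'corenlp' assumes a Stanford CoreNLP server already listening on
-- localhost:9000, typically started with something like (memory flag and
-- classpath are assumptions depending on the local install):
--
-- > java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000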
-- Part-of-speech example:
-- tokenWith _tokenPos EN "Hello world."
-- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]

-- Named-entity-recognition example:
-- tokenWith _tokenNer EN "Hello world of Peter."
-- == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
-- | Annotate a text with CoreNLP and pair each word with the result of the
--   accessor @f@ (e.g. '_tokenPos' or '_tokenNer'), one list per sentence.
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
                   . map _sentenceTokens
                   . _sentences
                 <$> corenlp lang s
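
-- Convenience specializations of 'tokenWith' (a sketch; 'posTags' and
-- 'nerTags' are illustrative names, not part of the original API):
posTags :: Language -> Text -> IO [[(Text, Text)]]
posTags = tokenWith _tokenPos

nerTags :: Language -> Text -> IO [[(Text, Text)]]
nerTags = tokenWith _tokenNer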