]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/CoreNLP.hs
[PATH] Data.Gargantext -> Gargantext.
[gargantext.git] / src / Gargantext / Ngrams / CoreNLP.hs
1 {-# LANGUAGE DataKinds #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE TypeOperators #-}
4 {-# LANGUAGE TemplateHaskell #-}
5
6 module Gargantext.Ngrams.CoreNLP where
7
8 import Data.Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import GHC.Generics
11 import Data.Monoid ((<>))
12
13 import Gargantext.Types.Main (Language(..))
14 import Gargantext.Prelude
15 import Gargantext.Utils.Prefix (unPrefix)
16 import Data.Text (Text)
17
18 import qualified Data.ByteString.Char8 as S8
19 import qualified Data.Yaml as Yaml
20 import Network.HTTP.Simple
21
22
-- | One token of the CoreNLP response: the surface word, its lemma,
-- its character offsets in the input, and its POS / NER tags
-- (produced by the @pos@ and @ner@ annotators requested in 'corenlp').
-- Field order matters: 'token2text' pattern-matches positionally.
data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
-- JSON keys are the field names without the "_token" prefix.
$(deriveJSON (unPrefix "_token") ''Token)
35
-- | Project a token onto its surface word, POS tag and NER tag.
token2text :: Token -> (Text, Text, Text)
token2text tok = (_tokenWord tok, _tokenPos tok, _tokenNer tok)
38
39
-- | One sentence of the CoreNLP response: its index in the document
-- and its annotated tokens.
data Sentence = Sentence { _sentenceIndex  :: Int
                         , _sentenceTokens :: [Token]
                         } deriving (Show, Generic)

-- JSON keys are the field names without the "_sentence" prefix.
$(deriveJSON (unPrefix "_sentence") ''Sentence)
45
-- | Request properties for the CoreNLP server ("annotators" and
-- "outputFormat" JSON keys).
-- NOTE(review): currently unused — 'corenlp' builds its properties
-- string by hand; consider serializing this type instead.
data Properties = Properties { _propertiesAnnotators   :: Text
                             , _propertiesOutputFormat :: Text
                             } deriving (Show, Generic)

-- JSON keys are the field names without the "_properties" prefix.
$(deriveJSON (unPrefix "_properties") ''Properties)
51
-- | Top-level CoreNLP response: a list of annotated sentences.
-- The field name matches the JSON key exactly, so the generic
-- (derived-by-default) instances are sufficient here.
data Sentences = Sentences { sentences :: [Sentence] }
  deriving (Show, Generic)

instance ToJSON   Sentences
instance FromJSON Sentences
56
57
58 -- request =
59 -- "fr" : {
60 -- "tokenize.language" : "fr",
61 -- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
62 -- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
63 -- // dependency parser
64 -- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
65 -- "depparse.language" : "french",
66 -- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
67 -- "ssplit.newlineIsSentenceBreak": "always"
68 -- },
69 --
70
71
-- | Debugging helper: annotate an (English) text via 'corenlp' and
-- pretty-print the response as YAML on stdout.
--
-- Previously this duplicated the whole EN request pipeline of
-- 'corenlp' (same properties string, same URL, same decoding);
-- it now delegates so the request logic lives in one place.
corenlpPretty :: String -> IO ()
corenlpPretty txt = do
  annotated <- corenlp EN txt
  S8.putStrLn $ Yaml.encode annotated
82
-- | Annotate a text with a CoreNLP server expected on
-- @localhost:9000@: tokenization, sentence splitting, POS tagging
-- and NER. The properties string selects language-specific models.
corenlp :: Language -> String -> IO Sentences
corenlp lang txt = do
  url      <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  response <- httpJSON (setRequestBodyJSON txt url)
  pure (getResponseBody response :: Sentences)
  where
    -- Raw JSON passed in the query string; CoreNLP parses it server-side.
    properties = case lang of
      EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
      -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
      FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
93
-- | Annotate a text and project every token onto a pair of its
-- surface word and one chosen attribute.
--
-- Part Of Speech example:
-- > tokenWith _tokenPos EN "Hello world."
-- > == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named Entity Recognition example:
-- > tokenWith _tokenNer EN "Hello world of Peter."
-- > == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> String -> IO [[(Text, t)]]
tokenWith f lang s = do
  result <- corenlp lang s
  pure (pm (pm annotate) (pm _sentenceTokens (sentences result)))
  where
    -- Pair each token's word with the requested attribute.
    annotate t = (_tokenWord t, f t)
104
105
106