-- gargantext: src/Data/Gargantext/NLP/CoreNLP.hs
-- [FEAT] Ngrams extractor in English with tests: ok.
-- Needs factoring of the pattern matching.
1 {-# LANGUAGE DataKinds #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE TypeOperators #-}
4 {-# LANGUAGE TemplateHaskell #-}
5
6 module Data.Gargantext.NLP.CoreNLP where
7
8 import Data.Aeson
9 import Data.Aeson.TH (deriveJSON)
10 import GHC.Generics
11
12 import Data.Gargantext.Prelude
13 import Data.Gargantext.Utils.Prefix (unPrefix)
14 import Data.Text (Text)
15
16 import qualified Data.ByteString.Char8 as S8
17 import qualified Data.Yaml as Yaml
18 import Network.HTTP.Simple
19
20
-- | One token as annotated by CoreNLP: the surface word, its lemma,
--   character offsets, part-of-speech tag and named-entity label.
--   Field names mirror the CoreNLP JSON keys, prefixed with @_token@;
--   'unPrefix' strips the prefix again for the derived JSON instances.
data Token = Token
  { _tokenIndex                :: Int
  , _tokenWord                 :: Text
  , _tokenOriginalText         :: Text
  , _tokenLemma                :: Text
  , _tokenCharacterOffsetBegin :: Int
  , _tokenCharacterOffsetEnd   :: Int
  , _tokenPos                  :: Text        -- part-of-speech tag
  , _tokenNer                  :: Text        -- named-entity label
  , _tokenBefore               :: Maybe Text
  , _tokenAfter                :: Maybe Text
  } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
33
-- | Keep only the word, its part-of-speech tag and its named-entity
--   label, dropping all positional and offset information.
token2text :: Token -> (Text, Text, Text)
token2text tok = (_tokenWord tok, _tokenPos tok, _tokenNer tok)
36
37
-- | One sentence of the annotated document: its position in the
--   document and the tokens it contains.
data Sentence = Sentence
  { _sentenceIndex  :: Int
  , _sentenceTokens :: [Token]
  } deriving (Show, Generic)

$(deriveJSON (unPrefix "_sentence") ''Sentence)
43
-- | The @properties@ object sent to the CoreNLP server: which
--   annotators to run and in which format to answer.
data Properties = Properties
  { _propertiesAnnotators   :: Text
  , _propertiesOutputFormat :: Text
  } deriving (Show, Generic)

$(deriveJSON (unPrefix "_properties") ''Properties)
49
-- | Top-level CoreNLP response: the list of annotated sentences.
--   Plain 'Generic' instances suffice here because the field name
--   matches the JSON key exactly (no prefix to strip).
data Sentences = Sentences
  { sentences :: [Sentence]
  } deriving (Show, Generic)

instance ToJSON Sentences
instance FromJSON Sentences
54
55
-- Example CoreNLP server configuration for French, kept for reference
-- (the requests below use the server's default English models):
-- "fr" : {
--     "tokenize.language" : "fr",
--     "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
--     "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
--     // dependency parser
--     "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
--     "depparse.language" : "french",
--     "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
--     "ssplit.newlineIsSentenceBreak": "always"
-- },
68
69
-- | Annotate a text with CoreNLP and pretty-print the result to stdout
--   as YAML.
--
--   Delegates the request to 'corenlp' instead of duplicating the
--   endpoint URL and request construction, so the server address and
--   annotator list live in a single place.
corenlpPretty :: String -> IO ()
corenlpPretty txt = do
  sents <- corenlp txt
  S8.putStrLn $ Yaml.encode sents
80
-- | Send a text to the CoreNLP server expected on
--   <http://localhost:9000> and decode its JSON answer into
--   'Sentences'.  Annotators requested: tokenize, ssplit, pos, ner.
corenlp :: String -> IO Sentences
corenlp txt = do
  endpoint <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  response <- httpJSON (setRequestBodyJSON txt endpoint)
  pure (getResponseBody response :: Sentences)
87
-- | Annotate a text with 'corenlp' and pair every token's surface word
--   with the field selected by @f@, one list per sentence.
--
-- Part-of-speech example:
--
-- > tokenWith _tokenPos "Hello world."
-- > == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named-entity-recognition example:
--
-- > tokenWith _tokenNer "Hello world of Peter."
-- > == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> String -> IO [[(Text, t)]]
tokenWith f s = extract <$> corenlp s
  where
    extract    = pm (pm annotate) . pm _sentenceTokens . sentences
    annotate t = (_tokenWord t, f t)
98
99
100