{-# LANGUAGE DataKinds       #-}
{-# LANGUAGE DeriveGeneric   #-}
{-# LANGUAGE TypeOperators   #-}
{-# LANGUAGE TemplateHaskell #-}

module Data.Gargantext.NLP.CoreNLP where

import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import GHC.Generics

import Data.Gargantext.Prelude
import Data.Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text)

import qualified Data.ByteString.Char8 as S8
import qualified Data.Yaml as Yaml
import Network.HTTP.Simple


data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
                   , _tokenBefore               :: Text
                   , _tokenAfter                :: Text
                   } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)

data Sentence = Sentence { _sentenceIndex  :: Int
                         , _sentenceTokens :: [Token]
                         } deriving (Show, Generic)

$(deriveJSON (unPrefix "_sentence") ''Sentence)

data Properties = Properties { _propertiesAnnotators   :: Text
                             , _propertiesOutputFormat :: Text
                             } deriving (Show, Generic)

$(deriveJSON (unPrefix "_properties") ''Properties)
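
-- For context: a 'Properties' value mirrors the @properties@ query parameter
-- that the requests below hard-code. A sketch of what the derived instance is
-- expected to produce (assuming 'unPrefix' drops the record prefix and
-- lower-cases the first letter of each field name):
--
--   encode (Properties "tokenize,ssplit,pos,ner" "json")
--   -- ~ "{\"annotators\":\"tokenize,ssplit,pos,ner\",\"outputFormat\":\"json\"}"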

data Sentences = Sentences { sentences :: [Sentence] }
  deriving (Show, Generic)
instance ToJSON Sentences
instance FromJSON Sentences
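
-- The JSON this module expects back from the CoreNLP server looks roughly
-- like the sketch below (not an exhaustive payload; field names follow the
-- records above once 'unPrefix' strips the '_token' / '_sentence' prefixes):
--
--   { "sentences":
--       [ { "index": 0
--         , "tokens":
--             [ { "index": 1, "word": "Hello", "originalText": "Hello"
--               , "lemma": "hello", "characterOffsetBegin": 0
--               , "characterOffsetEnd": 5, "pos": "UH", "ner": "O"
--               , "before": "", "after": " " }
--             , ...
--             ]
--         }
--       ]
--   }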


corenlpPretty :: String -> IO ()
corenlpPretty txt = do
    url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
    let request = setRequestBodyJSON txt url
    response <- httpJSON request

    -- putStrLn $ "The status code was: " ++
    --            show (getResponseStatusCode response)
    -- print $ getResponseHeader "Content-Type" response
    S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)
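
-- Usage sketch (assumes a CoreNLP server is listening on localhost:9000):
--
--   corenlpPretty "Hello world of Peter."
--   -- prints the parsed 'Sentences' to stdout, YAML-encoded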

corenlp :: String -> IO Sentences
corenlp txt = do
    url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
    let request = setRequestBodyJSON txt url
    response <- httpJSON request
    pure (getResponseBody response :: Sentences)
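
-- Example (sketch; same server assumption as above):
--
--   sentences <$> corenlp "Hello world. Bye."
--   -- ~ a list of two 'Sentence' values, one per ssplit segment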

-- | tokenWith
--   Part-Of-Speech example:
--   tokenWith _tokenPos "Hello world."
--   == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
--   Named-Entity Recognition example:
--   tokenWith _tokenNer "Hello world of Peter."
--   == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> String -> IO [[(Text, t)]]
tokenWith f s = pm (pm (\t -> (_tokenWord t, f t))) <$> pm _sentenceTokens <$> sentences <$> corenlp s
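
-- Other projections follow the same pattern; e.g. a hypothetical helper
-- (sketch, not part of the original API) that pairs each word with its lemma:
--
--   lemmasOf :: String -> IO [[(Text, Text)]]
--   lemmasOf = tokenWith _tokenLemma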