]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/CoreNLP.hs
Merge branch 'master' into fromRFC3339
[gargantext.git] / src / Gargantext / Ngrams / CoreNLP.hs
1 {-# LANGUAGE DataKinds #-}
2 {-# LANGUAGE DeriveGeneric #-}
3 {-# LANGUAGE TypeOperators #-}
4 {-# LANGUAGE TemplateHaskell #-}
5 {-# LANGUAGE NoImplicitPrelude #-}
6
7 module Gargantext.Ngrams.CoreNLP where
8
9 import Data.Aeson
10 import Data.Aeson.TH (deriveJSON)
11 import GHC.Generics
12 import Data.Monoid ((<>))
13 import GHC.Show (Show(..))
14
15 import Gargantext.Types.Main (Language(..))
16 import Gargantext.Prelude
17 import Gargantext.Utils.Prefix (unPrefix)
18 import Data.Text (Text)
19
20 import qualified Data.ByteString.Char8 as S8
21 import qualified Data.Yaml as Yaml
22 import Network.HTTP.Simple
23
24
25 data Token = Token { _tokenIndex :: Int
26 , _tokenWord :: Text
27 , _tokenOriginalText :: Text
28 , _tokenLemma :: Text
29 , _tokenCharacterOffsetBegin :: Int
30 , _tokenCharacterOffsetEnd :: Int
31 , _tokenPos :: Text
32 , _tokenNer :: Text
33 , _tokenBefore :: Maybe Text
34 , _tokenAfter :: Maybe Text
35 } deriving (Show, Generic)
36 $(deriveJSON (unPrefix "_token") ''Token)
37
38 token2text :: Token -> (Text, Text, Text)
39 token2text (Token _ w _ _ _ _ p n _ _) = (w,p,n)
40
41
42 data Sentence = Sentence { _sentenceIndex :: Int
43 , _sentenceTokens :: [Token]
44 } deriving (Show, Generic)
45
46 $(deriveJSON (unPrefix "_sentence") ''Sentence)
47
48 data Properties = Properties { _propertiesAnnotators :: Text
49 , _propertiesOutputFormat :: Text
50 } deriving (Show, Generic)
51
52 $(deriveJSON (unPrefix "_properties") ''Properties)
53
54 data Sentences = Sentences { sentences :: [Sentence]}
55 deriving (Show, Generic)
56 instance ToJSON Sentences
57 instance FromJSON Sentences
58
59
60 -- request =
61 -- "fr" : {
62 -- "tokenize.language" : "fr",
63 -- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
64 -- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
65 -- // dependency parser
66 -- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
67 -- "depparse.language" : "french",
68 -- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
69 -- "ssplit.newlineIsSentenceBreak": "always"
70 -- },
71 --
72
73
74 corenlpPretty :: Text -> IO ()
75 corenlpPretty txt = do
76 url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
77 let request = setRequestBodyJSON txt url
78 response <- httpJSON request
79
80 -- putStrLn $ "The status code was: " ++
81 -- show (getResponseStatusCode response)
82 -- print $ getResponseHeader "Content-Type" response
83 S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)
84
85 corenlp :: Language -> Text -> IO Sentences
86 corenlp lang txt = do
87 let properties = case lang of
88 EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
89 -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
90 FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
91 url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
92 let request = setRequestBodyJSON txt url
93 response <- httpJSON request
94 pure (getResponseBody response :: Sentences)
95
96 -- | parseWith
97 -- Part Of Speech example
98 -- parseWith _tokenPos "Hello world."
99 -- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
100
101 -- Named Entity Recognition example
102 -- parseWith _tokenNer "Hello world of Peter."
103 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
104 tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
105 tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> sentences <$> corenlp lang s
106
107
108