{-|
Module      : Gargantext.Ngrams.CoreNLP
Description : CoreNLP module
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}


{-# LANGUAGE DataKinds         #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeOperators     #-}

module Gargantext.Ngrams.CoreNLP where

import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import GHC.Generics
import Data.Monoid ((<>))
import GHC.Show (Show(..))

import Gargantext.Types.Main (Language(..))
import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text)

import qualified Data.ByteString.Char8 as S8
import qualified Data.Yaml as Yaml
import Network.HTTP.Simple


data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
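-- The instances above drop the "_token" prefix, so a Token is decoded from
-- a CoreNLP token object of the following shape (illustrative excerpt, not
-- captured from a live server):
--
--   { "index": 1, "word": "Hello", "originalText": "Hello", "lemma": "hello"
--   , "characterOffsetBegin": 0, "characterOffsetEnd": 5
--   , "pos": "UH", "ner": "O" }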

-- | Keep only a Token's word, part-of-speech tag and named-entity tag.
token2text :: Token -> (Text, Text, Text)
token2text (Token _ w _ _ _ _ p n _ _) = (w, p, n)


data Sentence = Sentence { _sentenceIndex  :: Int
                         , _sentenceTokens :: [Token]
                         } deriving (Show, Generic)

$(deriveJSON (unPrefix "_sentence") ''Sentence)

data Properties = Properties { _propertiesAnnotators   :: Text
                             , _propertiesOutputFormat :: Text
                             } deriving (Show, Generic)

$(deriveJSON (unPrefix "_properties") ''Properties)

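-- A minimal sketch, not used by this module: a 'Properties' value can be
-- rendered to the JSON fragment that 'corenlp' below writes by hand, with
-- aeson's 'encode' ('LC8' is an assumed qualified import of
-- Data.ByteString.Lazy.Char8; the Text literals assume OverloadedStrings):
--
--   propertiesString :: Properties -> String
--   propertiesString = LC8.unpack . encode
--
--   propertiesString (Properties "tokenize,ssplit,pos,ner" "json")
--   -- ==> a JSON object with the "annotators" and "outputFormat" keys
--
-- Note that 'Properties' only covers these two keys; the French-specific
-- model settings below would need extra fields.
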
data Sentences = Sentences { _sentences :: [Sentence] }
  deriving (Show, Generic)

-- Generic To/FromJSON instances would keep the literal field name
-- "_sentences" as the JSON key, but CoreNLP answers with "sentences",
-- so the prefix is stripped here as for the other types.
$(deriveJSON (unPrefix "_") ''Sentences)


-- request =
--   "fr" : {
--     "tokenize.language" : "fr",
--     "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
--     "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
--     // dependency parser
--     "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
--     "depparse.language" : "french",
--     "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
--     "ssplit.newlineIsSentenceBreak": "always"
--   },
--


-- | Query the CoreNLP server and pretty-print its answer as YAML
-- (assumes a server listening on localhost:9000, as in the URL below).
corenlpPretty :: Text -> IO ()
corenlpPretty txt = do
  url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  let request = setRequestBodyJSON txt url
  response <- httpJSON request

  -- putStrLn $ "The status code was: " ++
  --            show (getResponseStatusCode response)
  -- print $ getResponseHeader "Content-Type" response
  S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)

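-- For instance (illustrative, assuming a local server and OverloadedStrings
-- for the Text literal):
--
--   corenlpPretty "Hello world."
--
-- prints the YAML rendering of the parsed 'Sentences'.
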
-- | Query the CoreNLP server with language-specific properties and decode
-- the JSON answer into 'Sentences'.
corenlp :: Language -> Text -> IO Sentences
corenlp lang txt = do
  let properties = case lang of
        EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  -- setRequestBodyJSON sends the text JSON-encoded, i.e. wrapped in double
  -- quotes; the tokenizer turns those into the `` and '' tokens visible in
  -- the examples below.
  let request = setRequestBodyJSON txt url
  response <- httpJSON request
  pure (getResponseBody response :: Sentences)

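-- Illustrative GHCi session (same assumptions as above); 'corenlp' returns
-- the whole parsed structure:
--
--   >>> _sentences <$> corenlp EN "Hello world."
--   [Sentence {_sentenceIndex = 0, _sentenceTokens = [ ... ]}]
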
-- | tokenWith
-- Part Of Speech example
-- tokenWith _tokenPos EN "Hello world."
-- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named Entity Recognition example
-- tokenWith _tokenNer EN "Hello world of Peter."
-- == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
                   . map _sentenceTokens
                   . _sentences
                 <$> corenlp lang s

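-- A small helper derived from tokenWith, given only as an illustration
-- ('namedEntities' is not part of the original module; OverloadedStrings
-- assumed for the "O" literal): keep the tokens CoreNLP tagged as named
-- entities, i.e. those whose NER tag is not "O".
--
--   namedEntities :: Language -> Text -> IO [[(Text, Text)]]
--   namedEntities lang s = map (filter ((/= "O") . snd)) <$> tokenWith _tokenNer lang s
--
--   namedEntities EN "Hello world of Peter."
--   -- == [[("Peter","PERSON")]]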