{-|
Module      : Gargantext.Ngrams.CoreNLP
Description : CoreNLP module
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}


{-# LANGUAGE DataKinds         #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeOperators     #-}

module Gargantext.Ngrams.CoreNLP where

import Data.Aeson.TH (deriveJSON)
import GHC.Generics
import Data.Monoid ((<>))
import GHC.Show (Show(..))

import Gargantext.Types.Main (Language(..))
import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text)

import qualified Data.ByteString.Char8 as S8
import qualified Data.Yaml as Yaml
import Network.HTTP.Simple


-- | One token of the CoreNLP JSON output: its position, surface form,
-- lemma, character offsets, part-of-speech tag and named-entity label.
data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)

-- | Keep only the (word, part-of-speech, named-entity) triple of a Token.
token2text :: Token -> (Text, Text, Text)
token2text (Token _ w _ _ _ _ p n _ _) = (w, p, n)
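-- Illustrative example (a hypothetical Token value, shown only for the
-- shape of the extraction):
--
-- token2text (Token 1 "world" "world" "world" 6 11 "NN" "O" Nothing Nothing)
-- == ("world","NN","O")
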

-- | A sentence: its index in the document and its tokens.
data Sentence = Sentence { _sentenceIndex  :: Int
                         , _sentenceTokens :: [Token]
                         } deriving (Show, Generic)

$(deriveJSON (unPrefix "_sentence") ''Sentence)

-- | The request properties sent to the CoreNLP server: which annotators
-- to run and which output format to return.
data Properties = Properties { _propertiesAnnotators   :: Text
                             , _propertiesOutputFormat :: Text
                             } deriving (Show, Generic)

$(deriveJSON (unPrefix "_properties") ''Properties)
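
-- A possible refinement (sketch only, not wired into the requests below):
-- render a Properties value to the query string with Aeson instead of the
-- hand-written JSON literals used in corenlp.  Assumes two extra imports,
-- Data.Aeson (encode) and Data.ByteString.Lazy.Char8 (unpack):
--
-- propertiesToQuery :: Properties -> String
-- propertiesToQuery = unpack . encode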

-- | The top-level CoreNLP response: a list of annotated sentences.
data Sentences = Sentences { _sentences :: [Sentence] }
  deriving (Show, Generic)

$(deriveJSON (unPrefix "_") ''Sentences)
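
-- Illustrative fragment of the JSON these instances decode (assuming
-- unPrefix strips the record prefix, so the keys are "index", "word", ...;
-- the values here are examples, not real output):
--
-- { "sentences": [
--     { "index": 0
--     , "tokens": [
--         { "index": 1, "word": "Hello", "originalText": "Hello"
--         , "lemma": "hello", "characterOffsetBegin": 0
--         , "characterOffsetEnd": 5, "pos": "UH", "ner": "O" }
--       ]
--     }
--   ]
-- }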


-- Example server-side properties for a French pipeline:
--
-- "fr" : {
--   "tokenize.language" : "fr",
--   "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
--   "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
--   // dependency parser
--   "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
--   "depparse.language" : "french",
--   "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
--   "ssplit.newlineIsSentenceBreak": "always"
-- },

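-- For comparison, the English configuration used by corenlp below,
-- expressed with the Properties type defined above (sketch; assumes
-- OverloadedStrings for the Text literals):
--
-- enProperties :: Properties
-- enProperties = Properties "tokenize,ssplit,pos,ner" "json"
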

-- | Annotate a text with the English pipeline and pretty-print the
-- server response as YAML on stdout.
corenlpPretty :: Text -> IO ()
corenlpPretty txt = do
  url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  let request = setRequestBodyJSON txt url
  response <- httpJSON request

  -- putStrLn $ "The status code was: " ++
  --            show (getResponseStatusCode response)
  -- print $ getResponseHeader "Content-Type" response
  S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)
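
-- Usage (assumes a CoreNLP server listening on http://localhost:9000):
--
-- > corenlpPretty "Hello world."
-- prints the YAML rendering of the decoded Sentences.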

-- | Annotate a text with tokenization, sentence splitting, POS tagging
-- and NER, selecting the language-specific models for French.
corenlp :: Language -> Text -> IO Sentences
corenlp lang txt = do
  let properties = case lang of
        EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  let request = setRequestBodyJSON txt url
  response <- httpJSON request
  pure (getResponseBody response :: Sentences)
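
-- Usage (again assuming a local CoreNLP server):
--
-- > corenlp EN "Hello world of Peter."
-- returns a Sentences value with one sentence and its tokens.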

-- | tokenWith: annotate a text and pair every word with the Token field
-- selected by f.
--
-- Part-of-speech example:
-- tokenWith _tokenPos EN "Hello world."
-- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named-entity recognition example:
-- tokenWith _tokenNer EN "Hello world of Peter."
-- == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)) . _sentenceTokens) . _sentences
                 <$> corenlp lang s
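
-- Convenience specialisations (a sketch added here, not part of the
-- original API): the two extractions shown in the examples above.
posTags :: Language -> Text -> IO [[(Text, Text)]]
posTags = tokenWith _tokenPos

nerTags :: Language -> Text -> IO [[(Text, Text)]]
nerTags = tokenWith _tokenNer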