]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/CoreNLP.hs
[Structure] Ngrams -> Text.
[gargantext.git] / src / Gargantext / Text / CoreNLP.hs
1 {-|
2 Module : Gargantext.Text.CoreNLP
3 Description : CoreNLP module
4 Copyright : (c) CNRS, 2017
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12
13 {-# LANGUAGE DataKinds #-}
14 {-# LANGUAGE DeriveGeneric #-}
15 {-# LANGUAGE NoImplicitPrelude #-}
16 {-# LANGUAGE TemplateHaskell #-}
17 {-# LANGUAGE TypeOperators #-}
18
19 module Gargantext.Text.CoreNLP where
20
21 import Data.Aeson.TH (deriveJSON)
22 import GHC.Generics
23 import Data.Monoid ((<>))
24 import GHC.Show (Show(..))
25
26 import Gargantext.Types.Main (Language(..))
27 import Gargantext.Prelude
28 import Gargantext.Utils.Prefix (unPrefix)
29 import Data.Text (Text)
30
31 import Network.HTTP.Simple
32
33
34 data Token = Token { _tokenIndex :: Int
35 , _tokenWord :: Text
36 , _tokenOriginalText :: Text
37 , _tokenLemma :: Text
38 , _tokenCharacterOffsetBegin :: Int
39 , _tokenCharacterOffsetEnd :: Int
40 , _tokenPos :: Text
41 , _tokenNer :: Text
42 , _tokenBefore :: Maybe Text
43 , _tokenAfter :: Maybe Text
44 } deriving (Show, Generic)
45 $(deriveJSON (unPrefix "_token") ''Token)
46
47 token2text :: Token -> (Text, Text, Text)
48 token2text (Token _ w _ _ _ _ p n _ _) = (w,p,n)
49
50
51 data Sentence = Sentence { _sentenceIndex :: Int
52 , _sentenceTokens :: [Token]
53 } deriving (Show, Generic)
54
55 $(deriveJSON (unPrefix "_sentence") ''Sentence)
56
57 data Properties = Properties { _propertiesAnnotators :: Text
58 , _propertiesOutputFormat :: Text
59 } deriving (Show, Generic)
60
61 $(deriveJSON (unPrefix "_properties") ''Properties)
62
63 data Sentences = Sentences { _sentences :: [Sentence]}
64 deriving (Show, Generic)
65
66 $(deriveJSON (unPrefix "_") ''Sentences)
67
68
69 -- request =
70 -- "fr" : {
71 -- "tokenize.language" : "fr",
72 -- "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
73 -- "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
74 -- // dependency parser
75 -- "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
76 -- "depparse.language" : "french",
77 -- "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
78 -- "ssplit.newlineIsSentenceBreak": "always"
79 -- },
80 --
81
82
83 corenlp :: Language -> Text -> IO Sentences
84 corenlp lang txt = do
85 let properties = case lang of
86 EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
87 -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
88 FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
89 url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
90 let request = setRequestBodyLBS (cs txt) url
91 response <- httpJSON request
92 pure (getResponseBody response :: Sentences)
93
94 -- | parseWith
95 -- Part Of Speech example
96 -- parseWith _tokenPos "Hello world."
97 -- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
98
99 -- Named Entity Recognition example
100 -- parseWith _tokenNer "Hello world of Peter."
101 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
102 tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
103 tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> _sentences <$> corenlp lang s
104
105
106