{-|
Module      : Gargantext.Ngrams.CoreNLP
Description : CoreNLP module
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}


{-# LANGUAGE DataKinds         #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeOperators     #-}

module Gargantext.Ngrams.CoreNLP where

import Data.Aeson
import Data.Aeson.TH (deriveJSON)
import GHC.Generics
import Data.Monoid ((<>))
import GHC.Show (Show(..))

import Gargantext.Types.Main (Language(..))
import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text)

import qualified Data.ByteString.Char8 as S8
import qualified Data.Yaml as Yaml
import Network.HTTP.Simple


data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)
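-- The instances above drop the "_token" prefix, so a Token is decoded from
-- a CoreNLP token object of the following shape (illustrative excerpt, not
-- captured from a live server):
--
--   { "index": 1, "word": "Hello", "originalText": "Hello", "lemma": "hello"
--   , "characterOffsetBegin": 0, "characterOffsetEnd": 5
--   , "pos": "UH", "ner": "O" }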

-- | Keep only a Token's word, part-of-speech tag and named-entity tag.
token2text :: Token -> (Text, Text, Text)
token2text (Token _ w _ _ _ _ p n _ _) = (w, p, n)


data Sentence = Sentence { _sentenceIndex  :: Int
                         , _sentenceTokens :: [Token]
                         } deriving (Show, Generic)

$(deriveJSON (unPrefix "_sentence") ''Sentence)

data Properties = Properties { _propertiesAnnotators   :: Text
                             , _propertiesOutputFormat :: Text
                             } deriving (Show, Generic)

$(deriveJSON (unPrefix "_properties") ''Properties)

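-- A minimal sketch, not used by this module: a 'Properties' value can be
-- rendered to the JSON fragment that 'corenlp' below writes by hand, with
-- aeson's 'encode' ('LC8' is an assumed qualified import of
-- Data.ByteString.Lazy.Char8; the Text literals assume OverloadedStrings):
--
--   propertiesString :: Properties -> String
--   propertiesString = LC8.unpack . encode
--
--   propertiesString (Properties "tokenize,ssplit,pos,ner" "json")
--   -- ==> a JSON object with the "annotators" and "outputFormat" keys
--
-- Note that 'Properties' only covers these two keys; the French-specific
-- model settings below would need extra fields.
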
data Sentences = Sentences { _sentences :: [Sentence] }
  deriving (Show, Generic)

-- Generic To/FromJSON instances would keep the literal field name
-- "_sentences" as the JSON key, but CoreNLP answers with "sentences",
-- so the prefix is stripped here as for the other types.
$(deriveJSON (unPrefix "_") ''Sentences)


-- request =
--   "fr" : {
--     "tokenize.language" : "fr",
--     "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
--     "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
--     // dependency parser
--     "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
--     "depparse.language" : "french",
--     "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
--     "ssplit.newlineIsSentenceBreak": "always"
--   },
--


-- | Query the CoreNLP server and pretty-print its answer as YAML
-- (assumes a server listening on localhost:9000, as in the URL below).
corenlpPretty :: Text -> IO ()
corenlpPretty txt = do
  url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  let request = setRequestBodyJSON txt url
  response <- httpJSON request

  -- putStrLn $ "The status code was: " ++
  --            show (getResponseStatusCode response)
  -- print $ getResponseHeader "Content-Type" response
  S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)

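-- For instance (illustrative, assuming a local server and OverloadedStrings
-- for the Text literal):
--
--   corenlpPretty "Hello world."
--
-- prints the YAML rendering of the parsed 'Sentences'.
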
-- | Query the CoreNLP server with language-specific properties and decode
-- the JSON answer into 'Sentences'.
corenlp :: Language -> Text -> IO Sentences
corenlp lang txt = do
  let properties = case lang of
        EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  -- setRequestBodyJSON sends the text JSON-encoded, i.e. wrapped in double
  -- quotes; the tokenizer turns those into the `` and '' tokens visible in
  -- the examples below.
  let request = setRequestBodyJSON txt url
  response <- httpJSON request
  pure (getResponseBody response :: Sentences)

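-- Illustrative GHCi session (same assumptions as above); 'corenlp' returns
-- the whole parsed structure:
--
--   >>> _sentences <$> corenlp EN "Hello world."
--   [Sentence {_sentenceIndex = 0, _sentenceTokens = [ ... ]}]
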
-- | tokenWith
-- Part Of Speech example
-- tokenWith _tokenPos EN "Hello world."
-- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named Entity Recognition example
-- tokenWith _tokenNer EN "Hello world of Peter."
-- == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)))
                   . map _sentenceTokens
                   . _sentences
                 <$> corenlp lang s

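-- A small helper derived from tokenWith, given only as an illustration
-- ('namedEntities' is not part of the original module; OverloadedStrings
-- assumed for the "O" literal): keep the tokens CoreNLP tagged as named
-- entities, i.e. those whose NER tag is not "O".
--
--   namedEntities :: Language -> Text -> IO [[(Text, Text)]]
--   namedEntities lang s = map (filter ((/= "O") . snd)) <$> tokenWith _tokenNer lang s
--
--   namedEntities EN "Hello world of Peter."
--   -- == [[("Peter","PERSON")]]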