{-|
Module      : Gargantext.Ngrams.CoreNLP
Description : CoreNLP module
Copyright   : (c) CNRS, 2017
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

-}


{-# LANGUAGE DataKinds         #-}
{-# LANGUAGE DeriveGeneric     #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeOperators     #-}

module Gargantext.Ngrams.CoreNLP where

import Data.Aeson.TH (deriveJSON)
import GHC.Generics
import Data.Monoid ((<>))
import GHC.Show (Show(..))

import Gargantext.Types.Main (Language(..))
import Gargantext.Prelude
import Gargantext.Utils.Prefix (unPrefix)
import Data.Text (Text)

import qualified Data.ByteString.Char8 as S8
import qualified Data.Yaml as Yaml
import Network.HTTP.Simple


-- | One token of the CoreNLP JSON output: its position, surface form,
-- lemma, character offsets, part-of-speech tag and named-entity label.
data Token = Token { _tokenIndex                :: Int
                   , _tokenWord                 :: Text
                   , _tokenOriginalText         :: Text
                   , _tokenLemma                :: Text
                   , _tokenCharacterOffsetBegin :: Int
                   , _tokenCharacterOffsetEnd   :: Int
                   , _tokenPos                  :: Text
                   , _tokenNer                  :: Text
                   , _tokenBefore               :: Maybe Text
                   , _tokenAfter                :: Maybe Text
                   } deriving (Show, Generic)
$(deriveJSON (unPrefix "_token") ''Token)

-- | Keep only the (word, part-of-speech, named-entity) triple of a Token.
token2text :: Token -> (Text, Text, Text)
token2text (Token _ w _ _ _ _ p n _ _) = (w, p, n)
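-- Illustrative example (a hypothetical Token value, shown only for the
-- shape of the extraction):
--
-- token2text (Token 1 "world" "world" "world" 6 11 "NN" "O" Nothing Nothing)
-- == ("world","NN","O")
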

-- | A sentence: its index in the document and its tokens.
data Sentence = Sentence { _sentenceIndex  :: Int
                         , _sentenceTokens :: [Token]
                         } deriving (Show, Generic)

$(deriveJSON (unPrefix "_sentence") ''Sentence)

-- | The request properties sent to the CoreNLP server: which annotators
-- to run and which output format to return.
data Properties = Properties { _propertiesAnnotators   :: Text
                             , _propertiesOutputFormat :: Text
                             } deriving (Show, Generic)

$(deriveJSON (unPrefix "_properties") ''Properties)
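
-- A possible refinement (sketch only, not wired into the requests below):
-- render a Properties value to the query string with Aeson instead of the
-- hand-written JSON literals used in corenlp.  Assumes two extra imports,
-- Data.Aeson (encode) and Data.ByteString.Lazy.Char8 (unpack):
--
-- propertiesToQuery :: Properties -> String
-- propertiesToQuery = unpack . encode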

-- | The top-level CoreNLP response: a list of annotated sentences.
data Sentences = Sentences { _sentences :: [Sentence] }
  deriving (Show, Generic)

$(deriveJSON (unPrefix "_") ''Sentences)
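
-- Illustrative fragment of the JSON these instances decode (assuming
-- unPrefix strips the record prefix, so the keys are "index", "word", ...;
-- the values here are examples, not real output):
--
-- { "sentences": [
--     { "index": 0
--     , "tokens": [
--         { "index": 1, "word": "Hello", "originalText": "Hello"
--         , "lemma": "hello", "characterOffsetBegin": 0
--         , "characterOffsetEnd": 5, "pos": "UH", "ner": "O" }
--       ]
--     }
--   ]
-- }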


-- Example server-side properties for a French pipeline:
--
-- "fr" : {
--   "tokenize.language" : "fr",
--   "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
--   "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
--   // dependency parser
--   "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
--   "depparse.language" : "french",
--   "ner.model": DATA_ROOT+"/eunews.fr.crf.gz",
--   "ssplit.newlineIsSentenceBreak": "always"
-- },

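-- For comparison, the English configuration used by corenlp below,
-- expressed with the Properties type defined above (sketch; assumes
-- OverloadedStrings for the Text literals):
--
-- enProperties :: Properties
-- enProperties = Properties "tokenize,ssplit,pos,ner" "json"
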

-- | Annotate a text with the English pipeline and pretty-print the
-- server response as YAML on stdout.
corenlpPretty :: Text -> IO ()
corenlpPretty txt = do
  url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  let request = setRequestBodyJSON txt url
  response <- httpJSON request

  -- putStrLn $ "The status code was: " ++
  --            show (getResponseStatusCode response)
  -- print $ getResponseHeader "Content-Type" response
  S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)
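
-- Usage (assumes a CoreNLP server listening on http://localhost:9000):
--
-- > corenlpPretty "Hello world."
-- prints the YAML rendering of the decoded Sentences.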

-- | Annotate a text with tokenization, sentence splitting, POS tagging
-- and NER, selecting the language-specific models for French.
corenlp :: Language -> Text -> IO Sentences
corenlp lang txt = do
  let properties = case lang of
        EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
        FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  let request = setRequestBodyJSON txt url
  response <- httpJSON request
  pure (getResponseBody response :: Sentences)
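
-- Usage (again assuming a local CoreNLP server):
--
-- > corenlp EN "Hello world of Peter."
-- returns a Sentences value with one sentence and its tokens.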

-- | tokenWith: annotate a text and pair every word with the Token field
-- selected by f.
--
-- Part-of-speech example:
-- tokenWith _tokenPos EN "Hello world."
-- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
--
-- Named-entity recognition example:
-- tokenWith _tokenNer EN "Hello world of Peter."
-- == [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
tokenWith f lang s = map (map (\t -> (_tokenWord t, f t)) . _sentenceTokens) . _sentences
                 <$> corenlp lang s
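
-- Convenience specialisations (a sketch added here, not part of the
-- original API): the two extractions shown in the examples above.
posTags :: Language -> Text -> IO [[(Text, Text)]]
posTags = tokenWith _tokenPos

nerTags :: Language -> Text -> IO [[(Text, Text)]]
nerTags = tokenWith _tokenNer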