src/Gargantext/Ngrams/CoreNLP.hs

   1 {-# LANGUAGE DataKinds #-}
   2 {-# LANGUAGE DeriveGeneric #-}
   3 {-# LANGUAGE TypeOperators #-}
   4 {-# LANGUAGE TemplateHaskell #-}
   5 {-# LANGUAGE     NoImplicitPrelude       #-}
   6
   7 module Gargantext.Ngrams.CoreNLP where
   8
   9 import Data.Aeson
  10 import Data.Aeson.TH (deriveJSON)
  11 import GHC.Generics
  12 import Data.Monoid ((<>))
  13 import GHC.Show (Show(..))
  14
  15 import Gargantext.Types.Main (Language(..))
  16 import Gargantext.Prelude
  17 import Gargantext.Utils.Prefix (unPrefix)
  18 import Data.Text (Text)
  19
  20 import qualified Data.ByteString.Char8 as S8
  21 import qualified Data.Yaml             as Yaml
  22 import           Network.HTTP.Simple
  23
  24
  25 data Token = Token { _tokenIndex                :: Int
  26                    , _tokenWord                 :: Text
  27                    , _tokenOriginalText         :: Text
  28                    , _tokenLemma                :: Text
  29                    , _tokenCharacterOffsetBegin :: Int
  30                    , _tokenCharacterOffsetEnd   :: Int
  31                    , _tokenPos                  :: Text
  32                    , _tokenNer                  :: Text
  33                    , _tokenBefore               :: Maybe Text
  34                    , _tokenAfter                :: Maybe Text
  35                    } deriving (Show, Generic)
  36 $(deriveJSON (unPrefix "_token") ''Token)
  37
  38 token2text :: Token -> (Text, Text, Text)
  39 token2text (Token _ w _ _ _ _ p n _ _) = (w,p,n)
  40
  41
  42 data Sentence  = Sentence { _sentenceIndex :: Int
  43                           , _sentenceTokens :: [Token]
  44                           } deriving (Show, Generic)
  45
  46 $(deriveJSON (unPrefix "_sentence") ''Sentence)
  47
  48 data Properties = Properties { _propertiesAnnotators  :: Text
  49                              , _propertiesOutputFormat :: Text
  50                              } deriving (Show, Generic)
  51
  52 $(deriveJSON (unPrefix "_properties") ''Properties)
  53
  54 data Sentences = Sentences { sentences :: [Sentence]}
  55   deriving (Show, Generic)
  56 instance ToJSON Sentences
  57 instance FromJSON Sentences
  58
  59
  60 -- request =
  61 -- "fr" : {
  62 --                 "tokenize.language" : "fr",
  63 --                 "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
  64 --                 "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
  65 --                 // dependency parser
  66 --                 "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
  67 --                 "depparse.language" : "french",
  68 --                 "ner.model":  DATA_ROOT+"/eunews.fr.crf.gz",
  69 --                 "ssplit.newlineIsSentenceBreak": "always"
  70 --             },
  71 --
  72
  73
  74 corenlpPretty :: Text -> IO ()
  75 corenlpPretty txt = do
  76     url <- parseRequest "POST http://localhost:9000/?properties={\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  77     let request = setRequestBodyJSON txt url
  78     response <- httpJSON request
  79
  80 --    putStrLn $ "The status code was: " ++
  81 --               show (getResponseStatusCode response)
  82 --    print $ getResponseHeader "Content-Type" response
  83     S8.putStrLn $ Yaml.encode (getResponseBody response :: Sentences)
  84
  85 corenlp :: Language -> Text -> IO Sentences
  86 corenlp lang txt = do
  87     let properties = case lang of
  88             EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  89             -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  90             FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  91     url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  92     let request = setRequestBodyJSON txt url
  93     response <- httpJSON request
  94     pure (getResponseBody response :: Sentences)
  95
  96 -- | parseWith
  97 -- Part Of Speech example
  98 -- parseWith  _tokenPos    "Hello world."
  99 -- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
 100
 101 -- Named Entity Recognition example
 102 -- parseWith  _tokenNer     "Hello world of Peter."
 103 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
 104 tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
 105 tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> sentences <$> corenlp lang s
 106
 107
 108