src/Gargantext/Text/CoreNLP.hs

   1 {-|
   2 Module      : Gargantext.Text.CoreNLP
   3 Description : CoreNLP module
   4 Copyright   : (c) CNRS, 2017
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 -}
  11
  12
  13 {-# LANGUAGE DataKinds         #-}
  14 {-# LANGUAGE DeriveGeneric     #-}
  15 {-# LANGUAGE NoImplicitPrelude #-}
  16 {-# LANGUAGE TemplateHaskell   #-}
  17 {-# LANGUAGE TypeOperators     #-}
  18
  19 module Gargantext.Text.CoreNLP where
  20
  21 import Data.Aeson.TH (deriveJSON)
  22 import GHC.Generics
  23 import Data.Monoid ((<>))
  24 import GHC.Show (Show(..))
  25
  26 import Gargantext.Types.Main (Language(..))
  27 import Gargantext.Prelude
  28 import Gargantext.Utils.Prefix (unPrefix)
  29 import Data.Text (Text)
  30
  31 import Network.HTTP.Simple
  32
  33
  34 data Token = Token { _tokenIndex                :: Int
  35                    , _tokenWord                 :: Text
  36                    , _tokenOriginalText         :: Text
  37                    , _tokenLemma                :: Text
  38                    , _tokenCharacterOffsetBegin :: Int
  39                    , _tokenCharacterOffsetEnd   :: Int
  40                    , _tokenPos                  :: Text
  41                    , _tokenNer                  :: Text
  42                    , _tokenBefore               :: Maybe Text
  43                    , _tokenAfter                :: Maybe Text
  44                    } deriving (Show, Generic)
  45 $(deriveJSON (unPrefix "_token") ''Token)
  46
  47 token2text :: Token -> (Text, Text, Text)
  48 token2text (Token _ w _ _ _ _ p n _ _) = (w,p,n)
  49
  50
  51 data Sentence  = Sentence { _sentenceIndex :: Int
  52                           , _sentenceTokens :: [Token]
  53                           } deriving (Show, Generic)
  54
  55 $(deriveJSON (unPrefix "_sentence") ''Sentence)
  56
  57 data Properties = Properties { _propertiesAnnotators  :: Text
  58                              , _propertiesOutputFormat :: Text
  59                              } deriving (Show, Generic)
  60
  61 $(deriveJSON (unPrefix "_properties") ''Properties)
  62
  63 data Sentences = Sentences { _sentences :: [Sentence]}
  64   deriving (Show, Generic)
  65
  66 $(deriveJSON (unPrefix "_") ''Sentences)
  67
  68
  69 -- request =
  70 -- "fr" : {
  71 --                 "tokenize.language" : "fr",
  72 --                 "pos.model" : "edu/stanford/nlp/models/pos-tagger/french/french.tagger",
  73 --                 "parse.model" : "edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz",
  74 --                 // dependency parser
  75 --                 "depparse.model" : "edu/stanford/nlp/models/parser/nndep/UD_French.gz",
  76 --                 "depparse.language" : "french",
  77 --                 "ner.model":  DATA_ROOT+"/eunews.fr.crf.gz",
  78 --                 "ssplit.newlineIsSentenceBreak": "always"
  79 --             },
  80 --
  81
  82
  83 corenlp :: Language -> Text -> IO Sentences
  84 corenlp lang txt = do
  85     let properties = case lang of
  86             EN -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  87             -- FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"outputFormat\": \"json\"}"
  88             FR -> "{\"annotators\": \"tokenize,ssplit,pos,ner\", \"parse.model\":\"edu/stanford/nlp/models/lexparser/frenchFactored.ser.gz\", \"pos.model\":\"edu/stanford/nlp/models/pos-tagger/french/french.tagger\", \"tokenize.language\":\"fr\", \"outputFormat\": \"json\"}"
  89     url <- parseRequest $ "POST http://localhost:9000/?properties=" <> properties
  90     let request = setRequestBodyLBS (cs txt) url
  91     response <- httpJSON request
  92     pure (getResponseBody response :: Sentences)
  93
  94 -- | parseWith
  95 -- Part Of Speech example
  96 -- parseWith  _tokenPos    "Hello world."
  97 -- == [[("``","``"),("Hello","UH"),("world","NN"),(".","."),("''","''")]]
  98
  99 -- Named Entity Recognition example
 100 -- parseWith  _tokenNer     "Hello world of Peter."
 101 -- [[("``","O"),("Hello","O"),("world","O"),("of","O"),("Peter","PERSON"),(".","O"),("''","O")]]
 102 tokenWith :: (Token -> t) -> Language -> Text -> IO [[(Text, t)]]
 103 tokenWith f lang s = map (map (\t -> (_tokenWord t, f t))) <$> map _sentenceTokens <$> _sentences <$> corenlp lang s
 104
 105
 106