src/Gargantext/Core/Text/Terms/Multi.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Terms.Multi
   3 Description : Multi Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Multi-terms are ngrams where n > 1.
  11
  12 -}
  13
  14 {-# LANGUAGE OverloadedStrings #-}
  15
  16 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
  17   where
  18
  19 import Control.Applicative
  20 import Data.Attoparsec.Text                               as DAT
  21 import Data.List (concat)
  22 import Data.Text hiding (map, group, filter, concat)
  23 import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
  24 import Gargantext.Core.Text.Terms.Multi.PosTagging
  25 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
  26 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
  27 import Gargantext.Core.Types
  28 import Gargantext.Core.Utils (groupWithCounts)
  29 import Gargantext.Prelude
  30 import Replace.Attoparsec.Text                            as RAT
  31 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
  32 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
  33 import qualified Gargantext.Utils.SpacyNLP                as SpacyNLP
  34
  35 -------------------------------------------------------------------
  36 type NLP_API = Lang -> Text -> IO PosSentences
  37
  38 -------------------------------------------------------------------
  39 multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
  40 multiterms nsc l txt = do
  41   let txt' = cleanTextForNLP txt
  42   if txt' == ""
  43      then do
  44        printDebug "[G.C.T.Terms.Multi] becomes empty after cleanTextForNLP" txt
  45        pure []
  46      else do
  47        ret <- multiterms' tokenTag2terms l txt'
  48        pure $ groupWithCounts ret
  49     where
  50       multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
  51       multiterms' f lang txt' = concat
  52                        <$> map (map f)
  53                        <$> map (filter (\t -> _my_token_pos t == Just NP))
  54                        <$> tokenTags nsc lang txt'
  55
  56 -------------------------------------------------------------------
  57 tokenTag2terms :: TokenTag -> Terms
  58 tokenTag2terms (TokenTag ws t _ _) =  Terms ws t
  59
  60 tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
  61 tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
  62 tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
  63   -- printDebug "NLP Debug" txt
  64   tokenTagsWith l txt $ SpacyNLP.nlp url
  65 -- tokenTags FR txt = do
  66 --   -- printDebug "[Spacy Debug]" txt
  67 --   if txt == ""
  68 --      then pure [[]]
  69 --      else tokenTagsWith FR txt SpacyNLP.nlp
  70 tokenTags _ l  _   = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
  71
  72 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
  73 tokenTagsWith lang txt nlp = map (groupTokens lang)
  74                          <$> map tokens2tokensTags
  75                          <$> map _sentenceTokens
  76                          <$> _sentences
  77                          <$> nlp lang txt
  78
  79
  80 ---- | This function analyses and groups (or not) ngrams according to
  81 ----   specific grammars of each language.
  82 groupTokens :: Lang -> [TokenTag] -> [TokenTag]
  83 groupTokens EN = En.groupTokens
  84 groupTokens FR = Fr.groupTokens
  85 groupTokens _  = Fr.groupTokens
  86
  87 -- TODO: make tests here
  88 cleanTextForNLP :: Text -> Text
  89 cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
  90   where
  91     remove x = RAT.streamEdit x (const "")
  92
  93     unifySpaces         = RAT.streamEdit (many DAT.space) (const " ")
  94     removeDigitsWith x  = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
  95
  96     removeUrls          = removeUrlsWith "http" . removeUrlsWith "www"
  97     removeUrlsWith w    = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
  98