src/Gargantext/Core/Text/Terms/Multi.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Terms.Multi
   3 Description : Multi Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Multi-terms are ngrams where n > 1.
  11
  12 -}
  13
  14 {-# LANGUAGE OverloadedStrings #-}
  15
  16 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
  17   where
  18
  19 import Control.Applicative
  20 import Data.Attoparsec.Text                               as DAT
  21 import Data.List (concat)
  22 import Data.Text hiding (map, group, filter, concat)
  23 import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
  24 import Gargantext.Core.Text.Terms.Multi.PosTagging
  25 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
  26 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
  27 import Gargantext.Core.Types
  28 import Gargantext.Core.Utils (groupWithCounts)
  29 import Gargantext.Prelude
  30 import Replace.Attoparsec.Text                            as RAT
  31 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
  32 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
  33 import qualified Gargantext.Utils.SpacyNLP                as SpacyNLP
  34
  35 -------------------------------------------------------------------
  36 type NLP_API = Lang -> Text -> IO PosSentences
  37
  38 -------------------------------------------------------------------
  39 multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
  40 multiterms nsc l txt = do
  41   ret <- multiterms' tokenTag2terms l $ cleanTextForNLP txt
  42   pure $ groupWithCounts ret
  43   where
  44     multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
  45     multiterms' f lang txt' = concat
  46                        <$> map (map f)
  47                        <$> map (filter (\t -> _my_token_pos t == Just NP))
  48                        <$> tokenTags nsc lang txt'
  49
  50 -------------------------------------------------------------------
  51 tokenTag2terms :: TokenTag -> Terms
  52 tokenTag2terms (TokenTag ws t _ _) =  Terms ws t
  53
  54 tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
  55 tokenTags (NLPServerConfig { server = CoreNLP, url }) l txt = tokenTagsWith l txt $ corenlp url
  56 tokenTags (NLPServerConfig { server = Spacy, url }) l txt = tokenTagsWith l txt $ SpacyNLP.nlp url
  57 -- tokenTags FR txt = do
  58 --   -- printDebug "[Spacy Debug]" txt
  59 --   if txt == ""
  60 --      then pure [[]]
  61 --      else tokenTagsWith FR txt SpacyNLP.nlp
  62 tokenTags _ l  _   = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
  63
  64 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
  65 tokenTagsWith lang txt nlp = map (groupTokens lang)
  66                          <$> map tokens2tokensTags
  67                          <$> map _sentenceTokens
  68                          <$> _sentences
  69                          <$> nlp lang txt
  70
  71
  72 ---- | This function analyses and groups (or not) ngrams according to
  73 ----   specific grammars of each language.
  74 groupTokens :: Lang -> [TokenTag] -> [TokenTag]
  75 groupTokens EN = En.groupTokens
  76 groupTokens FR = Fr.groupTokens
  77 groupTokens _  = panic $ pack "groupTokens :: Lang not implemeted yet"
  78
  79 -- TODO: make tests here
  80 cleanTextForNLP :: Text -> Text
  81 cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
  82   where
  83     remove x = RAT.streamEdit x (const "")
  84
  85     unifySpaces         = RAT.streamEdit (many DAT.space) (const " ")
  86     removeDigitsWith x  = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
  87
  88     removeUrls          = removeUrlsWith "http" . removeUrlsWith "www"
  89     removeUrlsWith w    = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
  90