src/Gargantext/Core/Text/Terms/Multi.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Terms.Multi
   3 Description : Multi Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Multi-terms are ngrams where n > 1.
  11
  12 -}
  13
  14
  15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags)
  16   where
  17
  18 import Data.Text hiding (map, group, filter, concat)
  19 import Data.List (concat)
  20
  21 import Gargantext.Prelude
  22 import Gargantext.Core (Lang(..))
  23 import Gargantext.Core.Types
  24 import Gargantext.Core.Utils (groupWithCounts)
  25
  26 import Gargantext.Core.Text.Terms.Multi.PosTagging
  27 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
  28 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
  29 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
  30
  31 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
  32 -- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
  33
  34 import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
  35
  36
  37 -------------------------------------------------------------------
  38 type NLP_API = Lang -> Text -> IO PosSentences
  39
  40 -------------------------------------------------------------------
  41 multiterms :: Lang -> Text -> IO [TermsWithCount]
  42 multiterms l txt = do
  43   ret <- multiterms' tokenTag2terms l txt
  44   pure $ groupWithCounts ret
  45   where
  46     multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
  47     multiterms' f lang txt' = concat
  48                        <$> map (map f)
  49                        <$> map (filter (\t -> _my_token_pos t == Just NP))
  50                        <$> tokenTags lang txt'
  51
  52 -------------------------------------------------------------------
  53 tokenTag2terms :: TokenTag -> Terms
  54 tokenTag2terms (TokenTag ws t _ _) =  Terms ws t
  55
  56 tokenTags :: Lang -> Text -> IO [[TokenTag]]
  57 tokenTags EN txt = tokenTagsWith EN txt corenlp
  58 tokenTags FR txt = do
  59   -- printDebug "[Spacy Debug]" txt
  60   if txt == ""
  61      then pure [[]]
  62      else tokenTagsWith FR txt SpacyNLP.nlp
  63 tokenTags l  _   = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
  64
  65 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
  66 tokenTagsWith lang txt nlp = map (groupTokens lang)
  67                          <$> map tokens2tokensTags
  68                          <$> map _sentenceTokens
  69                          <$> _sentences
  70                          <$> nlp lang txt
  71
  72
  73 ---- | This function analyses and groups (or not) ngrams according to
  74 ----   specific grammars of each language.
  75 groupTokens :: Lang -> [TokenTag] -> [TokenTag]
  76 groupTokens EN = En.groupTokens
  77 groupTokens FR = Fr.groupTokens
  78 groupTokens _  = panic $ pack "groupTokens :: Lang not implemeted yet"