src/Gargantext/Core/Text/Terms/Multi.hs

   1 {-|
   2 Module      : Gargantext.Core.Text.Terms.Multi
   3 Description : Multi Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Multi-terms are ngrams where n > 1.
  11
  12 -}
  13
  14
  15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags)
  16   where
  17
  18 import Data.Text hiding (map, group, filter, concat)
  19 import Data.List (concat)
  20
  21 import Gargantext.Prelude
  22 import Gargantext.Core (Lang(..))
  23 import Gargantext.Core.Types
  24 import Gargantext.Core.Utils (groupWithCounts)
  25
  26 import Gargantext.Core.Text.Terms.Multi.PosTagging
  27 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
  28 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
  29 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
  30
  31 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
  32 -- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
  33
  34 import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
  35
  36
  37 -------------------------------------------------------------------
  38 type NLP_API = Lang -> Text -> IO PosSentences
  39
  40 -------------------------------------------------------------------
  41 multiterms :: Lang -> Text -> IO [TermsWithCount]
  42 multiterms l txt = do
  43   ret <- multiterms' tokenTag2terms l txt
  44   pure $ groupWithCounts ret
  45   where
  46     multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
  47     multiterms' f lang txt' = concat
  48                        <$> map (map f)
  49                        <$> map (filter (\t -> _my_token_pos t == Just NP))
  50                        <$> tokenTags lang txt'
  51
  52 -------------------------------------------------------------------
  53 tokenTag2terms :: TokenTag -> Terms
  54 tokenTag2terms (TokenTag ws t _ _) =  Terms ws t
  55
  56 tokenTags :: Lang -> Text -> IO [[TokenTag]]
  57 tokenTags EN txt = tokenTagsWith EN txt corenlp
  58 tokenTags FR txt = tokenTagsWith FR txt SpacyNLP.nlp
  59 tokenTags l  _   = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
  60
  61 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
  62 tokenTagsWith lang txt nlp = map (groupTokens lang)
  63                          <$> map tokens2tokensTags
  64                          <$> map _sentenceTokens
  65                          <$> _sentences
  66                          <$> nlp lang txt
  67
  68
  69 ---- | This function analyses and groups (or not) ngrams according to
  70 ----   specific grammars of each language.
  71 groupTokens :: Lang -> [TokenTag] -> [TokenTag]
  72 groupTokens EN = En.groupTokens
  73 groupTokens FR = Fr.groupTokens
  74 groupTokens _  = panic $ pack "groupTokens :: Lang not implemeted yet"