2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Multi-terms are ngrams where n > 1.
15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags)
18 import Data.Text hiding (map, group, filter, concat)
19 import Data.List (concat)
21 import Gargantext.Prelude
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types
24 import Gargantext.Core.Utils (groupWithCounts)
26 import Gargantext.Core.Text.Terms.Multi.PosTagging
27 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
28 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
29 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
31 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
32 -- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
34 import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
37 -------------------------------------------------------------------
-- | Shape of an NLP backend as consumed by 'tokenTagsWith': given a language
-- and a text, it produces 'PosSentences' in 'IO'.
38 type NLP_API = Lang -> Text -> IO PosSentences
40 -------------------------------------------------------------------
-- | Extract the multi-terms of a text for the given language.
--
-- The text is run through @multiterms'@ with 'tokenTag2terms' as the
-- conversion, and the resulting list of 'Terms' is handed to
-- 'groupWithCounts' to obtain the 'TermsWithCount' values.
multiterms :: Lang -> Text -> IO [TermsWithCount]
multiterms l txt = groupWithCounts <$> multiterms' tokenTag2terms l txt
-- | Tag @txt'@ with 'tokenTags', keep in every inner list only the tags whose
-- '_my_token_pos' is @Just NP@, convert each kept tag with @f@, and flatten
-- the per-list results into a single list.
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
multiterms' f lang txt' =
  -- 'concat . map' rather than 'concatMap': "Data.Text" is imported
  -- unqualified in this module and exports its own 'concatMap'.
  concat . map (map f . filter isNP) <$> tokenTags lang txt'
  where
    isNP tag = _my_token_pos tag == Just NP
52 -------------------------------------------------------------------
-- | Build a 'Terms' value from the first two fields of a 'TokenTag'; the
-- remaining two fields are ignored.
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms tag = case tag of
  TokenTag firstField secondField _ _ -> Terms firstField secondField
-- | Tag a text with the NLP backend chosen for its language: 'corenlp' for
-- 'EN' and 'SpacyNLP.nlp' for 'FR'. Any other language panics.
56 tokenTags :: Lang -> Text -> IO [[TokenTag]]
57 tokenTags EN txt = tokenTagsWith EN txt corenlp
-- NOTE(review): the FR case is only partly visible here. Its equation head
-- and the @if@/@then@ part of the conditional that owns the @else@ below are
-- not shown, so the guard condition cannot be stated from this view.
-- Restore them from the upstream file before relying on this block.
59 -- printDebug "[Spacy Debug]" txt
62 else tokenTagsWith FR txt SpacyNLP.nlp
-- Fallback for every remaining language: abort, naming the language.
63 tokenTags l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
-- | Produce the grouped 'TokenTag' lists of a text using the supplied
-- 'NLP_API' backend.
--
-- Visible stages, applied from the bottom up inside 'IO': take
-- '_sentenceTokens' of each element, convert them with 'tokens2tokensTags',
-- then apply the language-specific 'groupTokens'.
--
-- NOTE(review): the tail of this pipeline (where @nlp@ would be applied to
-- @lang@ and @txt@) is not visible here; confirm against the upstream file.
65 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
66 tokenTagsWith lang txt nlp = map (groupTokens lang)
67 <$> map tokens2tokensTags
68 <$> map _sentenceTokens
-- | Analyse a list of 'TokenTag's and group them (or not) into ngrams
-- according to the grammar of the given language: 'En.groupTokens' for 'EN',
-- 'Fr.groupTokens' for 'FR'.
--
-- Any other language panics; the message names the unsupported language, in
-- the same style as the panic raised by 'tokenTags'.
groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens EN = En.groupTokens
groupTokens FR = Fr.groupTokens
groupTokens l  = panic $ "[G.C.T.T.Multi] groupTokens: Lang not implemented yet " <> (cs $ show l)