2 Module : Gargantext.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Multi-terms are ngrams where n > 1.
14 {-# LANGUAGE NoImplicitPrelude #-}
16 module Gargantext.Text.Terms.Multi (multiterms, multiterms_rake)
19 import Data.Text hiding (map, group, filter, concat)
20 import Data.List (concat)
21 import qualified Data.Set as S
23 import Gargantext.Prelude
24 import Gargantext.Core (Lang(..))
25 import Gargantext.Core.Types
27 import Gargantext.Text.Terms.Multi.PosTagging
28 import Gargantext.Text.Terms.Mono.Stem (stem)
29 import qualified Gargantext.Text.Terms.Multi.Lang.En as En
30 import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
32 import Gargantext.Text.Terms.Multi.RAKE (multiterms_rake)
34 multiterms :: Lang -> Text -> IO [Terms]
35 multiterms lang txt = concat
36 <$> map (map (tokenTag2terms lang))
37 <$> map (filter (\t -> _my_token_pos t == Just NP))
38 <$> tokenTags lang txt
40 tokenTag2terms :: Lang -> TokenTag -> Terms
41 tokenTag2terms lang (TokenTag w t _ _) = Terms w t'
43 t' = S.fromList $ map (stem lang) $ S.toList t
45 tokenTags :: Lang -> Text -> IO [[TokenTag]]
46 tokenTags lang s = map (group lang) <$> tokenTags' lang s
49 tokenTags' :: Lang -> Text -> IO [[TokenTag]]
50 tokenTags' lang t = map tokens2tokensTags
51 <$> map _sentenceTokens
55 ---- | This function analyses and groups (or not) ngrams according to
56 ---- specific grammars of each language.
57 group :: Lang -> [TokenTag] -> [TokenTag]
60 group _ = panic $ pack "group :: Lang not implemeted yet"