2 Module : Gargantext.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Multi-terms are ngrams where n > 1.
14 {-# LANGUAGE NoImplicitPrelude #-}
16 module Gargantext.Text.Terms.Multi (multiterms)
19 import Data.Text hiding (map, group, filter, concat)
20 import Data.List (concat)
21 import qualified Data.Set as S
23 import Gargantext.Prelude
24 import Gargantext.Core (Lang(..))
25 import Gargantext.Core.Types
27 import Gargantext.Text.Terms.Multi.PosTagging
28 import Gargantext.Text.Terms.Mono.Stem (stem)
29 import qualified Gargantext.Text.Terms.Multi.Lang.En as En
30 import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
32 multiterms :: Lang -> Text -> IO [Terms]
33 multiterms lang txt = concat
34 <$> map (map (tokenTag2terms lang))
35 <$> map (filter (\t -> _my_token_pos t == Just NP))
36 <$> tokenTags lang txt
38 tokenTag2terms :: Lang -> TokenTag -> Terms
39 tokenTag2terms lang (TokenTag w t _ _) = Terms w t'
41 t' = S.fromList $ map (stem lang) $ S.toList t
43 tokenTags :: Lang -> Text -> IO [[TokenTag]]
44 tokenTags lang s = map (group lang) <$> tokenTags' lang s
47 tokenTags' :: Lang -> Text -> IO [[TokenTag]]
48 tokenTags' lang t = map tokens2tokensTags
49 <$> map _sentenceTokens
53 ---- | This function analyses and groups (or not) ngrams according to
54 ---- specific grammars of each language.
55 group :: Lang -> [TokenTag] -> [TokenTag]