2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Multi-terms are ngrams where n > 1.
15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
18 import Data.Text hiding (map, group, filter, concat)
19 import Data.List (concat)
20 import qualified Data.Set as S
22 import Gargantext.Prelude
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Core.Types
26 import Gargantext.Core.Text.Terms.Multi.PosTagging
27 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
28 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
29 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
31 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
-- | Extract the multi-terms of a text.
--
-- The text is tagged with 'tokenTags'. In every resulting token list only
-- the tokens whose '_my_token_pos' equals @Just NP@ are kept; each kept
-- token is converted with 'tokenTag2terms' and the per-list results are
-- flattened into a single list.
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = npTerms <$> tokenTags lang txt
  where
    -- Nested generators yield the same elements, in the same order, as
    -- filtering and converting each inner list and then concatenating.
    npTerms taggedLists =
      [ tokenTag2terms lang tag
      | tags <- taggedLists
      , tag  <- tags
      , _my_token_pos tag == Just NP
      ]
-- | Convert a 'TokenTag' into a 'Terms' value.
--
-- The first field of the tag is passed through unchanged. The second field
-- (a set) has every element stemmed with 'stem' for the given language and
-- is rebuilt as a set. The two remaining fields are ignored.
tokenTag2terms :: Lang -> TokenTag -> Terms
tokenTag2terms lang (TokenTag label forms _ _) =
  let stemmedForms = S.fromList [ stem lang form | form <- S.toList forms ]
  in  Terms label stemmedForms
-- | Tag a text with @tokenTags'@ and then apply the language-specific
-- 'group' function to each of the resulting token lists.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = regroup <$> tokenTags' lang s
  where
    -- One 'group' pass per inner list; the outer list structure is kept.
    regroup taggedLists = [ group lang tags | tags <- taggedLists ]
-- | Token tags of a text before grouping ('tokenTags' maps 'group' over
-- this result).
--
-- The outermost stage converts every token list obtained through
-- '_sentenceTokens' with 'tokens2tokensTags'.
-- NOTE(review): the source of the values fed to '_sentenceTokens' is
-- presumably the PosTagging module imported above; confirm.
48 tokenTags' :: Lang -> Text -> IO [[TokenTag]]
49 tokenTags' lang t = map tokens2tokensTags
50 <$> map _sentenceTokens
54 ---- | This function analyses and groups (or not) ngrams according to
55 ---- specific grammars of each language.
56 group :: Lang -> [TokenTag] -> [TokenTag]
-- Fallback clause: any language that reaches this wildcard aborts with
-- 'panic' instead of returning the tokens.
-- NOTE(review): callers such as 'tokenTags' therefore fail for languages
-- that have no dedicated clause; confirm this is intended.
59 group _ = panic $ pack "group :: Lang not implemeted yet"