]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Multi.hs
[FEAT] Cooc -> Matrix conversions tools.
[gargantext.git] / src / Gargantext / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15
16 module Gargantext.Text.Terms.Multi (multiterms)
17 where
18
19 import Data.Text hiding (map, group, filter, concat)
20 import Data.List (concat)
21
22 import Gargantext.Prelude
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Core.Types
25
26 import Gargantext.Text.Terms.Multi.PosTagging
27 import qualified Gargantext.Text.Terms.Multi.Lang.En as En
28 import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
29
-- | Extract the multi-terms of a text for a given language.
--
-- Runs 'tokenTags' on the input and, over every inner list it returns,
-- keeps only the tags whose '_my_token_pos' equals @Just NP@. Each kept
-- tag is converted with 'tokenTag2terms' and the results are returned as
-- one flat list, in the order the tags were produced.
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = selectNP <$> tokenTags lang txt
  where
    -- Walk every tag of every inner list, keep the NP ones, convert them.
    selectNP taggedSentences =
      [ tokenTag2terms tag
      | sentence <- taggedSentences
      , tag      <- sentence
      , _my_token_pos tag == Just NP
      ]
35
-- | Build a 'Terms' value from the first two fields of a 'TokenTag';
-- the remaining two fields are ignored.
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms tag = case tag of
  TokenTag firstField secondField _ _ -> Terms firstField secondField
38
-- | Tag a text and apply the language-specific 'group' to each inner
-- list of tags produced by @tokenTags'@.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = groupEach <$> tokenTags' lang s
  where
    -- Grouping function selected once for the requested language.
    regroup = group lang
    groupEach taggedSentences =
      [ regroup sentence | sentence <- taggedSentences ]
41
42
-- | Raw (ungrouped) tagging of a text.
--
-- Runs 'corenlp' on the input, then for each element of the result's
-- '_sentences' converts its '_sentenceTokens' with 'tokens2tokensTags'.
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = tagSentences <$> corenlp lang t
  where
    tagSentences parsed =
      [ tokens2tokensTags (_sentenceTokens sentence)
      | sentence <- _sentences parsed
      ]
48
-- | Analyse and group (or not) ngrams according to the specific grammar
-- of the given language: 'En.group' for 'EN', 'Fr.group' for 'FR'.
group :: Lang -> [TokenTag] -> [TokenTag]
group lang = case lang of
  EN -> En.group
  FR -> Fr.group
54