]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Multi.hs
[TestFlow] seems good, need to add tests on it and fix distributional distance.
[gargantext.git] / src / Gargantext / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14 {-# LANGUAGE NoImplicitPrelude #-}
15
16 module Gargantext.Text.Terms.Multi (multiterms)
17 where
18
19 import Data.Text hiding (map, group, filter, concat)
20 import Data.List (concat)
21 import qualified Data.Set as S
22
23 import Gargantext.Prelude
24 import Gargantext.Core (Lang(..))
25 import Gargantext.Core.Types
26
27 import Gargantext.Text.Terms.Multi.PosTagging
28 import Gargantext.Text.Terms.Mono.Stem (stem)
29 import qualified Gargantext.Text.Terms.Multi.Lang.En as En
30 import qualified Gargantext.Text.Terms.Multi.Lang.Fr as Fr
31
32 multiterms :: Lang -> Text -> IO [Terms]
33 multiterms lang txt = concat
34 <$> map (map (tokenTag2terms lang))
35 <$> map (filter (\t -> _my_token_pos t == Just NP))
36 <$> tokenTags lang txt
37
38 tokenTag2terms :: Lang -> TokenTag -> Terms
39 tokenTag2terms lang (TokenTag w t _ _) = Terms w t'
40 where
41 t' = S.fromList $ map (stem lang) $ S.toList t
42
43 tokenTags :: Lang -> Text -> IO [[TokenTag]]
44 tokenTags lang s = map (group lang) <$> tokenTags' lang s
45
46
47 tokenTags' :: Lang -> Text -> IO [[TokenTag]]
48 tokenTags' lang t = map tokens2tokensTags
49 <$> map _sentenceTokens
50 <$> _sentences
51 <$> corenlp lang t
52
53 ---- | This function analyses and groups (or not) ngrams according to
54 ---- specific grammars of each language.
55 group :: Lang -> [TokenTag] -> [TokenTag]
56 group EN = En.group
57 group FR = Fr.group
58