Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Multi.hs
Merge branch 'dev-doc-annotation-issue' of ssh://gitlab.iscpif.fr:20022/gargantext...
[gargantext.git] / src / Gargantext / Core / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14
15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
16 where
17
18 import Data.Text hiding (map, group, filter, concat)
19 import Data.List (concat)
20 import qualified Data.Set as S
21
22 import Gargantext.Prelude
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Core.Types
25
26 import Gargantext.Core.Text.Terms.Multi.PosTagging
27 import Gargantext.Core.Text.Terms.Mono.Stem (stem)
28 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
29 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
30
31 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
32
-- | Extract the terms of a text.
--
-- The text is tagged with 'tokenTags', which yields one list of tokens per
-- sentence. Only the tokens whose '_my_token_pos' equals @Just NP@ are kept;
-- each of them is converted with 'tokenTag2terms' and the per-sentence
-- results are flattened into a single list.
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = collect <$> tokenTags lang txt
  where
    -- Flatten the per-sentence term lists into one list.
    collect sentences = concat $ map termsOf sentences
    -- Terms of a single sentence: keep the NP tokens, then convert them.
    termsOf tags      = map (tokenTag2terms lang) $ filter isNP tags
    isNP tag          = _my_token_pos tag == Just NP
38
-- | Build a 'Terms' value from a 'TokenTag'.
--
-- The first field of the token is reused unchanged; every element of the
-- second field (a set) is passed through @'stem' lang@. The two remaining
-- fields of the token are ignored.
tokenTag2terms :: Lang -> TokenTag -> Terms
tokenTag2terms lang (TokenTag label forms _ _) = Terms label stemmed
  where
    -- Elements that stem to the same value collapse into one set entry.
    stemmed = S.map (stem lang) forms
43
-- | Tag a text and apply the language-specific 'group' to each sentence.
--
-- The raw per-sentence tokens come from @tokenTags'@. Note that 'group' is
-- only defined for 'EN' and 'FR'; for any other language it panics once the
-- grouped sentence is evaluated.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang txt = groupEach <$> tokenTags' lang txt
  where
    groupEach sentences = map (group lang) sentences
46
47
-- | Tag a text without any grouping.
--
-- Runs 'corenlp' on the text, takes the '_sentences' of its result and, for
-- each sentence, converts its '_sentenceTokens' with 'tokens2tokensTags'.
-- The outer list therefore has one entry per sentence.
-- NOTE(review): 'corenlp' presumably queries an external CoreNLP service;
-- confirm in "Gargantext.Core.Text.Terms.Multi.PosTagging".
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = perSentence <$> corenlp lang t
  where
    perSentence parsed   = map tagSentence $ _sentences parsed
    tagSentence sentence = tokens2tokensTags $ _sentenceTokens sentence
53
-- | Analyse the tokens of a sentence and group them (or not) into ngrams,
-- according to the grammar specific to each language.
--
-- Dispatches to 'En.group' for 'EN' and to 'Fr.group' for 'FR'. Any other
-- language is not supported and makes the function panic.
group :: Lang -> [TokenTag] -> [TokenTag]
group lang = case lang of
  EN -> En.group
  FR -> Fr.group
  _  -> panic $ pack "group :: Lang not implemeted yet"