]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Multi.hs
Merge branch 'dev' into 104-dev-john-snow-nlp
[gargantext.git] / src / Gargantext / Core / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14
15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
16 where
17
18 import Data.Text hiding (map, group, filter, concat)
19 import Data.List (concat)
20
21 import Gargantext.Prelude
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types
24
25 import Gargantext.Core.Text.Terms.Multi.PosTagging
26 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
27 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
28 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
29
30 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
31
32 -------------------------------------------------------------------
-- | Extract the multi-terms of a text as 'Terms'.
--
-- Specialisation of @multiterms'@ that converts every selected 'TokenTag'
-- with 'tokenTag2terms'.
--
-- NOTE: marked as "to be removed" by the original authors.
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = multiterms' tokenTag2terms lang txt
36
-- | Tag the text with 'tokenTags', keep only the tokens whose
-- @_my_token_pos@ is @Just NP@, and apply @f@ to each of them.
--
-- The per-sentence grouping returned by 'tokenTags' is flattened: the result
-- is a single list, in sentence order and then token order.
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
multiterms' f lang txt = selectNP <$> tokenTags lang txt
  where
    -- Filter, convert and flatten in one pass over the tagged sentences.
    selectNP sentences =
      [ f tok
      | sentence <- sentences
      , tok      <- sentence
      , _my_token_pos tok == Just NP
      ]
42 -------------------------------------------------------------------
-- | Build a 'Terms' value from the first two fields of a 'TokenTag';
-- the two remaining fields are discarded.
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms tag = case tag of
  TokenTag label stems _ _ -> Terms label stems
45
-- | Tag a text sentence by sentence (see @tokenTags'@) and then regroup the
-- tokens of every sentence with the language-specific 'groupTokens'.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = regroup <$> tokenTags' lang s
  where
    -- Apply the grammar of @lang@ to each sentence independently.
    regroup sentences = [ groupTokens lang sentence | sentence <- sentences ]
48
49
-- | Run 'corenlp' on the text and convert its result into one list of
-- 'TokenTag' per sentence, without any language-specific grouping.
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = toTokenTags <$> corenlp lang t
  where
    -- For every sentence of the parsed result, take its tokens and turn
    -- them into 'TokenTag's.
    toTokenTags parsed =
      [ tokens2tokensTags (_sentenceTokens sentence)
      | sentence <- _sentences parsed
      ]
55
-- | Analyse and group (or not) the tokens of a sentence into ngrams,
-- according to the grammar of the given language.
--
-- Only 'EN' and 'FR' are handled; any other 'Lang' calls 'panic'.
-- NOTE(review): the panic message contains the typo "implemeted"; it is kept
-- verbatim here because it is a runtime string.
groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens lang = case lang of
  EN -> En.groupTokens
  FR -> Fr.groupTokens
  _  -> panic (pack "groupTokens :: Lang not implemeted yet")