]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Multi.hs
[conduit] attempt to fix length of parsed docs [does not compile]
[gargantext.git] / src / Gargantext / Core / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14
15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
16 where
17
18 import Data.Text hiding (map, group, filter, concat)
19 import Data.List (concat)
20
21 import Gargantext.Prelude
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types
24
25 import Gargantext.Core.Text.Terms.Multi.PosTagging
26 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
27 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
28
29 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
30
31 -------------------------------------------------------------------
32 -- To be removed
-- | Extract the multi-terms of a text as 'Terms' values.
--
-- This is 'multiterms'' specialised to the 'tokenTag2terms' conversion.
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = multiterms' tokenTag2terms lang txt
35
-- | Tag the text with 'tokenTags', keep only the tokens whose
-- '_my_token_pos' is @Just NP@, convert each kept token with the given
-- function and flatten the per-sentence lists into a single list.
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
multiterms' f lang txt = collect <$> tokenTags lang txt
  where
    -- Flatten the selected tokens of every inner list.
    collect taggedLists = concat (map selectAndConvert taggedLists)

    -- Keep the NP-tagged tokens of one list, then convert them with @f@.
    selectAndConvert tagged = map f (filter isNP tagged)

    isNP tok = _my_token_pos tok == Just NP
41 -------------------------------------------------------------------
-- | Build a 'Terms' value from the first two fields of a 'TokenTag';
-- the last two fields are ignored.
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms tag = case tag of
  TokenTag firstField secondField _ _ -> Terms firstField secondField
44
-- | Tag the text with 'tokenTags'' and then apply the language-specific
-- 'groupTokens' to every inner list of tags.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang s = groupAll <$> tokenTags' lang s
  where
    groupAll taggedLists = map (groupTokens lang) taggedLists
47
48
-- | Run 'corenlp' on the text and turn its result into lists of
-- 'TokenTag': one list per element of '_sentences', obtained by applying
-- 'tokens2tokensTags' to that element's '_sentenceTokens'.
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = toTagLists <$> corenlp lang t
  where
    toTagLists parsed = map sentenceTags (_sentences parsed)

    sentenceTags sentence = tokens2tokensTags (_sentenceTokens sentence)
54
-- | Analyse and group (or not) the tokens into ngrams according to the
-- grammar of the given language.
--
-- Only 'EN' and 'FR' are handled; any other language calls 'panic'.
groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens lang = case lang of
  EN -> En.groupTokens
  FR -> Fr.groupTokens
  _  -> panic (pack "groupTokens :: Lang not implemeted yet")