]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Multi.hs
[FIX] PubMed parser
[gargantext.git] / src / Gargantext / Core / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14
15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
16 where
17
18 import Data.Text hiding (map, group, filter, concat)
19 import Data.List (concat)
20
21 import Gargantext.Prelude
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types
24
25 import Gargantext.Core.Text.Terms.Multi.PosTagging
26 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
27 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
28
29 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
30
-- | Extract the multi-terms of a text for a given language.
--
-- The text is tagged and grouped by 'tokenTags', which yields one list of
-- 'TokenTag' per element of its result. Within each of those lists only
-- the tags whose '_my_token_pos' equals @Just NP@ are kept; the survivors
-- are converted with 'tokenTag2terms' and everything is flattened into a
-- single list, preserving order.
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = concat . map sentenceTerms <$> tokenTags lang txt
  where
    -- Terms of one tagged token list: filter first, then convert.
    sentenceTerms = map tokenTag2terms . filter isNP
    -- True only for tags explicitly marked as NP.
    isNP tag = _my_token_pos tag == Just NP
36
-- | Build a 'Terms' value from the first two fields of a 'TokenTag';
-- the two remaining fields of the tag are discarded.
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms tag = case tag of
  TokenTag ws t _ _ -> Terms ws t
39
-- | Tag a text and regroup the tags of each token list with the
-- language-specific 'groupTokens'.
--
-- The raw tags come from @tokenTags'@; each inner list is passed through
-- @groupTokens lang@ independently.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang txt = map regroup <$> tokenTags' lang txt
  where
    -- Grouping function selected once for the requested language.
    regroup = groupTokens lang
42
43
-- | Raw tagging of a text, before any language-specific grouping.
--
-- Calls 'corenlp' with the language and text, takes the '_sentences' of
-- its result and, for each of them, converts its '_sentenceTokens' with
-- 'tokens2tokensTags'. The result holds one list of 'TokenTag' per
-- sentence returned by 'corenlp'.
tokenTags' :: Lang -> Text -> IO [[TokenTag]]
tokenTags' lang t = toTagLists <$> corenlp lang t
  where
    -- Single pass over the sentences: extract tokens, then tag them.
    toTagLists = map (tokens2tokensTags . _sentenceTokens) . _sentences
49
-- | Analyse and group (or not) the tokens into ngrams according to the
-- grammar of the given language.
--
-- 'EN' dispatches to @En.groupTokens@ and 'FR' to @Fr.groupTokens@.
-- Any other language is not supported and results in a 'panic'.
groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens lang = case lang of
  EN -> En.groupTokens
  FR -> Fr.groupTokens
  _  -> panic $ pack "groupTokens :: Lang not implemeted yet"