]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Multi.hs
[arxiv] more work on arxiv code
[gargantext.git] / src / Gargantext / Core / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14
15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith)
16 where
17
18 import Data.Text hiding (map, group, filter, concat)
19 import Data.List (concat)
20
21 import Gargantext.Prelude
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types
24
25 import Gargantext.Core.Text.Terms.Multi.PosTagging
26 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
27 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
28 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
29
30 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
31 import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
32
33
34 -------------------------------------------------------------------
-- | Shape of an NLP backend usable by 'tokenTagsWith': given a language
-- and a raw text, it produces the 'PosSentences' of that text in 'IO'.
type NLP_API
  =  Lang
  -> Text
  -> IO PosSentences
36
37 -------------------------------------------------------------------
38 -- To be removed
-- | Extracts the multi-terms of a text as 'Terms'.
--
-- This is @multiterms'@ specialised to 'tokenTag2terms': every token tag
-- that @multiterms'@ keeps is converted into a 'Terms' value.
multiterms :: Lang -> Text -> IO [Terms]
multiterms lang txt = multiterms' tokenTag2terms lang txt
41
-- | Tags the text with 'tokenTags', keeps only the token tags whose
-- '_my_token_pos' is @Just NP@, applies @f@ to each kept tag and flattens
-- the nested lists into a single list (order preserved).
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
multiterms' f lang txt = selectNP <$> tokenTags lang txt
  where
    -- Filter and convert inside each inner list, then flatten.
    selectNP taggedSentences = concat
      [ [ f tag | tag <- sentence, _my_token_pos tag == Just NP ]
      | sentence <- taggedSentences
      ]
47
48 -------------------------------------------------------------------
-- | Builds a 'Terms' from the first two fields of a 'TokenTag'; the two
-- remaining fields are discarded.
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms tag =
  case tag of
    TokenTag firstField secondField _ _ -> Terms firstField secondField
51
-- | Tags a text with the NLP backend wired for its language:
-- 'corenlp' for 'EN' and @JohnSnow.nlp@ for 'FR'.
--
-- Any other language panics, since no backend is wired for it here.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
tokenTags lang txt =
  case lang of
    EN -> tokenTagsWith EN txt corenlp
    FR -> tokenTagsWith FR txt JohnSnow.nlp
    _  -> panic "[G.C.T.T.Multi] NLP API not implemented yet"
56
-- | Runs the given NLP backend on the text and post-processes its result:
-- for each element of '_sentences', the '_sentenceTokens' are converted
-- with 'tokens2tokensTags' and then grouped with 'groupTokens' for the
-- given language.
tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
tokenTagsWith lang txt nlp = tagAll <$> nlp lang txt
  where
    -- One inner list of token tags per sentence returned by the backend.
    tagAll posSentences =
      [ groupTokens lang (tokens2tokensTags (_sentenceTokens sentence))
      | sentence <- _sentences posSentences
      ]
63
64
-- | Analyses a list of token tags and groups (or not) ngrams according to
-- the grammar of the given language, delegating to @En.groupTokens@ for
-- 'EN' and @Fr.groupTokens@ for 'FR'.
--
-- Any other language panics: no grammar is wired for it in this module.
groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens EN = En.groupTokens
groupTokens FR = Fr.groupTokens
-- The message carries the same module tag as the panic in 'tokenTags', so
-- both failures can be traced back to this module.
groupTokens _  = panic $ pack "[G.C.T.T.Multi] groupTokens: Lang not implemented yet"
70 groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet"