]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Multi.hs
[WIP] First specification for #145 issue
[gargantext.git] / src / Gargantext / Core / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14
15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith)
16 where
17
18 import Data.Text hiding (map, group, filter, concat)
19 import Data.List (concat)
20
21 import Gargantext.Prelude
22 import Gargantext.Core (Lang(..))
23 import Gargantext.Core.Types
24
25 import Gargantext.Core.Text.Terms.Multi.PosTagging
26 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
27 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
28 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
29
30 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
31 -- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
32
33 import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
34
35
36 -------------------------------------------------------------------
37 type NLP_API = Lang -> Text -> IO PosSentences
38
39 -------------------------------------------------------------------
40 multiterms :: Lang -> Text -> IO [Terms]
41 multiterms = multiterms' tokenTag2terms
42 where
43 multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
44 multiterms' f lang txt = concat
45 <$> map (map f)
46 <$> map (filter (\t -> _my_token_pos t == Just NP))
47 <$> tokenTags lang txt
48
49 -------------------------------------------------------------------
50 tokenTag2terms :: TokenTag -> Terms
51 tokenTag2terms (TokenTag ws t _ _) = Terms ws t
52
53 tokenTags :: Lang -> Text -> IO [[TokenTag]]
54 tokenTags EN txt = tokenTagsWith EN txt corenlp
55 tokenTags FR txt = tokenTagsWith FR txt SpacyNLP.nlp
56 tokenTags l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
57
58 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
59 tokenTagsWith lang txt nlp = map (groupTokens lang)
60 <$> map tokens2tokensTags
61 <$> map _sentenceTokens
62 <$> _sentences
63 <$> nlp lang txt
64
65
66 ---- | This function analyses and groups (or not) ngrams according to
67 ---- specific grammars of each language.
68 groupTokens :: Lang -> [TokenTag] -> [TokenTag]
69 groupTokens EN = En.groupTokens
70 groupTokens FR = Fr.groupTokens
71 groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet"