]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Multi.hs
Merge branch 'dev' into 175-dev-doc-table-count
[gargantext.git] / src / Gargantext / Core / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14
15 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags)
16 where
17
18 import Data.Text hiding (map, group, filter, concat)
19 import Data.List (concat)
20
21 import Gargantext.Prelude
22 import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
23 import Gargantext.Core.Types
24 import Gargantext.Core.Utils (groupWithCounts)
25
26 import Gargantext.Core.Text.Terms.Multi.PosTagging
27 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
28 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
29 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
30
31 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
32 -- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
33
34 import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
35
36
37 -------------------------------------------------------------------
38 type NLP_API = Lang -> Text -> IO PosSentences
39
40 -------------------------------------------------------------------
41 multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
42 multiterms nsc l txt = do
43 ret <- multiterms' tokenTag2terms l txt
44 pure $ groupWithCounts ret
45 where
46 multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
47 multiterms' f lang txt' = concat
48 <$> map (map f)
49 <$> map (filter (\t -> _my_token_pos t == Just NP))
50 <$> tokenTags nsc lang txt'
51
52 -------------------------------------------------------------------
53 tokenTag2terms :: TokenTag -> Terms
54 tokenTag2terms (TokenTag ws t _ _) = Terms ws t
55
56 tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
57 tokenTags (NLPServerConfig { server = CoreNLP, url }) l txt = tokenTagsWith l txt $ corenlp url
58 tokenTags (NLPServerConfig { server = Spacy, url }) l txt = tokenTagsWith l txt $ SpacyNLP.nlp url
59 -- tokenTags FR txt = do
60 -- -- printDebug "[Spacy Debug]" txt
61 -- if txt == ""
62 -- then pure [[]]
63 -- else tokenTagsWith FR txt SpacyNLP.nlp
64 tokenTags _ l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
65
66 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
67 tokenTagsWith lang txt nlp = map (groupTokens lang)
68 <$> map tokens2tokensTags
69 <$> map _sentenceTokens
70 <$> _sentences
71 <$> nlp lang txt
72
73
74 ---- | This function analyses and groups (or not) ngrams according to
75 ---- specific grammars of each language.
76 groupTokens :: Lang -> [TokenTag] -> [TokenTag]
77 groupTokens EN = En.groupTokens
78 groupTokens FR = Fr.groupTokens
79 groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet"