]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/Multi.hs
Merge remote-tracking branch 'origin/adinapoli/issue-incorrect-pagination' into dev
[gargantext.git] / src / Gargantext / Core / Text / Terms / Multi.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Multi-terms are ngrams where n > 1.
11
12 -}
13
14 {-# LANGUAGE OverloadedStrings #-}
15
16 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
17 where
18
19 import Control.Applicative
20 import Data.Attoparsec.Text as DAT
21 import Data.List (concat)
22 import Data.Text hiding (map, group, filter, concat)
23 import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
24 import Gargantext.Core.Text.Terms.Multi.PosTagging
25 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
26 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
27 import Gargantext.Core.Types
28 import Gargantext.Core.Utils (groupWithCounts)
29 import Gargantext.Prelude
30 import Replace.Attoparsec.Text as RAT
31 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
32 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
33 import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
34
35 -------------------------------------------------------------------
36 type NLP_API = Lang -> Text -> IO PosSentences
37
38 -------------------------------------------------------------------
39 multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
40 multiterms nsc l txt = do
41 ret <- multiterms' tokenTag2terms l $ cleanTextForNLP txt
42 pure $ groupWithCounts ret
43 where
44 multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
45 multiterms' f lang txt' = concat
46 <$> map (map f)
47 <$> map (filter (\t -> _my_token_pos t == Just NP))
48 <$> tokenTags nsc lang txt'
49
50 -------------------------------------------------------------------
51 tokenTag2terms :: TokenTag -> Terms
52 tokenTag2terms (TokenTag ws t _ _) = Terms ws t
53
54 tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
55 tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
56 tokenTags (NLPServerConfig { server = Spacy, url }) l txt = tokenTagsWith l txt $ SpacyNLP.nlp url
57 -- tokenTags FR txt = do
58 -- -- printDebug "[Spacy Debug]" txt
59 -- if txt == ""
60 -- then pure [[]]
61 -- else tokenTagsWith FR txt SpacyNLP.nlp
62 tokenTags _ l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
63
64 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
65 tokenTagsWith lang txt nlp = map (groupTokens lang)
66 <$> map tokens2tokensTags
67 <$> map _sentenceTokens
68 <$> _sentences
69 <$> nlp lang txt
70
71
72 ---- | This function analyses and groups (or not) ngrams according to
73 ---- specific grammars of each language.
74 groupTokens :: Lang -> [TokenTag] -> [TokenTag]
75 groupTokens EN = En.groupTokens
76 groupTokens FR = Fr.groupTokens
77 groupTokens _ = Fr.groupTokens
78
79 -- TODO: make tests here
80 cleanTextForNLP :: Text -> Text
81 cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
82 where
83 remove x = RAT.streamEdit x (const "")
84
85 unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
86 removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
87
88 removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
89 removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)
90