2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Multi-terms are ngrams where n > 1.
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
19 import Control.Applicative
20 import Data.Attoparsec.Text as DAT
21 import Data.List (concat)
22 import Data.Text hiding (map, group, filter, concat)
23 import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
24 import Gargantext.Core.Text.Terms.Multi.PosTagging
25 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
26 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
27 import Gargantext.Core.Types
28 import Gargantext.Core.Utils (groupWithCounts)
29 import Gargantext.Prelude
30 import Replace.Attoparsec.Text as RAT
31 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
32 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
33 import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
35 -------------------------------------------------------------------
36 type NLP_API = Lang -> Text -> IO PosSentences
38 -------------------------------------------------------------------
39 multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
40 multiterms nsc l txt = do
41 let txt' = cleanTextForNLP txt
44 printDebug "[G.C.T.Terms.Multi] becomes empty after cleanTextForNLP" txt
47 ret <- multiterms' tokenTag2terms l txt'
48 pure $ groupWithCounts ret
50 multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
51 multiterms' f lang txt' = concat
53 <$> map (filter (\t -> _my_token_pos t == Just NP))
54 <$> tokenTags nsc lang txt'
56 -------------------------------------------------------------------
57 tokenTag2terms :: TokenTag -> Terms
58 tokenTag2terms (TokenTag ws t _ _) = Terms ws t
60 tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
61 tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
62 tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
63 -- printDebug "NLP Debug" txt
64 tokenTagsWith l txt $ SpacyNLP.nlp url
65 -- tokenTags FR txt = do
66 -- -- printDebug "[Spacy Debug]" txt
69 -- else tokenTagsWith FR txt SpacyNLP.nlp
70 tokenTags _ l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
72 tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
73 tokenTagsWith lang txt nlp = map (groupTokens lang)
74 <$> map tokens2tokensTags
75 <$> map _sentenceTokens
80 ---- | This function analyses and groups (or not) ngrams according to
81 ---- specific grammars of each language.
82 groupTokens :: Lang -> [TokenTag] -> [TokenTag]
83 groupTokens EN = En.groupTokens
84 groupTokens FR = Fr.groupTokens
85 groupTokens _ = Fr.groupTokens
87 -- TODO: make tests here
88 cleanTextForNLP :: Text -> Text
89 cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
91 remove x = RAT.streamEdit x (const "")
93 unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
94 removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)
96 removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
97 removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)