2 Module : Gargantext.Core.Text.Terms.Multi
3 Description : Multi Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Multi-terms are ngrams where n > 1.
14 {-# LANGUAGE OverloadedStrings #-}
16 module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
19 import Control.Applicative
20 import Data.Attoparsec.Text as DAT
21 import Data.List (concat)
22 import Data.Text hiding (map, group, filter, concat)
23 import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
24 import Gargantext.Core.Text.Terms.Multi.PosTagging
25 import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
26 import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
27 import Gargantext.Core.Types
28 import Gargantext.Core.Utils (groupWithCounts)
29 import Gargantext.Prelude
30 import Replace.Attoparsec.Text as RAT
31 import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
32 import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
33 import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
35 -------------------------------------------------------------------
-- | Shape of a POS-tagging backend: given a language and a raw text,
-- produce tagged sentences.  Used as the @nlp@ argument of
-- 'tokenTagsWith' (e.g. CoreNLP or Spacy wrappers).
type NLP_API = Lang -> Text -> IO PosSentences
38 -------------------------------------------------------------------
-- | Extract multi-terms (ngrams with n > 1) from a text, with
-- occurrence counts.  The text is first cleaned for the NLP backend
-- ('cleanTextForNLP'), then POS-tagged; only noun-phrase tags survive
-- the filter in 'multiterms'', and identical terms are grouped and
-- counted by 'groupWithCounts'.
--
-- NOTE(review): 'nsc' is not used directly in this equation; it is
-- presumably threaded to 'tokenTags' inside 'multiterms'' — confirm
-- against the full file.
multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
multiterms nsc l txt =
  -- was: do { ret <- …; pure (groupWithCounts ret) } — collapsed to the
  -- equivalent applicative form.
  groupWithCounts <$> (multiterms' tokenTag2terms l $ cleanTextForNLP txt)
-- | Worker for 'multiterms': POS-tag the text, keep only the tags whose
-- POS is a noun phrase (NP), and flatten the per-sentence lists.
--
-- NOTE(review): 'nsc' is not bound by this equation — it presumably
-- comes from an enclosing scope (e.g. a 'where' under 'multiterms');
-- likewise the application of 'f' to the surviving tags is on a line
-- not shown here.  Confirm against the full file.
multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
multiterms' f lang txt' = concat
  -- keep only noun-phrase tags within each sentence
  <$> map (filter (\t -> _my_token_pos t == Just NP))
  <$> tokenTags nsc lang txt'
50 -------------------------------------------------------------------
-- | Repack the first two fields of a 'TokenTag' into a 'Terms' value,
-- discarding the remaining tagging information.
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms tag = case tag of
  TokenTag ws t _ _ -> Terms ws t
-- | POS-tag a text with the configured NLP server, one list of
-- 'TokenTag's per sentence.  CoreNLP is only wired up for English;
-- Spacy handles any configured language.  Other combinations panic.
tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTags (NLPServerConfig { server = CoreNLP, url = u }) EN txt =
  tokenTagsWith EN txt (corenlp u)
tokenTags (NLPServerConfig { server = Spacy, url = u }) lang txt =
  tokenTagsWith lang txt (SpacyNLP.nlp u)
-- tokenTags FR txt = do
-- -- printDebug "[Spacy Debug]" txt
-- else tokenTagsWith FR txt SpacyNLP.nlp
tokenTags _ lang _ =
  panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show lang)
-- | Run the given NLP backend over a text and shape its output into
-- grouped token tags (one inner list per sentence).
--
-- NOTE(review): the '<$>' chain visibly continues past these lines
-- (the call to @nlp@ itself is not shown here) — only the visible
-- stages are documented.
tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
tokenTagsWith lang txt nlp = map (groupTokens lang)   -- language-specific grouping, per sentence
                         <$> map tokens2tokensTags    -- raw POS tokens -> 'TokenTag's
                         <$> map _sentenceTokens      -- token list of each sentence
-- | Analyse and group (or not) ngrams according to the specific
-- grammar of each language.  Languages without a dedicated grammar
-- fall back to the French rules.
groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens lang = case lang of
  EN -> En.groupTokens
  FR -> Fr.groupTokens
  _  -> Fr.groupTokens
-- TODO: make tests here
-- | Normalise raw text before sending it to an NLP server: strip URLs
-- (tokens starting with "http" or "www", up to the next space), remove
-- "-" separators together with surrounding digit runs, and collapse
-- whitespace runs into a single space.
cleanTextForNLP :: Text -> Text
cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls

-- NOTE(review): the helpers below read like a 'where' clause of
-- 'cleanTextForNLP' (a 'where' line may be elided from this view) —
-- confirm their scope before moving them.

-- Replace every match of parser @x@ with the empty string.
remove x = RAT.streamEdit x (const "")

-- Collapse each run of whitespace into a single space.
unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
-- Remove the separator @x@ together with any digit runs around it.
removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)

-- Both "http…" and "www…" prefixes are treated as URLs.
removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
-- Delete from prefix @w@ up to the next space, plus trailing spaces.
removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)