{-|
Module      : Gargantext.Core.Text.Terms.Multi
Description : Multi Terms module
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Multi-terms are ngrams where n > 1.

-}

{-# LANGUAGE NamedFieldPuns    #-}
{-# LANGUAGE OverloadedStrings #-}

module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags, cleanTextForNLP)
  where

import Control.Applicative
import Data.Attoparsec.Text as DAT
import Data.List (concat)
import Data.Text hiding (map, group, filter, concat)
import Gargantext.Core (Lang(..), NLPServerConfig(..), PosTagAlgo(..))
import Gargantext.Core.Text.Terms.Multi.PosTagging
import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
import Gargantext.Core.Types
import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Prelude
import Replace.Attoparsec.Text as RAT
import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
import qualified Gargantext.Utils.SpacyNLP as SpacyNLP

-------------------------------------------------------------------
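-- | A POS-tagging backend: given a language and a raw text, it returns
-- the sentences with their POS-tagged tokens.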
type NLP_API = Lang -> Text -> IO PosSentences

-------------------------------------------------------------------
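-- | Extract multi-terms from a text, with their occurrence counts:
-- the text is cleaned with 'cleanTextForNLP', POS-tagged through
-- 'tokenTags', and only the tags marked @NP@ are kept.
--
-- Illustrative call (not a doctest), assuming @nsc@ describes a
-- reachable NLP server:
--
-- > multiterms nsc EN "Text mining is a growing research field."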
multiterms :: NLPServerConfig -> Lang -> Text -> IO [TermsWithCount]
multiterms nsc l txt = do
  let txt' = cleanTextForNLP txt
  if txt' == ""
    then do
      printDebug "[G.C.T.Terms.Multi] becomes empty after cleanTextForNLP" txt
      pure []
    else do
      ret <- multiterms' tokenTag2terms l txt'
      pure $ groupWithCounts ret
  where
    multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
    multiterms' f lang txt' = concat
                            . map (map f)
                            . map (filter (\t -> _my_token_pos t == Just NP))
                          <$> tokenTags nsc lang txt'

-------------------------------------------------------------------
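-- | Convert a 'TokenTag' to 'Terms', dropping its POS and NER tags.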
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag ws t _ _) = Terms ws t

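-- | POS-tag a text with the configured NLP server and return the tagged
-- tokens grouped by sentence. CoreNLP is only wired for English; Spacy
-- handles any language; other combinations 'panic'.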
tokenTags :: NLPServerConfig -> Lang -> Text -> IO [[TokenTag]]
tokenTags (NLPServerConfig { server = CoreNLP, url }) EN txt = tokenTagsWith EN txt $ corenlp url
tokenTags (NLPServerConfig { server = Spacy, url }) l txt = do
  -- printDebug "NLP Debug" txt
  tokenTagsWith l txt $ SpacyNLP.nlp url
tokenTags _ l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)

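-- | Run an NLP API on a text, turn each sentence's tokens into
-- 'TokenTag's, and group them according to the grammar of the language
-- (see 'groupTokens').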
tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
tokenTagsWith lang txt nlp = map (groupTokens lang)
                           . map tokens2tokensTags
                           . map _sentenceTokens
                           . _sentences
                         <$> nlp lang txt

-- | Analyse and group (or not) ngrams according to the specific grammar
-- of each language. Languages without a dedicated grammar fall back to
-- the French rules.
groupTokens :: Lang -> [TokenTag] -> [TokenTag]
groupTokens EN = En.groupTokens
groupTokens FR = Fr.groupTokens
groupTokens _  = Fr.groupTokens

-- TODO: make tests here
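-- | Prepare a text for the NLP server: URLs (tokens starting with
-- @http@ or @www@) are removed, digits joined by @-@ (and, since the
-- digit parsers also accept zero digits, bare hyphens) are removed, and
-- runs of whitespace are collapsed into a single space.
--
-- Rough, unverified sketch of the intended effect:
--
-- > cleanTextForNLP "See http://example.org for the 2021-2022 report"
-- > -- ~ "See for the report"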
cleanTextForNLP :: Text -> Text
cleanTextForNLP = unifySpaces . removeDigitsWith "-" . removeUrls
  where
    remove x = RAT.streamEdit x (const "")

    unifySpaces = RAT.streamEdit (many DAT.space) (const " ")
    removeDigitsWith x = remove (many DAT.digit *> DAT.string x <* many DAT.digit)

    removeUrls = removeUrlsWith "http" . removeUrlsWith "www"
    removeUrlsWith w = remove (DAT.string w *> many (DAT.notChar ' ') <* many DAT.space)