-}
-module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
+module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith, tokenTags)
where
import Data.Text hiding (map, group, filter, concat)
import Data.List (concat)
-import qualified Data.Set as S
import Gargantext.Prelude
import Gargantext.Core (Lang(..))
import Gargantext.Core.Types
+import Gargantext.Core.Utils (groupWithCounts)
import Gargantext.Core.Text.Terms.Multi.PosTagging
-import Gargantext.Core.Text.Terms.Mono.Stem (stem)
+import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
+-- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
-multiterms :: Lang -> Text -> IO [Terms]
-multiterms lang txt = concat
- <$> map (map (tokenTag2terms lang))
- <$> map (filter (\t -> _my_token_pos t == Just NP))
- <$> tokenTags lang txt
+import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
-tokenTag2terms :: Lang -> TokenTag -> Terms
-tokenTag2terms lang (TokenTag w t _ _) = Terms w t'
+
+-------------------------------------------------------------------
+type NLP_API = Lang -> Text -> IO PosSentences
+
+-------------------------------------------------------------------
+multiterms :: Lang -> Text -> IO [TermsWithCount]
+multiterms l txt = do
+ ret <- multiterms' tokenTag2terms l txt
+ pure $ groupWithCounts ret
where
- t' = S.fromList $ map (stem lang) $ S.toList t
+ multiterms' :: (TokenTag -> a) -> Lang -> Text -> IO [a]
+ multiterms' f lang txt' = concat
+ <$> map (map f)
+ <$> map (filter (\t -> _my_token_pos t == Just NP))
+ <$> tokenTags lang txt'
+
+-------------------------------------------------------------------
+tokenTag2terms :: TokenTag -> Terms
+tokenTag2terms (TokenTag ws t _ _) = Terms ws t
tokenTags :: Lang -> Text -> IO [[TokenTag]]
-tokenTags lang s = map (group lang) <$> tokenTags' lang s
+tokenTags EN txt = tokenTagsWith EN txt corenlp
+tokenTags FR txt = do
+ -- printDebug "[Spacy Debug]" txt
+ if txt == ""
+ then pure [[]]
+ else tokenTagsWith FR txt SpacyNLP.nlp
+tokenTags l _ = panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> (cs $ show l)
+tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
+tokenTagsWith lang txt nlp = map (groupTokens lang)
+ <$> map tokens2tokensTags
+ <$> map _sentenceTokens
+ <$> _sentences
+ <$> nlp lang txt
-tokenTags' :: Lang -> Text -> IO [[TokenTag]]
-tokenTags' lang t = map tokens2tokensTags
- <$> map _sentenceTokens
- <$> _sentences
- <$> corenlp lang t
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.
-group :: Lang -> [TokenTag] -> [TokenTag]
-group EN = En.group
-group FR = Fr.group
-group _ = panic $ pack "group :: Lang not implemeted yet"
+groupTokens :: Lang -> [TokenTag] -> [TokenTag]
+groupTokens EN = En.groupTokens
+groupTokens FR = Fr.groupTokens
+groupTokens _ = panic $ pack "groupTokens :: Lang not implemeted yet"