-}
-module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake)
+module Gargantext.Core.Text.Terms.Multi (multiterms, multiterms_rake, tokenTagsWith)
where
import Data.Text hiding (map, group, filter, concat)
import Gargantext.Core.Types
import Gargantext.Core.Text.Terms.Multi.PosTagging
+import Gargantext.Core.Text.Terms.Multi.PosTagging.Types
import qualified Gargantext.Core.Text.Terms.Multi.Lang.En as En
import qualified Gargantext.Core.Text.Terms.Multi.Lang.Fr as Fr
import Gargantext.Core.Text.Terms.Multi.RAKE (multiterms_rake)
+-- import qualified Gargantext.Utils.JohnSnowNLP as JohnSnow
+import qualified Gargantext.Utils.SpacyNLP as SpacyNLP
+
+
+-------------------------------------------------------------------
+-- | Signature of an NLP back-end as consumed by 'tokenTagsWith': it takes a
+-- language and a text and yields the parsed 'PosSentences' in 'IO'.
+type NLP_API = Lang -> Text -> IO PosSentences
+
+-------------------------------------------------------------------
+-- | Run 'tokenTags' on the text and flatten the result into one list of
+-- 'Terms', keeping only the tokens whose '_my_token_pos' is @Just NP@.
multiterms :: Lang -> Text -> IO [Terms]
-multiterms lang txt = concat
- <$> map (map tokenTag2terms)
- <$> map (filter (\t -> _my_token_pos t == Just NP))
- <$> tokenTags lang txt
+multiterms = collectNP tokenTag2terms
+  where
+    -- Generic over the conversion applied to every retained token.
+    collectNP :: (TokenTag -> a) -> Lang -> Text -> IO [a]
+    collectNP convert lang txt = keepNP <$> tokenTags lang txt
+      where
+        keepNP sentences =
+          [ convert tag
+          | sentence <- sentences
+          , tag <- sentence
+          , _my_token_pos tag == Just NP
+          ]
+-------------------------------------------------------------------
+-- | Build a 'Terms' value from the first two fields of a 'TokenTag'; the two
+-- remaining fields are ignored.
tokenTag2terms :: TokenTag -> Terms
tokenTag2terms (TokenTag ws t _ _) = Terms ws t
+-- | Tag a text with the NLP back-end chosen for its language: 'corenlp' for
+-- 'EN', 'SpacyNLP.nlp' for 'FR'. Any other language panics.
tokenTags :: Lang -> Text -> IO [[TokenTag]]
-tokenTags lang s = map (groupTokens lang) <$> tokenTags' lang s
+tokenTags lang txt = case lang of
+  EN -> tokenTagsWith EN txt corenlp
+  FR -> tokenTagsWith FR txt SpacyNLP.nlp
+  _  -> panic $ "[G.C.T.T.Multi] Lang NLP API not implemented yet " <> cs (show lang)
+-- | Call the given NLP back-end on the text, then turn every sentence of its
+-- answer into token tags ('tokens2tokensTags') and group them with
+-- 'groupTokens' for the given language. One inner list per sentence.
+tokenTagsWith :: Lang -> Text -> NLP_API -> IO [[TokenTag]]
+tokenTagsWith lang txt nlp = toTokenTags <$> nlp lang txt
+  where
+    toTokenTags parsed =
+      [ sentenceTags sentence | sentence <- _sentences parsed ]
+    sentenceTags sentence =
+      groupTokens lang (tokens2tokensTags (_sentenceTokens sentence))
-tokenTags' :: Lang -> Text -> IO [[TokenTag]]
-tokenTags' lang t = map tokens2tokensTags
- <$> map _sentenceTokens
- <$> _sentences
- <$> corenlp lang t
---- | This function analyses and groups (or not) ngrams according to
---- specific grammars of each language.