import Control.Lens
import Data.Text (Text)
import Data.Traversable
+import GHC.Base (String)
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Text.Terms.Multi (multiterms)
import Gargantext.Text.Terms.Mono (monoTerms)
+import qualified Data.List as List
+import qualified Data.Text as Text
+import Gargantext.Text (sentences)
+import Gargantext.Text.Terms.Mono.Token.En (tokenize)
+import Gargantext.Text.Eleve (mainEleve)
data TermType lang
= Mono { _tt_lang :: lang }
-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
+-- | Tell whether a token is exactly one of the single-character punctuation
+-- marks @!@, @?@, @(@, @)@, @,@, @;@ or @.@.
+--
+-- The whole token is compared, so a longer token that merely contains one of
+-- these characters (for instance @"..."@) is not matched.
+isPunctuation :: Text -> Bool
+isPunctuation token = token `List.elem` punctuationMarks
+  where
+    -- Each character of the literal becomes its own one-character 'Text'.
+    punctuationMarks :: [Text]
+    punctuationMarks = List.map Text.singleton ("!?(),;." :: String)
+
+-- | Unsupervised, language-agnostic ngrams extraction.
+--
+-- Processing steps, in order:
+--
+--   1. split the input with 'sentences';
+--   2. for every sentence: 'tokenize' it, drop the tokens for which
+--      'isPunctuation' holds, and lower-case the remaining tokens;
+--   3. hand all token lists to 'mainEleve' together with @n@ and flatten
+--      its result with 'List.concat';
+--   4. keep only the token lists holding more than one element;
+--   5. remove duplicates with 'List.nub' (first occurrences are kept).
+--
+-- @n@ is forwarded unchanged to 'mainEleve'; presumably it bounds the ngram
+-- size -- TODO confirm against 'mainEleve'.
+--
+-- NOTE(review): 'tokenize' is imported from a module named @...Token.En@,
+-- which may limit the "language agnostic" claim -- confirm.
+--
+-- TODO: remove IO
+-- (NOTE(review): the signature below is already pure -- confirm whether this
+-- TODO is stale.)
+-- TODO: BlockText
+extractTermsUnsupervised :: Int -> Text -> [[Text]]
+extractTermsUnsupervised n txt = List.nub multiTokenNgrams
+  where
+    -- One cleaned, lower-cased token list per sentence of the input.
+    cleanedSentences = map cleanSentence (sentences txt)
+
+    -- Tokenize, strip punctuation tokens, then lower-case what is left.
+    cleanSentence sentence =
+      map Text.toLower (List.filter (not . isPunctuation) (tokenize sentence))
+
+    -- Flattened 'mainEleve' output restricted to lists of at least two tokens.
+    multiTokenNgrams =
+      List.filter ((> 1) . List.length) (List.concat (mainEleve n cleanedSentences))
+