-}
{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE TemplateHaskell #-}
module Gargantext.Text.Terms
where
+import Control.Lens
import Data.Text (Text)
import Data.Traversable
+import GHC.Base (String)
import Gargantext.Prelude
import Gargantext.Core
import Gargantext.Core.Types
import Gargantext.Text.Terms.Multi (multiterms)
-import Gargantext.Text.Terms.Mono (monoterms')
+import Gargantext.Text.Terms.Mono (monoTerms)
-data TermType lang = Mono lang | Multi lang | MonoMulti lang
+import qualified Data.List as List
+import qualified Data.Text as Text
+import Gargantext.Text (sentences)
+import Gargantext.Text.Terms.Mono.Token.En (tokenize)
+import Gargantext.Text.Eleve (mainEleve)
+
+data TermType lang
+ = Mono { _tt_lang :: lang }
+ | Multi { _tt_lang :: lang }
+ | MonoMulti { _tt_lang :: lang }
+
+makeLenses ''TermType
+
+--group :: [Text] -> [Text]
+--group = undefined
-- remove Stop Words
-- map (filter (\t -> not . elem t)) $
------------------------------------------------------------------------
-- | Sugar to extract terms from text (hiddeng mapM from end user).
-extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
+--extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
+extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
extractTerms termTypeLang = mapM (terms termTypeLang)
------------------------------------------------------------------------
-- | Terms from Text
-- MonoMulti : mono and multi
-- TODO : multi terms should exclude mono (intersection is not empty yet)
terms :: TermType Lang -> Text -> IO [Terms]
-terms (Mono lang) txt = pure $ monoterms' lang txt
+terms (Mono lang) txt = pure $ monoTerms lang txt
terms (Multi lang) txt = multiterms lang txt
terms (MonoMulti lang) txt = terms (Multi lang) txt
+-- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
------------------------------------------------------------------------
+isPunctuation :: Text -> Bool
+isPunctuation x = List.elem x $ (Text.pack . pure)
+ <$> ("!?(),;." :: String)
+
+-- | Unsupervised ngrams extraction
+-- language agnostic extraction
+-- TODO: remove IO
+-- TODO: BlockText
+extractTermsUnsupervised :: Int -> Text -> [[Text]]
+extractTermsUnsupervised n =
+ List.nub
+ . (List.filter (\l -> List.length l > 1))
+ . List.concat
+ . mainEleve n
+ . map (map Text.toLower)
+ . map (List.filter (not . isPunctuation))
+ . map tokenize
+ . sentences
+