change the logs output

[gargantext.git] / src / Gargantext / Text / Terms.hs
diff --git a/src/Gargantext/Text/Terms.hs b/src/Gargantext/Text/Terms.hs

index fefeadd38888253a5a120d0331d2ea7f39a2cf3e..43ace46090d8a1de719e3a11293727a9741cdae8 100644 (file)
--- a/src/Gargantext/Text/Terms.hs
+++ b/src/Gargantext/Text/Terms.hs
@@ -29,27 +29,44 @@ compute graph
  -}
  
  {-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE TemplateHaskell   #-}
  
  module Gargantext.Text.Terms
    where
  
-import Data.List (concat)
+import Control.Lens
  import Data.Text (Text)
  import Data.Traversable
+import GHC.Base (String)
  
  import Gargantext.Prelude
  import Gargantext.Core
  import Gargantext.Core.Types
  import Gargantext.Text.Terms.Multi (multiterms)
  import Gargantext.Text.Terms.Mono  (monoTerms)
-import Gargantext.Text.Terms.WithList (Patterns, extractTermsWithList)
-
-
-data TermType lang = Mono lang | Multi lang | MonoMulti lang | WithList Patterns
-
-
-group :: [Text] -> [Text]
-group = undefined
+import Gargantext.Text.Terms.Mono.Stem (stem)
+
+import qualified Data.Set  as Set
+import qualified Data.List as List
+import qualified Data.Text as Text
+import Gargantext.Text (sentences)
+import Gargantext.Text.Terms.Mono.Token.En (tokenize)
+import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
+
+data TermType lang
+  = Mono      { _tt_lang :: lang }
+  | Multi     { _tt_lang :: lang }
+  | MonoMulti { _tt_lang :: lang }
+  | Unsupervised { _tt_lang  :: lang
+                 , _tt_windoSize  :: Int
+                 , _tt_ngramsSize :: Int
+                 , _tt_model :: Maybe (Tries Token ())
+  }
+makeLenses ''TermType
+
+--group :: [Text] -> [Text]
+--group = undefined
  
  -- remove Stop Words
  -- map (filter (\t -> not . elem t)) $ 
@@ -57,7 +74,15 @@ group = undefined
  -- | Sugar to extract terms from text (hiddeng mapM from end user).
  --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
  extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
-extractTerms termTypeLang = mapM (terms termTypeLang)
+
+extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
+  where
+    m' = case m of
+      Just m''-> m''
+      Nothing -> newTries n (Text.intercalate " " xs)
+
+extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
+
  ------------------------------------------------------------------------
  -- | Terms from Text
  -- Mono : mono terms
@@ -68,6 +93,46 @@ terms :: TermType Lang -> Text -> IO [Terms]
  terms (Mono      lang) txt = pure $ monoTerms lang txt
  terms (Multi     lang) txt = multiterms lang txt
  terms (MonoMulti lang) txt = terms (Multi lang) txt
-terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
+terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
+  where
+    m' = maybe (newTries n txt) identity m
+-- terms (WithList  list) txt = pure . concat $ extractTermsWithList list txt
  ------------------------------------------------------------------------
  
+text2term :: Lang -> [Text] -> Terms
+text2term _ [] = Terms [] Set.empty
+text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
+
+isPunctuation :: Text -> Bool
+isPunctuation x = List.elem x $  (Text.pack . pure)
+                             <$> ("!?(),;." :: String)
+
+-- | Unsupervised ngrams extraction
+-- language agnostic extraction
+-- TODO: remove IO
+-- TODO: newtype BlockText
+
+type WindowSize = Int
+type MinNgramSize = Int
+
+termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
+termsUnsupervised (Unsupervised l n s m) =
+               pure
+             . map (text2term l)
+             . List.nub
+             . (List.filter (\l' -> List.length l' >= s))
+             . List.concat
+             . mainEleveWith (maybe (panic "no model") identity m) n
+             . uniText
+termsUnsupervised _ = undefined
+
+newTries :: Int -> Text -> Tries Token ()
+newTries n t = buildTries n (fmap toToken $ uniText t)
+
+-- | TODO removing long terms > 24
+uniText :: Text -> [[Text]]
+uniText = map (List.filter (not . isPunctuation))
+        . map tokenize
+        . sentences       -- | TODO get sentences according to lang
+        . Text.toLower
+