{-# LANGUAGE NoImplicitPrelude #-}
-module Gargantext.Text.Terms.Mono (monoterms, monoterms')
+module Gargantext.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words)
where
-import Data.Text (Text, toLower, split, splitOn, pack)
+import Prelude (String)
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+import qualified Data.List as L
import qualified Data.Set as S
import Gargantext.Core
import Gargantext.Text.Terms.Mono.Stem (stem)
import Gargantext.Prelude
-import Data.Char (isAlphaNum, isSpace)
+--import Data.Char (isAlphaNum, isSpace)
-monoterms' :: Lang -> Text -> [Terms]
-monoterms' l txt = map (text2terms l) $ monoterms txt
+-- | TODO remove Num ?
+--isGram c = isAlphaNum c
-monoterms :: Text -> [Text]
-monoterms txt = map toLower $ split isWord txt
- where
- isWord c = c `elem` [' ', '\'', ',', ';']
+words :: Text -> [Text]
+words = monoTexts
-text2terms :: Lang -> Text -> Terms
-text2terms lang txt = Terms label stems
- where
- label = splitOn (pack " ") txt
- stems = S.fromList $ map (stem lang) label
+-- | Sentence split separators
+isSep :: Char -> Bool
+isSep = (`elem` (",.:;?!(){}[]\"\'" :: String))
+
+monoTerms :: Lang -> Text -> [Terms]
+monoTerms l txt = map (monoText2term l) $ monoTexts txt
- --monograms :: Text -> [Text]
---monograms xs = monograms $ toLower $ filter isGram xs
+monoTexts :: Text -> [Text]
+monoTexts = L.concat . monoTextsBySentence
-isGram :: Char -> Bool
-isGram c = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
+-- | TODO use text2term only
+monoText2term :: Lang -> Text -> Terms
+monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
+monoTextsBySentence :: Text -> [[Text]]
+monoTextsBySentence = map T.words
+ . T.split isSep
+ . T.toLower