[FIX] PubMed api ok.

[gargantext.git] / src / Gargantext / Text / Terms / Mono.hs
diff --git a/src/Gargantext/Text/Terms/Mono.hs b/src/Gargantext/Text/Terms/Mono.hs

index df29e8ffbdc786dfbb2bde2765a1a7d2b20f852a..e7758599047f2e188d0d886eb65f44d364c89079 100644 (file)
--- a/src/Gargantext/Text/Terms/Mono.hs
+++ b/src/Gargantext/Text/Terms/Mono.hs
@@ -13,10 +13,15 @@ Mono-terms are Nterms where n == 1.
  
  {-# LANGUAGE NoImplicitPrelude #-}
  
-module Gargantext.Text.Terms.Mono (monoterms, monoterms')
+module Gargantext.Text.Terms.Mono (monoTerms, monoTexts, monoTextsBySentence, words)
    where
  
-import Data.Text (Text, toLower, split, splitOn, pack)
+import Prelude (String)
+
+import Data.Text (Text)
+import qualified Data.Text as T
+
+import qualified Data.List as L
  import qualified Data.Set as S
  
  import Gargantext.Core
@@ -24,25 +29,29 @@ import Gargantext.Core.Types
  import Gargantext.Text.Terms.Mono.Stem (stem)
  
  import Gargantext.Prelude
-import Data.Char (isAlphaNum, isSpace)
+--import Data.Char (isAlphaNum, isSpace)
  
-monoterms' :: Lang -> Text -> [Terms]
-monoterms' l txt = map (text2terms l) $ monoterms txt
+-- | TODO remove Num ?
+--isGram  c  = isAlphaNum c
  
-monoterms :: Text -> [Text]
-monoterms txt = map toLower $ split isWord txt
-  where
-    isWord c = c `elem` [' ', '\'', ',', ';']
+words :: Text -> [Text]
+words = monoTexts
  
-text2terms :: Lang -> Text -> Terms
-text2terms lang txt = Terms label stems
-  where
-    label = splitOn (pack " ") txt
-    stems = S.fromList $ map (stem lang) label
+-- | Sentence split separators
+isSep :: Char -> Bool
+isSep = (`elem` (",.:;?!(){}[]\"\'" :: String))
+
+monoTerms :: Lang -> Text -> [Terms]
+monoTerms l txt = map (monoText2term l) $ monoTexts txt
  
-  --monograms :: Text -> [Text]
---monograms xs = monograms $ toLower $ filter isGram xs
+monoTexts :: Text -> [Text]
+monoTexts = L.concat . monoTextsBySentence
  
-isGram :: Char -> Bool
-isGram  c  = isAlphaNum c || isSpace c || c `elem` ['-','/','\'']
+-- | TODO use text2term only
+monoText2term :: Lang -> Text -> Terms
+monoText2term lang txt = Terms [txt] (S.singleton $ stem lang txt)
  
+monoTextsBySentence :: Text -> [[Text]]
+monoTextsBySentence = map T.words
+                    . T.split isSep
+                    . T.toLower