src/Gargantext/Ngrams/Words_hs

   1 module Data.Gargantext.Ngrams.Words where
   2 import Data.List (partition)
   3 import Data.Set (fromList, notMember, member)
   4 import Data.Char (isPunctuation, toLower, isAlpha, isSpace)
   5
   6 import NLP.Stemmer (stem, Stemmer(..))
   7 import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
   8 import Language.Aspell.Options (ACOption(..))
   9
  10 --import Data.Either.Utils (fromRight)
  11 import Data.ByteString.Internal (packChars)
  12
  13
  14 get_lang x = do
  15     let lang = Lang (packChars x)
  16     spell_lang <- spellCheckerWithOptions [lang]
  17     return spell_lang
  18
  19 check' lang x = check lang (packChars x)
  20 suggest' lang x = suggest lang (packChars x)
  21
  22 --spell_lang <- spellChecker
  23 --lang = fromRight s
  24 --suggest' lang x
  25
  26 -- stem French "naturelles"
  27
  28
-- paragraphs
  30 -- lines
  31 -- sentences
  32
  33 -- Prelude.map (\x -> stem French x) $ cleanText "Les hirondelles s envolent dans les cieux."
  34 repl :: Char -> Char
  35 repl x
  36     | x == '\'' = ' '
  37     | x == '/' = ' '
  38     -- | x == '\t' = ' '
  39     -- | x == '\n' = ' '
  40     | otherwise = x
  41
  42 cleanText text = do
  43     -- pb avec \'
  44     --words $ filter (not . isPunctuation) $ Prelude.map toLower text
  45     words $ filter (\x -> isAlpha x || isSpace x) $ Prelude.map (repl . toLower) text
  46
  47 isMiamWord word = do
  48     let miamWord_set = fromList ["salut", "phrase"]
  49     member word miamWord_set
  50
  51 isStopWord word = do
  52     let stopWord_set = fromList ["de", "la", "une", "avec"]
  53     member word stopWord_set
  54
  55 wordsMain = do
  56     let text = "Salut, ceci est une phrase \n\n avec de la ponctuation !"
  57     print $ partition (not . isStopWord) $ cleanText text
  58     print $ filter (not . isStopWord) $ cleanText text
  59     --print $ filter isStopWord $ words $ filter (not . isPunctuation) text