]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Ngrams/Words_hs
[PATH] Data.Gargantext -> Gargantext.
[gargantext.git] / src / Gargantext / Ngrams / Words_hs
1 module Data.Gargantext.Ngrams.Words where
2 import Data.List (partition)
3 import Data.Set (fromList, notMember, member)
4 import Data.Char (isPunctuation, toLower, isAlpha, isSpace)
5
6 import NLP.Stemmer (stem, Stemmer(..))
7 import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
8 import Language.Aspell.Options (ACOption(..))
9
10 --import Data.Either.Utils (fromRight)
11 import Data.ByteString.Internal (packChars)
12
13
-- | Build an Aspell spell checker for the language named by @x@.
--
-- The name is packed into a 'Lang' option and handed to
-- 'spellCheckerWithOptions' as the only option; the action's result is
-- returned untouched.
-- NOTE(review): the result is presumably an @Either@ (error message or
-- checker) -- confirm against the Aspell binding's documentation.
get_lang x = spellCheckerWithOptions [Lang (packChars x)]
18
-- | 'String' front-end for 'check': packs the word with 'packChars' and
-- checks it against the given spell checker.
-- NOTE(review): 'packChars' appears to keep only the low 8 bits of each
-- 'Char'; confirm this matches the encoding the dictionary expects.
check' speller = check speller . packChars
-- | 'String' front-end for 'suggest': packs the word with 'packChars' and
-- asks the given spell checker for suggestions.
-- NOTE(review): 'packChars' appears to keep only the low 8 bits of each
-- 'Char'; confirm this matches the encoding the dictionary expects.
suggest' speller = suggest speller . packChars
21
22 --spell_lang <- spellChecker
23 --lang = fromRight s
24 --suggest' lang x
25
26 -- stem French "naturelles"
27
28
29 -- paragraphs
30 -- lines
31 -- sentences
32
33 -- Prelude.map (\x -> stem French x) $ cleanText "Les hirondelles s envolent dans les cieux."
-- | Replace separator punctuation with a space.
--
-- An apostrophe or a slash becomes @' '@; every other character is
-- returned unchanged. Tabs and newlines are deliberately not rewritten
-- here.
repl :: Char -> Char
repl c = case c of
  '\'' -> ' '
  '/'  -> ' '
  _    -> c
41
-- | Split raw text into lower-cased tokens.
--
-- Every character is lower-cased and passed through 'repl'; whatever is
-- then neither alphabetic nor whitespace is discarded, and the remainder
-- is split on whitespace with 'words'.
--
-- An earlier variant dropped punctuation with 'isPunctuation' instead; it
-- was noted as having a problem with apostrophes.
cleanText :: String -> [String]
cleanText text = words [c | c <- normalised, isAlpha c || isSpace c]
  where
    -- Lower-case first, then blank out the separators handled by 'repl'.
    normalised = [repl (toLower raw) | raw <- text]
46
-- | 'True' when @word@ is one of the hard-coded "miam" words
-- (@"salut"@ or @"phrase"@). The comparison is exact and case-sensitive.
isMiamWord :: String -> Bool
isMiamWord word = word `member` miamWords
  where
    miamWords = fromList ["salut", "phrase"]
50
-- | 'True' when @word@ is one of the hard-coded stop words
-- (@"de"@, @"la"@, @"une"@, @"avec"@). The comparison is exact and
-- case-sensitive.
isStopWord :: String -> Bool
isStopWord word = word `member` stopWords
  where
    stopWords = fromList ["de", "la", "une", "avec"]
54
-- | Small demonstration: tokenise a sample sentence with 'cleanText' and
-- print, first, the tokens partitioned into (non-stop-words, stop-words),
-- then only the non-stop-words.
wordsMain :: IO ()
wordsMain = do
  print (partition keep tokens)
  print (filter keep tokens)
  where
    sample = "Salut, ceci est une phrase \n\n avec de la ponctuation !"
    tokens = cleanText sample
    -- A token is kept when it is not a stop word.
    keep = not . isStopWord
59 --print $ filter isStopWord $ words $ filter (not . isPunctuation) text