]> Git — Sourcephile - gargantext.git/blob - src/Data/Gargantext/Ngrams/Words.hs
[NLP] parseWith function and improving types clarity.
[gargantext.git] / src / Data / Gargantext / Ngrams / Words.hs
1
2 module Data.Gargantext.Ngrams.Words where
3 import Data.List (partition)
4 import Data.Set (fromList, notMember, member)
5 import Data.Char (isPunctuation, toLower, isAlpha, isSpace)
6
7 import NLP.Stemmer (stem, Stemmer(..))
8 import Language.Aspell (check, suggest, spellChecker, spellCheckerWithOptions)
9 import Language.Aspell.Options (ACOption(..))
10
11 --import Data.Either.Utils (fromRight)
12 import Data.ByteString.Internal (packChars)
13
14
-- | Create an Aspell spell checker for the language code @x@.
--
-- The code is packed into bytes with 'packChars', wrapped in the 'Lang'
-- option and handed to 'spellCheckerWithOptions'. The result of that action
-- is returned as is (no unwrapping or error handling happens here).
--
-- NOTE(review): presumably @x@ is an Aspell language tag such as "fr" or
-- "en" -- confirm against callers.
get_lang x = spellCheckerWithOptions [Lang (packChars x)]
19
-- | 'check' for a plain 'String': the word is packed to bytes before the
-- spell checker @lang@ is queried.
--
-- NOTE(review): 'packChars' keeps only the low 8 bits of every 'Char';
-- confirm that this matches the encoding the spell checker expects.
check' lang = check lang . packChars
-- | 'suggest' for a plain 'String': the word is packed to bytes before the
-- spell checker @lang@ is asked for suggestions.
--
-- NOTE(review): 'packChars' keeps only the low 8 bits of every 'Char';
-- confirm that this matches the encoding the spell checker expects.
suggest' lang = suggest lang . packChars
22
23 --spell_lang <- spellChecker
24 --lang = fromRight s
25 --suggest' lang x
26
27 -- stem French "naturelles"
28
29
30 -- paragraphs
31 -- lines
32 -- sentences
33
34 -- Prelude.map (\x -> stem French x) $ cleanText "Les hirondelles s envolent dans les cieux."
-- | Replace word-joining punctuation by a blank.
--
-- An apostrophe or a slash becomes a space; every other character is
-- returned untouched. Tab and newline are deliberately not handled here
-- (those cases were left disabled by the author).
repl :: Char -> Char
repl c = case c of
  '\'' -> ' '
  '/'  -> ' '
  _    -> c
42
-- | Split a text into lower-cased words made of letters only.
--
-- Every character is lower-cased and then passed through 'repl'. Characters
-- that are neither alphabetic nor whitespace are dropped, and 'words' splits
-- what remains on whitespace.
--
-- The author noted a problem with apostrophes ("pb avec \'"); an earlier
-- variant that only filtered out punctuation was abandoned in favour of
-- this pipeline.
cleanText :: String -> [String]
cleanText = words . filter keep . map (repl . toLower)
  where
    -- Letters and whitespace survive; digits and punctuation do not.
    keep c = isAlpha c || isSpace c
47
-- | 'True' when @word@ belongs to the hard-coded \"miam\" word set
-- (currently @salut@ and @phrase@). The comparison is exact and
-- case-sensitive.
isMiamWord :: String -> Bool
isMiamWord word = word `member` miamWords
  where
    miamWords = fromList ["salut", "phrase"]
51
-- | 'True' when @word@ belongs to the hard-coded stop-word set
-- (@de@, @la@, @une@, @avec@). The comparison is exact and case-sensitive.
isStopWord :: String -> Bool
isStopWord word = word `member` stopWords
  where
    stopWords = fromList ["de", "la", "une", "avec"]
55
-- | Small demo of the word pipeline on a fixed French sentence.
--
-- The sample is tokenised with 'cleanText'. The first line printed is the
-- pair (non-stop-words, stop-words) produced by 'partition'; the second is
-- the list of non-stop-words alone.
wordsMain :: IO ()
wordsMain = do
  print (partition keep tokens)
  print (filter keep tokens)
  where
    sample = "Salut, ceci est une phrase \n\n avec de la ponctuation !"
    tokens = cleanText sample
    -- A token is kept when it is not a stop word.
    keep   = not . isStopWord