2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Stop words and (how to learn it).
12 Main type here is String.
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings #-}
19 module Gargantext.Text.Terms.Stop -- (detectLang, detectLangs, stopList)
22 import GHC.Base (Functor)
23 import Numeric.Probability.Distribution ((??))
24 import qualified Numeric.Probability.Distribution as D
26 import Data.Char (toLower)
27 import qualified Data.List as DL
29 import Data.Maybe (maybe)
30 import Data.Map.Strict (Map, toList)
31 import qualified Data.Map.Strict as DM
33 import Data.String (String)
35 import Data.Text (Text)
36 import Data.Text (pack, unpack)
37 import Data.Tuple.Extra (both)
39 import Gargantext.Prelude
40 import Gargantext.Core (Lang(..), allLangs)
41 import Gargantext.Text.Terms.Mono (words)
42 import Gargantext.Text.Metrics.Count (occurrencesWith)
44 import qualified Gargantext.Text.Samples.FR as FR
45 import qualified Gargantext.Text.Samples.EN as EN
46 --import qualified Gargantext.Text.Samples.DE as DE
47 --import qualified Gargantext.Text.Samples.SP as SP
48 --import qualified Gargantext.Text.Samples.CH as CH
50 ------------------------------------------------------------------------
51 data Candidate = Candidate { stop :: Double
55 -- * String preparation
58 blanks :: String -> String
60 blanks xs = [' '] <> xs <> [' ']
62 -- | Blocks increase the size of the word to ease computations
63 -- some border and unexepected effects can happen, need to be tested
64 blockOf :: Int -> String -> String
65 blockOf n = DL.concat . DL.take n . DL.repeat
67 -- | Chunks is the same function as splitBy in Context but for Strings,
68 -- not Text (without pack and unpack operations that are not needed).
69 chunks :: Int -> Int -> String -> [String]
70 chunks n m = DL.take m . filter (not . all (== ' '))
77 allChunks :: [Int] -> Int -> String -> [String]
78 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
80 allChunks' :: [Int] -> Int -> String -> [[String]]
81 allChunks' ns m st = map (\n -> chunks n m st) ns
83 ------------------------------------------------------------------------
84 -- * Analyze candidate
90 data LangWord = LangWord Lang Word
92 type LangProba = Map Lang Double
94 ------------------------------------------------------------------------
95 detectLang :: Text -> Maybe Lang
96 detectLang = head . map fst . detectLangs
98 detectLangs :: Text -> [(Lang, Double)]
99 detectLangs = detectLangs' . unpack
101 detectLangs' :: String -> [(Lang, Double)]
102 detectLangs' s = DL.reverse $ DL.sortOn snd
104 $ detect (wordsToBook [0..2] s) eventLang
106 part :: (Eq p, Fractional p) => p -> p -> p
111 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
113 toProba xs = map (\(a,b) -> (a, part b total)) xs
115 total = sum $ map snd xs
117 textSample :: Lang -> String
118 textSample EN = EN.textSample
119 textSample FR = FR.textSample
120 --textSample DE = DE.textSample
121 --textSample SP = SP.textSample
122 --textSample CH = CH.textSample
124 langWord :: Lang -> LangWord
125 langWord l = LangWord l (textSample l)
127 eventLang :: EventLang
128 eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
130 detect :: EventBook -> EventLang -> LangProba
131 detect (EventBook mapFreq _) el =
134 $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
135 $ filter (\x -> fst x /= " ")
138 ------------------------------------------------------------------------
140 type EventLang = Map Lang EventBook
142 toEventLangs :: [Int] -> [LangWord] -> EventLang
143 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
145 emptyEventLang :: [Int] -> EventLang
146 emptyEventLang ns = toLang ns (LangWord FR "")
148 toLang :: [Int] -> LangWord -> EventLang
149 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
151 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
152 opLang f = DM.unionWith (op f)
154 ------------------------------------------------------------------------
155 -- | TODO: monoids (but proba >= 0)
157 peb :: String -> EventBook -> Double
158 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
160 a = maybe 0 identity $ DM.lookup st mapFreq
161 b = maybe 1 identity $ DM.lookup (length st) mapN
163 peb' :: String -> EventBook -> (Freq, TotalFreq)
164 peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
166 a = maybe 0 identity $ DM.lookup st mapFreq
167 b = maybe 1 identity $ DM.lookup (length st) mapN
169 ------------------------------------------------------------------------
170 toPrior :: String -> EventLang -> [(Lang, Double)]
171 toPrior s el = prior $ pebLang s el
173 pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
174 pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList
175 ------------------------------------------------------------------------
176 prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
177 prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
178 (map (\(a,b) -> a / b) ps')
181 (ls, ps'') = DL.unzip ps
182 ps' = map (both fromIntegral) ps''
183 ------------------------------------------------------------------------
184 data EventBook = EventBook { events_freq :: Map String Freq
185 , events_n :: Map StringSize TotalFreq
189 emptyEventBook :: [Int] -> EventBook
190 emptyEventBook ns = wordToBook ns " "
192 wordsToBook :: [Int] -> String -> EventBook
193 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
195 ws = map unpack $ words $ pack txt
196 eventsBook = map (wordToBook ns) ws
198 wordToBook :: [Int] -> Word -> EventBook
199 wordToBook ns txt = EventBook ef en
201 chks = allChunks' ns 10 txt
202 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
203 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
205 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
206 op f (EventBook ef1 en1)
207 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
208 (DM.unionWith f en1 en2)
210 ------------------------------------------------------------------------
211 ------------------------------------------------------------------------
212 -- * Make the distributions
213 makeDist :: [String] -> D.T Double String
214 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
216 stopDist :: D.T Double String
217 stopDist = makeDist stopList
219 candDist :: D.T Double String
220 candDist = makeDist candList
222 ------------------------------------------------------------------------
223 sumProba :: Num a => D.T a String -> [Char] -> a
224 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
226 -- | Get probability according a distribution
227 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
228 (~?) ds x = (==x) ?? ds
230 ------------------------------------------------------------------------
231 candidate :: [Char] -> Candidate
232 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
234 ------------------------------------------------------------------------
236 candList = [ "france", "alexandre", "mael", "constitution"
237 , "etats-unis", "associes", "car", "train", "spam"]
241 stopList = map show ([0..9]::[Int]) <> stopListWords
243 stopListWords :: [String]
245 ["a", "a's", "able", "about", "above", "according", "accordingly"
246 , "across", "actually", "after", "afterwards", "again", "against"
247 , "ain't", "all", "allow", "allows", "almost", "alone", "along"
248 , "already", "also", "although", "always", "am", "among", "amongst", "an"
249 , "analyze", "and", "another", "any", "anybody", "anyhow", "anyone"
250 , "anything", "anyway", "anyways", "anywhere", "apart", "appear"
251 , "apply", "appreciate", "appropriate", "are", "aren't", "around"
252 , "as", "aside", "ask", "asking", "associated", "at", "available"
253 , "away", "awfully", "b", "based", "be", "became", "because", "become"
254 , "becomes", "becoming", "been", "before", "beforehand", "behind"
255 , "being", "believe", "below", "beside", "besides", "best", "better"
256 , "between", "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's"
257 , "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain"
258 , "certainly", "changes", "clearly", "co", "com", "come", "comes"
259 , "common", "concerning", "consequently", "consider", "considering"
260 , "contain", "containing", "contains", "corresponding", "could"
261 , "couldn't", "course", "currently", "d", "definitely", "described"
262 , "despite", "detecting", "detects", "did", "didn't", "different", "do"
263 , "does", "doesn't", "doing", "don't", "done", "down", "downwards"
264 , "during", "e", "each", "edu", "eg", "eight", "either", "else"
265 , "elsewhere", "enough", "entirely", "especially", "et", "etc", "even"
266 , "ever", "every", "everybody", "everyone", "everything", "everywhere"
267 , "ex", "exactly", "example", "except", "f", "far", "few", "fifth"
268 , "find", "first", "five", "followed", "following", "follows", "for"
269 , "former", "formerly", "forth", "four", "from", "further", "furthermore"
270 , "g", "get", "gets", "getting", "gif", "given", "gives", "go", "goes"
271 , "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't"
272 , "happens", "hardly", "has", "hasn't", "have", "haven't", "having"
273 , "he", "he'd", "he'll", "he's", "hello", "help", "hence", "her"
274 , "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers"
275 , "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how"
276 , "how's", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've"
277 , "identify", "ie", "if", "ignored", "immediate", "in", "inasmuch"
278 , "inc", "indeed", "indicate", "indicated", "indicates", "inner"
279 , "insofar", "instead", "into", "involves", "inward", "is", "isn't"
280 , "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k"
281 , "keep", "keeps", "kept", "know", "known", "knows", "l", "last"
282 , "late", "lately", "later", "latter", "latterly", "least", "less"
283 , "lest", "let", "let's", "like", "liked", "likely", "little", "look"
284 , "looking", "looks", "ltd", "m", "main", "mainly", "many", "may"
285 , "maybe", "me", "mean", "meanwhile", "merely", "might", "min", "more"
286 , "moreover", "most", "mostly", "much", "must", "mustn't", "my", "myself"
287 , "n", "name", "namely", "nd", "near", "nearly", "necessary", "need"
288 , "needs", "neither", "never", "nevertheless", "new", "next", "nine"
289 , "no", "nobody", "non", "none", "noone", "nor", "normally", "not"
290 , "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off"
291 , "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones"
292 , "only", "onto", "or", "other", "others", "otherwise", "ought", "our"
293 , "ours", "ourselves", "out", "outside", "over", "overall", "own", "p"
294 , "particular", "particularly", "per", "perhaps", "placed", "please"
295 , "plus", "possible", "presents", "presumably", "probably", "provides"
296 , "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really"
297 , "reasonably", "regarding", "regardless", "regards", "relatively"
298 , "respectively", "right", "s", "said", "same", "saw", "say", "saying"
299 , "says", "sds", "second", "secondly", "see", "seeing", "seem", "seemed"
300 , "seeming", "seems", "seen", "self", "selves", "sensible", "sent"
301 , "serious", "seriously", "seven", "several", "shall", "shan't"
302 , "she", "she'd", "she'll", "she's", "should", "shouldn't", "since"
303 , "six", "so", "some", "somebody", "somehow", "someone", "something"
304 , "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry"
305 , "specified", "specify", "specifying", "still", "sub", "such", "sup"
306 , "sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than"
307 , "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their"
308 , "theirs", "them", "themselves", "then", "thence", "there", "there's"
309 , "thereafter", "thereby", "therefore", "therein", "theres", "thereupon"
310 , "these", "they", "they'd", "they'll", "they're", "they've", "think"
311 , "third", "this", "thorough", "thoroughly", "those", "though", "three"
312 , "through", "throughout", "thru", "thus", "to", "together", "too"
313 , "took", "toward", "towards", "tried", "tries", "truly", "try"
314 , "trying", "twice", "two", "u", "un", "under", "unfortunately"
315 , "unless", "unlikely", "until", "unto", "up", "upon", "us", "use"
316 , "used", "useful", "uses", "using", "usually", "uucp", "v", "value"
317 , "various", "very", "via", "viz", "vs", "w", "want", "wants", "was"
318 , "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome"
319 , "well", "went", "were", "weren't", "what", "what's", "whatever", "when"
320 , "when's", "whence", "whenever", "where", "where's", "whereafter"
321 , "whereas", "whereby", "wherein", "whereupon", "wherever", "whether"
322 , "which", "while", "whither", "who", "who's", "whoever", "whole", "whom"
323 , "whose", "why", "why's", "will", "willing", "wish", "with", "within"
324 , "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes"
325 , "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours"
326 , "yourself", "yourselves", "z", "zero"]