{-|
Module      : Gargantext.Text.Terms.Stop
Description : Mono Terms module
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

Stop words and (how to learn them).

Main type here is String.

-}

{-# LANGUAGE NoImplicitPrelude #-}

module Gargantext.Text.Terms.Stop
  where

import GHC.Base (Functor)
import Numeric.Probability.Distribution ((??))
import qualified Numeric.Probability.Distribution as D

import Data.Char (toLower)
import qualified Data.List as DL

import Data.Maybe (maybe)
import Data.Map.Strict (Map, toList)
import qualified Data.Map.Strict as DM

import Data.String (String)

import Data.Text (pack, unpack)
import Data.Tuple.Extra (both)

import Gargantext.Prelude
import Gargantext.Core (Lang(..), allLangs)
import Gargantext.Text.Terms.Mono (words)
import Gargantext.Text.Metrics.Count (occurrencesWith)

import qualified Gargantext.Text.Samples.FR as FR
import qualified Gargantext.Text.Samples.EN as EN
--import qualified Gargantext.Text.Samples.DE as DE
--import qualified Gargantext.Text.Samples.SP as SP
--import qualified Gargantext.Text.Samples.CH as CH

------------------------------------------------------------------------
data Candidate = Candidate { stop   :: Double
                           , noStop :: Double -- field name assumed; probability w.r.t. the candidate (non-stop) distribution, see 'candidate' below
                           } deriving (Show)

-- * String preparation

blanks :: String -> String
blanks [] = []
blanks xs = [' '] <> xs <> [' ']
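
-- Quick sanity check (doctest-style; the empty case is the one filled in
-- above):
-- >>> blanks "abc"
-- " abc "
-- >>> blanks ""
-- ""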

-- | Blocks increase the size of the word to ease computations;
-- some border and unexpected effects can happen and need to be tested.
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat
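
-- For instance (doctest-style):
-- >>> blockOf 3 "ab"
-- "ababab"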

-- | Chunks is the same function as splitBy in Context but for Strings,
-- not Text (without pack and unpack operations that are not needed).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))
                       . chunkAlong (n+1) 1 -- sliding windows of length (n+1), step 1 (chunkAlong from Gargantext.Prelude)
                       . DL.concat
                       . DL.take 1000
                       . DL.repeat
                       . blanks
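
-- Illustration (doctest-style, assuming the pipeline as completed above):
-- with n = 1 we slide 2-grams over the blank-padded word, dropping
-- all-blank windows:
-- >>> chunks 1 3 "ab"
-- [" a","ab","b "]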

allChunks :: [Int] -> Int -> String -> [String]
allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns

allChunks' :: [Int] -> Int -> String -> [[String]]
allChunks' ns m st = map (\n -> chunks n m st) ns
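
-- allChunks flattens what allChunks' keeps grouped per chunk size
-- (doctest-style, under the same assumptions as above):
-- >>> allChunks' [0,1] 2 "ab"
-- [["a","b"],[" a","ab"]]
-- >>> allChunks [0,1] 2 "ab"
-- ["a","b"," a","ab"]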

------------------------------------------------------------------------
-- * Analyze candidate

-- The aliases below are filled in from their uses in this module
-- (chunk frequencies are Int counts, a Word is a String):
type StringSize = Int
type TotalFreq  = Int
type Freq       = Int
type Word       = String

data LangWord = LangWord Lang Word

type LangProba = Map Lang Double

------------------------------------------------------------------------
detectLangs :: String -> [(Lang, Double)]
detectLangs s = DL.reverse $ DL.sortOn snd
                           $ toList
                           $ detect (wordsToBook [0..2] s) eventLang
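
-- Illustrative usage (not a fixed doctest: scores depend on the language
-- samples, and eventLang requires a textSample for every Lang in allLangs):
-- >>> map fst $ detectLangs "hello world, this is an english sentence"
-- [EN,FR]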

part :: (Eq p, Fractional p) => p -> p -> p
part 0 _ = 0
part _ 0 = 0
part x y = x / y
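
-- With the zero cases above, part is a division that is total on zero
-- inputs (doctest-style):
-- >>> part 1 2
-- 0.5
-- >>> part 2 0
-- 0.0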

toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
           t (a, b) -> t (a, b)
toProba xs = map (\(a,b) -> (a, part b total)) xs
  where
    total = sum $ map snd xs
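
-- toProba normalises the second components so that they sum to 1
-- (doctest-style):
-- >>> toProba [("a",1.0),("b",3.0)]
-- [("a",0.25),("b",0.75)]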

textSample :: Lang -> String
textSample EN = EN.textSample
textSample FR = FR.textSample
--textSample DE = DE.textSample
--textSample SP = SP.textSample
--textSample CH = CH.textSample

langWord :: Lang -> LangWord
langWord l = LangWord l (textSample l)

eventLang :: EventLang
eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]

-- | Score each language: for every chunk of the input book, weight that
-- chunk's language priors by its frequency, then sum per language.
detect :: EventBook -> EventLang -> LangProba
detect (EventBook mapFreq _) el = DM.unionsWith (+)
  $ map DM.fromList
  $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
  $ filter (\x -> fst x /= " ")
  $ DM.toList mapFreq

------------------------------------------------------------------------
type EventLang = Map Lang EventBook

toEventLangs :: [Int] -> [LangWord] -> EventLang
toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)

emptyEventLang :: [Int] -> EventLang
emptyEventLang ns = toLang ns (LangWord FR "")

toLang :: [Int] -> LangWord -> EventLang
toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]

opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
opLang f = DM.unionWith (op f)

------------------------------------------------------------------------
-- | TODO: monoids (but proba >= 0)

peb :: String -> EventBook -> Double
peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
  where
    a = maybe 0 identity $ DM.lookup st mapFreq
    b = maybe 1 identity $ DM.lookup (length st) mapN
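
-- peb is the empirical probability of the chunk st among the recorded
-- chunks of the same size, e.g. (doctest-style; counts follow from the
-- chunks pipeline as completed above, using wordsToBook defined below):
-- >>> peb "ab" (wordsToBook [0..2] "ab")
-- 0.3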

peb' :: String -> EventBook -> (Freq, TotalFreq)
peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
  where
    a = maybe 0 identity $ DM.lookup st mapFreq
    b = maybe 1 identity $ DM.lookup (length st) mapN

------------------------------------------------------------------------
toPrior :: String -> EventLang -> [(Lang, Double)]
toPrior s el = prior $ pebLang s el

pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList

------------------------------------------------------------------------
-- | Weight each language's conditional frequency (a/b) by a sharply
-- peaked prior on the language's share of the chunk's occurrences:
-- (a / sum of a's)^99.
prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y)
                            (map (\(a,_) -> part a (sum $ map fst ps')) ps')
                            (map (\(a,b) -> a / b) ps')
  where
    (ls, ps'') = DL.unzip ps
    ps' = map (both fromIntegral) ps''

------------------------------------------------------------------------
data EventBook = EventBook { events_freq :: Map String     Freq
                           , events_n    :: Map StringSize TotalFreq
                           } deriving (Show)

emptyEventBook :: [Int] -> EventBook
emptyEventBook ns = wordToBook ns " "

wordsToBook :: [Int] -> String -> EventBook
wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
  where
    ws = map unpack $ words $ pack txt
    eventsBook = map (wordToBook ns) ws

wordToBook :: [Int] -> Word -> EventBook
wordToBook ns txt = EventBook ef en
  where
    chks = allChunks' ns 10 txt
    en   = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
    ef   = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
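
-- A small worked example (illustrative; counts follow from the chunks
-- pipeline as completed above): per requested size n, events_n records how
-- many chunks were kept (at most 10 here):
-- >>> events_n (wordToBook [0,1] "ab")
-- fromList [(0,10),(1,10)]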

op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
op f (EventBook ef1 en1)
     (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
                                     (DM.unionWith f en1 en2)

------------------------------------------------------------------------
------------------------------------------------------------------------
-- * Make the distributions
makeDist :: [String] -> D.T Double String
makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)

stopDist :: D.T Double String
stopDist = makeDist stopList

candDist :: D.T Double String
candDist = makeDist candList

------------------------------------------------------------------------
sumProba :: Num a => D.T a String -> [Char] -> a
sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x

-- | Get the probability of an event according to a distribution
(~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
(~?) ds x = (==x) ?? ds
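
-- Under a uniform distribution every event has equal mass (doctest-style;
-- D.uniform is Numeric.Probability.Distribution's uniform constructor):
-- >>> (D.uniform ["a","b","c","d"] :: D.T Double String) ~? "a"
-- 0.25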

------------------------------------------------------------------------
candidate :: [Char] -> Candidate
candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
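
-- Intended usage (using the field names as given above): compare the two
-- probabilities to decide whether a word behaves like a stop word; one
-- would expect, e.g., stop (candidate "while") > noStop (candidate "while"),
-- though the exact values depend on stopDist and candDist.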

------------------------------------------------------------------------
candList :: [String]
candList = [ "france", "alexandre", "mael", "constitution"
           , "etats-unis", "associes", "car", "train", "spam"]

stopList :: [String]
stopList = map show ([0..9]::[Int]) <> [
    "a","a's","able","about","above","apply","according","accordingly",
    "across","actually","after","afterwards","again","against",
    "ain't","all","allow","allows","almost","alone","along",
    "involves", "already","also","although","always","am","among","amongst",
    "an","and","another","any","anybody","anyhow","anyone","anything",
    "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
    "are","aren't","around","as","aside","ask","asking","associated","at",
    "available","away","awfully","based", "b","be","became","because","become",
    "becomes","becoming","been","before","beforehand","behind","being",
    "believe","below","beside","besides","best","better","between","beyond",
    "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
    "cant","cause","causes","certain","certainly","changes","clearly","co",
    "com","come","comes","common","concerning","consequently","consider","considering",
    "contain","containing","contains","corresponding","could","couldn't","course",
    "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
    "do","does","doesn't","doing","don't","done","down","downwards","during","e",
    "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
    "especially","et","etc","even","ever","every","everybody","everyone",
    "everything","everywhere","ex","exactly","example","except","f","far",
    "few","find","fifth","first","five","followed","following","follows","for",
    "former","formerly","forth","four","from","further","furthermore","g",
    "get","gets","getting","given","gives","go","goes","going","gone","got",
    "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
    "have","haven't","having","he","he's","hello","help","hence","her","here",
    "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
    "him","himself","his","hither","hopefully","how","howbeit","however","i",
    "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
    "inc","indeed","indicate","indicated","indicates","inner","insofar",
    "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
    "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
    "last","lately","later","latter","latterly","least","less","lest","let",
    "let's","like","liked","likely","little","look","looking","looks","ltd",
    "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
    "more","moreover","most","mostly","much","must","my","myself","n",
    "name","namely","nd","near","nearly","necessary","need","needs","neither",
    "never","nevertheless","new","next","nine","no","nobody","non","none",
    "noone","nor","normally","not","nothing","novel","now","nowhere","o",
    "obviously","of","off","often","oh","ok","okay","old","on","once","one",
    "ones","only","onto","or","other","others","otherwise","ought","our",
    "ours","ourselves","out","outside","over","overall","own","p","particular",
    "particularly","per","perhaps","placed","please","plus","possible",
    "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
    "rd","re","really","reasonably","regarding","regardless","regards",
    "relatively","respectively","right","s","said","same","saw","say",
    "saying","says","second","secondly","see","seeing","seem","seemed",
    "seeming","seems","seen","self","selves","sensible","sent","serious",
    "seriously","seven","several","shall","she","should","shouldn't","since",
    "six","so","some","somebody","somehow","someone","something","sometime",
    "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
    "specifying","still","sub","such","sup","sure","t","t's","take","taken",
    "tell","tends","th","than","thank","thanks","thanx","that","that's",
    "thats","the","their","theirs","them","themselves","then","thence","there",
    "there's","thereafter","thereby","therefore","therein","theres",
    "thereupon","these","they","they'd","they'll","they're","they've",
    "think","third","this","thorough","thoroughly","those","though","three",
    "through","throughout","thru","thus","to","together","too","took","toward",
    "towards","tried","tries","truly","try","trying","twice","two","u","un",
    "under","unfortunately","unless","unlikely","until","unto","up","upon",
    "us","use","used","useful","uses","using","usually","uucp","v","value",
    "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
    "we","we'd","we'll","we're","we've","welcome","well","went","were",
    "weren't","what","what's","whatever","when","whence","whenever","where",
    "where's","whereafter","whereas","whereby","wherein","whereupon",
    "wherever","whether","which","while","whither","who","who's","whoever",
    "whole","whom","whose","why","will","willing","wish","with","within",
    "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
    "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
    "z","zero"]