2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Stop words and (how to learn it).
12 Main type here is String.
16 {-# LANGUAGE NoImplicitPrelude #-}
18 module Gargantext.Text.Terms.Stop
21 import GHC.Base (Functor)
22 import Numeric.Probability.Distribution ((??))
23 import qualified Numeric.Probability.Distribution as D
25 import Data.Char (toLower)
26 import qualified Data.List as DL
28 import Data.Maybe (maybe)
29 import Data.Map.Strict (Map, toList)
30 import qualified Data.Map.Strict as DM
32 import Data.String (String)
34 import Data.Text (pack, unpack)
35 import Data.Tuple.Extra (both)
37 import Gargantext.Prelude
38 import Gargantext.Core (Lang(..), allLangs)
39 import Gargantext.Text.Terms.Mono (words)
40 import Gargantext.Text.Metrics.Count (occurrencesWith)
42 import qualified Gargantext.Text.Samples.FR as FR
43 import qualified Gargantext.Text.Samples.EN as EN
44 --import qualified Gargantext.Text.Samples.DE as DE
45 --import qualified Gargantext.Text.Samples.SP as SP
46 --import qualified Gargantext.Text.Samples.CH as CH
48 ------------------------------------------------------------------------
49 data Candidate = Candidate { stop :: Double
53 -- * String preparation
56 blanks :: String -> String
58 blanks xs = [' '] <> xs <> [' ']
60 -- | Blocks increase the size of the word to ease computations
61 -- some border and unexepected effects can happen, need to be tested
62 blockOf :: Int -> String -> String
63 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
65 -- | Chunks is the same function as splitBy in Context but for Strings,
66 -- not Text (without pack and unpack operations that are not needed).
67 chunks :: Int -> Int -> String -> [String]
68 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
70 allChunks :: [Int] -> Int -> String -> [String]
71 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
73 allChunks' :: [Int] -> Int -> String -> [[String]]
74 allChunks' ns m st = map (\n -> chunks n m st) ns
76 ------------------------------------------------------------------------
77 -- * Analyze candidate
83 data LangWord = LangWord Lang Word
85 type LangProba = Map Lang Double
87 ------------------------------------------------------------------------
88 detectLangs :: String -> [(Lang, Double)]
89 detectLangs s = DL.reverse $ DL.sortOn snd
91 $ detect (wordsToBook [0..2] s) eventLang
93 part :: (Eq p, Fractional p) => p -> p -> p
98 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
100 toProba xs = map (\(a,b) -> (a, part b total)) xs
102 total = sum $ map snd xs
104 textSample :: Lang -> String
105 textSample EN = EN.textSample
106 textSample FR = FR.textSample
107 --textSample DE = DE.textSample
108 --textSample SP = SP.textSample
109 --textSample CH = CH.textSample
111 langWord :: Lang -> LangWord
112 langWord l = LangWord l (textSample l)
114 eventLang :: EventLang
115 eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
117 detect :: EventBook -> EventLang -> LangProba
118 detect (EventBook mapFreq _) el =
121 $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
122 $ filter (\x -> fst x /= " ")
125 ------------------------------------------------------------------------
127 type EventLang = Map Lang EventBook
129 toEventLangs :: [Int] -> [LangWord] -> EventLang
130 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
132 emptyEventLang :: [Int] -> EventLang
133 emptyEventLang ns = toLang ns (LangWord FR "")
135 toLang :: [Int] -> LangWord -> EventLang
136 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
138 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
139 opLang f = DM.unionWith (op f)
141 ------------------------------------------------------------------------
142 -- | TODO: monoids (but proba >= 0)
144 peb :: String -> EventBook -> Double
145 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
147 a = maybe 0 identity $ DM.lookup st mapFreq
148 b = maybe 1 identity $ DM.lookup (length st) mapN
150 peb' :: String -> EventBook -> (Freq, TotalFreq)
151 peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
153 a = maybe 0 identity $ DM.lookup st mapFreq
154 b = maybe 1 identity $ DM.lookup (length st) mapN
156 ------------------------------------------------------------------------
157 toPrior :: String -> EventLang -> [(Lang, Double)]
158 toPrior s el = prior $ pebLang s el
160 pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
161 pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList
163 ------------------------------------------------------------------------
164 prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
165 prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
166 (map (\(a,b) -> a / b) ps')
169 (ls, ps'') = DL.unzip ps
170 ps' = map (both fromIntegral) ps''
171 ------------------------------------------------------------------------
172 data EventBook = EventBook { events_freq :: Map String Freq
173 , events_n :: Map StringSize TotalFreq
177 emptyEventBook :: [Int] -> EventBook
178 emptyEventBook ns = wordToBook ns " "
180 wordsToBook :: [Int] -> String -> EventBook
181 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
183 ws = map unpack $ words $ pack txt
184 eventsBook = map (wordToBook ns) ws
186 wordToBook :: [Int] -> Word -> EventBook
187 wordToBook ns txt = EventBook ef en
189 chks = allChunks' ns 10 txt
190 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
191 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
193 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
194 op f (EventBook ef1 en1)
195 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
196 (DM.unionWith f en1 en2)
199 ------------------------------------------------------------------------
200 ------------------------------------------------------------------------
201 -- * Make the distributions
202 makeDist :: [String] -> D.T Double String
203 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
205 stopDist :: D.T Double String
206 stopDist = makeDist stopList
208 candDist :: D.T Double String
209 candDist = makeDist candList
211 ------------------------------------------------------------------------
212 sumProba :: Num a => D.T a String -> [Char] -> a
213 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
215 -- | Get probability according a distribution
216 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
217 (~?) ds x = (==x) ?? ds
219 ------------------------------------------------------------------------
220 candidate :: [Char] -> Candidate
221 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
223 ------------------------------------------------------------------------
225 candList = [ "france", "alexandre", "mael", "constitution"
226 , "etats-unis", "associes", "car", "train", "spam"]
230 stopList = map show ([0..9]::[Int]) <> [
231 "a","a's","able","about","above","apply","according","accordingly",
232 "across","actually","after","afterwards","again","against",
233 "ain't","all","allow","allows","almost","alone","along",
234 "involves", "already","also","although","always","am","among","amongst",
235 "an","and","another","any","anybody","anyhow","anyone","anything",
236 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
237 "are","aren't","around","as","aside","ask","asking","associated","at",
238 "available","away","awfully","based", "b","be","became","because","become",
239 "becomes","becoming","been","before","beforehand","behind","being",
240 "believe","below","beside","besides","best","better","between","beyond",
241 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
242 "cant","cause","causes","certain","certainly","changes","clearly","co",
243 "com","come","comes","common","concerning","consequently","consider","considering",
244 "contain","containing","contains","corresponding","could","couldn't","course",
245 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
246 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
247 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
248 "especially","et","etc","even","ever","every","everybody","everyone",
249 "everything","everywhere","ex","exactly","example","except","f","far",
250 "few","find","fifth","first","five","followed","following","follows","for",
251 "former","formerly","forth","four","from","further","furthermore","g",
252 "get","gets","getting","given","gives","go","goes","going","gone","got",
253 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
254 "have","haven't","having","he","he's","hello","help","hence","her","here",
255 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
256 "him","himself","his","hither","hopefully","how","howbeit","however","i",
257 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
258 "inc","indeed","indicate","indicated","indicates","inner","insofar",
259 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
260 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
261 "last","lately","later","latter","latterly","least","less","lest","let",
262 "let's","like","liked","likely","little","look","looking","looks","ltd",
263 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
264 "more","moreover","most","mostly","much","must","my","myself","n",
265 "name","namely","nd","near","nearly","necessary","need","needs","neither",
266 "never","nevertheless","new","next","nine","no","nobody","non","none",
267 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
268 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
269 "ones","only","onto","or","other","others","otherwise","ought","our",
270 "ours","ourselves","out","outside","over","overall","own","p","particular",
271 "particularly","per","perhaps","placed","please","plus","possible",
272 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
273 "rd","re","really","reasonably","regarding","regardless","regards",
274 "relatively","respectively","right","s","said","same","saw","say",
275 "saying","says","second","secondly","see","seeing","seem","seemed",
276 "seeming","seems","seen","self","selves","sensible","sent","serious",
277 "seriously","seven","several","shall","she","should","shouldn't","since",
278 "six","so","some","somebody","somehow","someone","something","sometime",
279 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
280 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
281 "tell","tends","th","than","thank","thanks","thanx","that","that's",
282 "thats","the","their","theirs","them","themselves","then","thence","there",
283 "there's","thereafter","thereby","therefore","therein","theres",
284 "thereupon","these","they","they'd","they'll","they're","they've",
285 "think","third","this","thorough","thoroughly","those","though","three",
286 "through","throughout","thru","thus","to","together","too","took","toward",
287 "towards","tried","tries","truly","try","trying","twice","two","u","un",
288 "under","unfortunately","unless","unlikely","until","unto","up","upon",
289 "us","use","used","useful","uses","using","usually","uucp","v","value",
290 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
291 "we","we'd","we'll","we're","we've","welcome","well","went","were",
292 "weren't","what","what's","whatever","when","whence","whenever","where",
293 "where's","whereafter","whereas","whereby","wherein","whereupon",
294 "wherever","whether","which","while","whither","who","who's","whoever",
295 "whole","whom","whose","why","will","willing","wish","with","within",
296 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
297 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",