{-|
Module      : Gargantext.Text.Terms.Stop
Description : Stop words module
Copyright   : (c) CNRS, 2017 - present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

Stop words and (how to learn them).

Main type here is String.
-}
{-# LANGUAGE NoImplicitPrelude #-}

module Gargantext.Text.Terms.Stop (detectLang, detectLangs, stopList)
  where

import GHC.Base (Functor)
import Numeric.Probability.Distribution ((??))
import qualified Numeric.Probability.Distribution as D

import Data.Char (toLower)
import qualified Data.List as DL

import Data.Maybe (maybe)
import Data.Map.Strict (Map, toList)
import qualified Data.Map.Strict as DM

import Data.String (String)

import Data.Text (Text)
import Data.Text (pack, unpack)
import Data.Tuple.Extra (both)

import Gargantext.Prelude
import Gargantext.Core (Lang(..), allLangs)
import Gargantext.Text.Terms.Mono (words)
import Gargantext.Text.Metrics.Count (occurrencesWith)

import qualified Gargantext.Text.Samples.FR as FR
import qualified Gargantext.Text.Samples.EN as EN
--import qualified Gargantext.Text.Samples.DE as DE
--import qualified Gargantext.Text.Samples.SP as SP
--import qualified Gargantext.Text.Samples.CH as CH
------------------------------------------------------------------------
data Candidate = Candidate { stop   :: Double
                           , noStop :: Double -- second field name is an assumption; it holds the candidate-list score built by 'candidate'
                           }
-- * String preparation
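-- | Surround a string with a single space on each side; for instance,
--   blanks "abc" == " abc ".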
blanks :: String -> String
blanks xs = [' '] <> xs <> [' ']

-- | Blocks increase the size of the word to ease computations;
-- some border and unexpected effects can happen, so this needs to be tested.
blockOf :: Int -> String -> String
blockOf n = DL.concat . DL.take n . DL.repeat
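-- For instance, blockOf 3 "ab" == "ababab".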

-- | Chunks is the same function as splitBy in Context but for Strings,
-- not Text (avoiding pack and unpack operations that are not needed here).
chunks :: Int -> Int -> String -> [String]
chunks n m = DL.take m . filter (not . all (== ' '))

allChunks :: [Int] -> Int -> String -> [String]
allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns

allChunks' :: [Int] -> Int -> String -> [[String]]
allChunks' ns m st = map (\n -> chunks n m st) ns

------------------------------------------------------------------------
-- * Analyze candidate

data LangWord = LangWord Lang Word

type LangProba = Map Lang Double

------------------------------------------------------------------------
detectLang :: Text -> Maybe Lang
detectLang = head . map fst . detectLangs

detectLangs :: Text -> [(Lang, Double)]
detectLangs = detectLangs' . unpack
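-- Usage sketch (the outcome depends on the language samples wired in below): on a
-- clearly French sentence, detectLangs "Le petit chat est mort" should rank FR first,
-- so detectLang "Le petit chat est mort" should return Just FR.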

detectLangs' :: String -> [(Lang, Double)]
detectLangs' s = DL.reverse $ DL.sortOn snd
                            $ toList
                            $ detect (wordsToBook [0..2] s) eventLang

part :: (Eq p, Fractional p) => p -> p -> p
part x y = if y == 0 then 0 else x / y   -- guarding against a zero total is an assumption

toProba :: (Eq b, Fractional b, Functor t, Foldable t)
        => t (a, b) -> t (a, b)
toProba xs = map (\(a,b) -> (a, part b total)) xs
  where
    total = sum $ map snd xs
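-- Worked example, assuming part x y is the plain ratio x / y for a non-zero total
-- (as sketched above):
--   toProba [("a", 1), ("b", 3)] == [("a", 0.25), ("b", 0.75)]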

textSample :: Lang -> String
textSample EN = EN.textSample
textSample FR = FR.textSample
--textSample DE = DE.textSample
--textSample SP = SP.textSample
--textSample CH = CH.textSample

langWord :: Lang -> LangWord
langWord l = LangWord l (textSample l)

eventLang :: EventLang
eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]

-- | Sum, for each language, the chunk priors weighted by the chunk frequencies.
detect :: EventBook -> EventLang -> LangProba
detect (EventBook mapFreq _) el = DM.fromListWith (+)
  $ DL.concat
  $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
  $ filter (\x -> fst x /= " ")
  $ DM.toList mapFreq

------------------------------------------------------------------------

type EventLang = Map Lang EventBook

toEventLangs :: [Int] -> [LangWord] -> EventLang
toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)

emptyEventLang :: [Int] -> EventLang
emptyEventLang ns = toLang ns (LangWord FR "")

toLang :: [Int] -> LangWord -> EventLang
toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]

opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
opLang f = DM.unionWith (op f)

------------------------------------------------------------------------
-- | TODO: monoids (but proba >= 0)

peb :: String -> EventBook -> Double
peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
  where
    a = maybe 0 identity $ DM.lookup st mapFreq
    b = maybe 1 identity $ DM.lookup (length st) mapN
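-- Worked example with made-up counts: if events_freq maps the chunk "th" to 12 and
-- events_n maps (length "th") to 300, then peb "th" book == 12 / 300 == 0.04.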

peb' :: String -> EventBook -> (Freq, TotalFreq)
peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
  where
    a = maybe 0 identity $ DM.lookup st mapFreq
    b = maybe 1 identity $ DM.lookup (length st) mapN

------------------------------------------------------------------------
toPrior :: String -> EventLang -> [(Lang, Double)]
toPrior s el = prior $ pebLang s el

pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList
------------------------------------------------------------------------
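-- | Combine, for one chunk, its per-language (frequency, total) counts into a score
--   per language: the first factor is the language's share of the chunk's total
--   frequency across languages (raised to a high power to sharpen the contrast),
--   the second is the chunk's relative frequency within that language's sample.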
prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y)
                            (map (\(a,_) -> part a (sum $ map fst ps')) ps')
                            (map (\(a,b) -> a / b) ps')
  where
    (ls, ps'') = DL.unzip ps
    ps'        = map (both fromIntegral) ps''
------------------------------------------------------------------------
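-- The synonyms below are assumptions inferred from how wordToBook fills an
-- EventBook: both counters are plain Ints.
type Freq       = Int
type TotalFreq  = Int
type StringSize = Int

-- | An EventBook records, for a piece of text, how often each chunk occurs
--   (events_freq) and how many chunks were produced for each chunk size
--   (events_n).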
data EventBook = EventBook { events_freq :: Map String     Freq
                           , events_n    :: Map StringSize TotalFreq
                           }

emptyEventBook :: [Int] -> EventBook
emptyEventBook ns = wordToBook ns " "

wordsToBook :: [Int] -> String -> EventBook
wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
  where
    ws         = map unpack $ words $ pack txt
    eventsBook = map (wordToBook ns) ws

wordToBook :: [Int] -> Word -> EventBook
wordToBook ns txt = EventBook ef en
  where
    chks = allChunks' ns 10 txt
    -- how many chunks were produced for each chunk parameter n
    en   = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
    -- how often each chunk occurs
    ef   = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks

op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
op f (EventBook ef1 en1)
     (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
                                     (DM.unionWith f en1 en2)

------------------------------------------------------------------------
------------------------------------------------------------------------
-- * Make the distributions
makeDist :: [String] -> D.T Double String
makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)

stopDist :: D.T Double String
stopDist = makeDist stopList

candDist :: D.T Double String
candDist = makeDist candList

------------------------------------------------------------------------
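-- | Probability mass, under the given distribution, of all the chunks of a
--   (lower-cased) word, using the same chunk sizes as makeDist.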
sumProba :: Num a => D.T a String -> [Char] -> a
sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x

-- | Get the probability of an event according to a distribution
(~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
(~?) ds x = (==x) ?? ds
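-- For instance, with a uniform distribution over four equally weighted items:
--   D.uniform ["a","b","c","a"] ~? "a" == 0.5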

------------------------------------------------------------------------
candidate :: [Char] -> Candidate
candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
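-- Usage sketch: compare the two scores of (candidate "the"); a token whose stop-list
-- score dominates its candidate-list score is more likely to be a stop word.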

------------------------------------------------------------------------
candList :: [String]
candList = [ "france", "alexandre", "mael", "constitution"
           , "etats-unis", "associes", "car", "train", "spam"]

stopList :: [String]
stopList = map show ([0..9]::[Int]) <> [
241 "a","a's","able","about","above","apply","according","accordingly",
242 "across","actually","after","afterwards","again","against",
243 "ain't","all","allow","allows","almost","alone","along",
244 "involves", "already","also","although","always","am","among","amongst",
245 "an","and","another","any","anybody","anyhow","anyone","anything",
246 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
247 "are","aren't","around","as","aside","ask","asking","associated","at",
248 "available","away","awfully","based", "b","be","became","because","become",
249 "becomes","becoming","been","before","beforehand","behind","being",
250 "believe","below","beside","besides","best","better","between","beyond",
251 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
252 "cant","cause","causes","certain","certainly","changes","clearly","co",
253 "com","come","comes","common","concerning","consequently","consider","considering",
254 "contain","containing","contains","corresponding","could","couldn't","course",
255 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
256 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
257 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
258 "especially","et","etc","even","ever","every","everybody","everyone",
259 "everything","everywhere","ex","exactly","example","except","f","far",
260 "few","find","fifth","first","five","followed","following","follows","for",
261 "former","formerly","forth","four","from","further","furthermore","g",
262 "get","gets","getting","given","gives","go","goes","going","gone","got",
263 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
264 "have","haven't","having","he","he's","hello","help","hence","her","here",
265 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
266 "him","himself","his","hither","hopefully","how","howbeit","however","i",
267 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
268 "inc","indeed","indicate","indicated","indicates","inner","insofar",
269 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
270 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
271 "last","lately","later","latter","latterly","least","less","lest","let",
272 "let's","like","liked","likely","little","look","looking","looks","ltd",
273 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
274 "more","moreover","most","mostly","much","must","my","myself","n",
275 "name","namely","nd","near","nearly","necessary","need","needs","neither",
276 "never","nevertheless","new","next","nine","no","nobody","non","none",
277 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
278 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
279 "ones","only","onto","or","other","others","otherwise","ought","our",
280 "ours","ourselves","out","outside","over","overall","own","p","particular",
281 "particularly","per","perhaps","placed","please","plus","possible",
282 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
283 "rd","re","really","reasonably","regarding","regardless","regards",
284 "relatively","respectively","right","s","said","same","saw","say",
285 "saying","says","second","secondly","see","seeing","seem","seemed",
286 "seeming","seems","seen","self","selves","sensible","sent","serious",
287 "seriously","seven","several","shall","she","should","shouldn't","since",
288 "six","so","some","somebody","somehow","someone","something","sometime",
289 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
290 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
291 "tell","tends","th","than","thank","thanks","thanx","that","that's",
292 "thats","the","their","theirs","them","themselves","then","thence","there",
293 "there's","thereafter","thereby","therefore","therein","theres",
294 "thereupon","these","they","they'd","they'll","they're","they've",
295 "think","third","this","thorough","thoroughly","those","though","three",
296 "through","throughout","thru","thus","to","together","too","took","toward",
297 "towards","tried","tries","truly","try","trying","twice","two","u","un",
298 "under","unfortunately","unless","unlikely","until","unto","up","upon",
299 "us","use","used","useful","uses","using","usually","uucp","v","value",
300 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
301 "we","we'd","we'll","we're","we've","welcome","well","went","were",
302 "weren't","what","what's","whatever","when","whence","whenever","where",
303 "where's","whereafter","whereas","whereby","wherein","whereupon",
304 "wherever","whether","which","while","whither","who","who's","whoever",
305 "whole","whom","whose","why","will","willing","wish","with","within",
306 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
307 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",