2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Stop words and (how to learn it).
12 Main type here is String.
16 {-# LANGUAGE NoImplicitPrelude #-}
18 module Gargantext.Text.Terms.Stop
21 import Numeric.Probability.Distribution ((??))
22 import qualified Numeric.Probability.Distribution as D
24 import Data.Char (toLower)
25 import qualified Data.List as DL
27 import Data.Maybe (maybe)
28 import Data.Map.Strict (Map, toList)
29 import qualified Data.Map.Strict as DM
31 import Data.String (String)
33 import Data.Text (pack, unpack)
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Text.Terms.Mono (words)
38 import Gargantext.Text.Metrics.Count (occurrencesWith)
40 import Gargantext.Text.Samples.FR as FR
41 import Gargantext.Text.Samples.EN as EN
42 import Gargantext.Text.Samples.DE as DE
43 import Gargantext.Text.Samples.SP as SP
44 import Gargantext.Text.Samples.CH as CH
46 ------------------------------------------------------------------------
47 data Candidate = Candidate { stop :: Double
51 -- * String preparation
54 blanks :: String -> String
56 blanks xs = [' '] <> xs <> [' ']
58 -- | Blocks increase the size of the word to ease computations
59 -- some border and unexepected effects can happen, need to be tested
60 blockOf :: Int -> String -> String
61 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
63 -- | Chunks is the same function as splitBy in Context but for Strings,
64 -- not Text (without pack and unpack operations that are not needed).
65 chunks :: Int -> Int -> String -> [String]
66 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
68 allChunks :: [Int] -> Int -> String -> [String]
69 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
71 allChunks' :: [Int] -> Int -> String -> [[String]]
72 allChunks' ns m st = map (\n -> chunks n m st) ns
74 ------------------------------------------------------------------------
75 -- * Analyze candidate
81 data LangWord = LangWord Lang Word
83 type LangProba = Map Lang Double
85 ------------------------------------------------------------------------
86 detectLangs :: String -> [(Lang, Double)]
87 detectLangs s = DL.reverse $ DL.sortOn snd
89 $ detect (wordsToBook [0..2] s) testEL
92 testEL = toEventLangs [0..2] [ LangWord EN EN.textMining
93 , LangWord FR FR.textMining
94 , LangWord DE DE.textMining
95 , LangWord SP SP.textMining
96 , LangWord CH CH.textMining
99 detect :: EventBook -> EventLang -> LangProba
100 detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq
102 ------------------------------------------------------------------------
104 type EventLang = Map Lang EventBook
105 toEventLangs :: [Int] -> [LangWord] -> EventLang
106 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
108 emptyEventLang :: [Int] -> EventLang
109 emptyEventLang ns = toLang ns (LangWord FR "")
111 toLang :: [Int] -> LangWord -> EventLang
112 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
114 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
115 opLang f = DM.unionWith (op f)
117 ------------------------------------------------------------------------
118 -- | TODO: monoids (but proba >= 0)
120 peb :: String -> EventBook -> Double
121 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
123 a = maybe 0 identity $ DM.lookup st mapFreq
124 b = maybe 1 identity $ DM.lookup (length st) mapN
126 data EventBook = EventBook { events_freq :: Map String Freq
127 , events_n :: Map StringSize TotalFreq
131 emptyEventBook :: [Int] -> EventBook
132 emptyEventBook ns = wordToBook ns " "
134 wordsToBook :: [Int] -> String -> EventBook
135 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
137 ws = map unpack $ words $ pack txt
138 eventsBook = map (wordToBook ns) ws
140 wordToBook :: [Int] -> Word -> EventBook
141 wordToBook ns txt = EventBook ef en
143 chks = allChunks' ns 10 txt
144 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
145 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
147 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
148 op f (EventBook ef1 en1)
149 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
150 (DM.unionWith f en1 en2)
153 ------------------------------------------------------------------------
154 ------------------------------------------------------------------------
155 -- * Make the distributions
156 makeDist :: [String] -> D.T Double String
157 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
159 stopDist :: D.T Double String
160 stopDist = makeDist stopList
162 candDist :: D.T Double String
163 candDist = makeDist candList
165 ------------------------------------------------------------------------
166 sumProba :: Num a => D.T a String -> [Char] -> a
167 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
169 -- | Get probability according a distribution
170 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
171 (~?) ds x = (==x) ?? ds
173 ------------------------------------------------------------------------
174 candidate :: [Char] -> Candidate
175 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
177 ------------------------------------------------------------------------
179 candList = [ "france", "alexandre", "mael", "constitution"
180 , "etats-unis", "associes", "car", "train", "spam"]
184 stopList = map show ([0..9]::[Int]) <> [
185 "a","a's","able","about","above","apply","according","accordingly",
186 "across","actually","after","afterwards","again","against",
187 "ain't","all","allow","allows","almost","alone","along",
188 "involves", "already","also","although","always","am","among","amongst",
189 "an","and","another","any","anybody","anyhow","anyone","anything",
190 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
191 "are","aren't","around","as","aside","ask","asking","associated","at",
192 "available","away","awfully","based", "b","be","became","because","become",
193 "becomes","becoming","been","before","beforehand","behind","being",
194 "believe","below","beside","besides","best","better","between","beyond",
195 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
196 "cant","cause","causes","certain","certainly","changes","clearly","co",
197 "com","come","comes","common","concerning","consequently","consider","considering",
198 "contain","containing","contains","corresponding","could","couldn't","course",
199 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
200 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
201 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
202 "especially","et","etc","even","ever","every","everybody","everyone",
203 "everything","everywhere","ex","exactly","example","except","f","far",
204 "few","find","fifth","first","five","followed","following","follows","for",
205 "former","formerly","forth","four","from","further","furthermore","g",
206 "get","gets","getting","given","gives","go","goes","going","gone","got",
207 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
208 "have","haven't","having","he","he's","hello","help","hence","her","here",
209 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
210 "him","himself","his","hither","hopefully","how","howbeit","however","i",
211 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
212 "inc","indeed","indicate","indicated","indicates","inner","insofar",
213 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
214 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
215 "last","lately","later","latter","latterly","least","less","lest","let",
216 "let's","like","liked","likely","little","look","looking","looks","ltd",
217 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
218 "more","moreover","most","mostly","much","must","my","myself","n",
219 "name","namely","nd","near","nearly","necessary","need","needs","neither",
220 "never","nevertheless","new","next","nine","no","nobody","non","none",
221 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
222 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
223 "ones","only","onto","or","other","others","otherwise","ought","our",
224 "ours","ourselves","out","outside","over","overall","own","p","particular",
225 "particularly","per","perhaps","placed","please","plus","possible",
226 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
227 "rd","re","really","reasonably","regarding","regardless","regards",
228 "relatively","respectively","right","s","said","same","saw","say",
229 "saying","says","second","secondly","see","seeing","seem","seemed",
230 "seeming","seems","seen","self","selves","sensible","sent","serious",
231 "seriously","seven","several","shall","she","should","shouldn't","since",
232 "six","so","some","somebody","somehow","someone","something","sometime",
233 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
234 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
235 "tell","tends","th","than","thank","thanks","thanx","that","that's",
236 "thats","the","their","theirs","them","themselves","then","thence","there",
237 "there's","thereafter","thereby","therefore","therein","theres",
238 "thereupon","these","they","they'd","they'll","they're","they've",
239 "think","third","this","thorough","thoroughly","those","though","three",
240 "through","throughout","thru","thus","to","together","too","took","toward",
241 "towards","tried","tries","truly","try","trying","twice","two","u","un",
242 "under","unfortunately","unless","unlikely","until","unto","up","upon",
243 "us","use","used","useful","uses","using","usually","uucp","v","value",
244 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
245 "we","we'd","we'll","we're","we've","welcome","well","went","were",
246 "weren't","what","what's","whatever","when","whence","whenever","where",
247 "where's","whereafter","whereas","whereby","wherein","whereupon",
248 "wherever","whether","which","while","whither","who","who's","whoever",
249 "whole","whom","whose","why","will","willing","wish","with","within",
250 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
251 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",