2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Stop words and (how to learn it).
12 Main type here is String.
16 {-# LANGUAGE NoImplicitPrelude #-}
18 module Gargantext.Text.Terms.Stop
21 import Numeric.Probability.Distribution ((??))
22 import qualified Numeric.Probability.Distribution as D
24 import Data.Char (toLower)
25 import qualified Data.List as DL
27 import Data.Maybe (maybe)
28 import Data.Map.Strict (Map)
29 import qualified Data.Map.Strict as DM
31 import Data.String (String)
33 import Data.Text (pack, unpack)
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Text.Terms.Mono (words)
38 import Gargantext.Text.Metrics.Count (occurrencesWith)
40 ------------------------------------------------------------------------
41 data Candidate = Candidate { stop :: Double
45 -- * String preparation
48 blanks :: String -> String
50 blanks xs = [' '] <> xs <> [' ']
52 -- | Blocks increase the size of the word to ease computations
53 -- some border and unexepected effects can happen, need to be tested
54 blockOf :: Int -> String -> String
55 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
57 -- | Chunks is the same function as splitBy in Context but for Strings,
58 -- not Text (without pack and unpack operations that are not needed).
59 chunks :: Int -> Int -> String -> [String]
60 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
62 allChunks :: [Int] -> Int -> String -> [String]
63 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
65 allChunks' :: [Int] -> Int -> String -> [[String]]
66 allChunks' ns m st = map (\n -> chunks n m st) ns
68 ------------------------------------------------------------------------
69 -- * Analyze candidate
75 data LangWord = LangWord Lang Word
77 type LangProba = Map Lang Double
79 ------------------------------------------------------------------------
81 estimeTest :: String -> LangProba
82 estimeTest s = estime (wordsToBook [0..2] s) testEL
85 testEL = toEventLangs [0,1,2] [ LangWord EN "Lovely day. This day."
86 , LangWord FR "Belle journée, j'y vais."
87 , LangWord EN "Hello Sir, how are you doing? I am fine thank you, good bye"
88 , LangWord FR "Bonjour Monsieur, comment allez-vous? Je vais bien merci."
91 estime :: EventBook -> EventLang -> LangProba
92 estime (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq
94 ------------------------------------------------------------------------
96 type EventLang = Map Lang EventBook
97 toEventLangs :: [Int] -> [LangWord] -> EventLang
98 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
100 emptyEventLang :: [Int] -> EventLang
101 emptyEventLang ns = toLang ns (LangWord FR "")
103 toLang :: [Int] -> LangWord -> EventLang
104 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
106 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
107 opLang f = DM.unionWith (op f)
109 ------------------------------------------------------------------------
110 -- | TODO: monoids (but proba >= 0)
112 peb :: String -> EventBook -> Double
113 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
115 a = maybe 0 identity $ DM.lookup st mapFreq
116 b = maybe 1 identity $ DM.lookup (length st) mapN
118 data EventBook = EventBook { events_freq :: Map String Freq
119 , events_n :: Map StringSize TotalFreq
123 emptyEventBook :: [Int] -> EventBook
124 emptyEventBook ns = wordToBook ns " "
126 wordsToBook :: [Int] -> String -> EventBook
127 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
129 ws = map unpack $ words $ pack txt
130 eventsBook = map (wordToBook ns) ws
132 wordToBook :: [Int] -> Word -> EventBook
133 wordToBook ns txt = EventBook ef en
135 chks = allChunks' ns 10 txt
136 en = DM.fromList $ map (\(n,ns) -> (n, length ns)) $ zip ns chks
137 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
139 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
140 op f (EventBook ef1 en1)
141 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
142 (DM.unionWith f en1 en2)
145 ------------------------------------------------------------------------
146 ------------------------------------------------------------------------
147 -- * Make the distributions
148 makeDist :: [String] -> D.T Double String
149 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
151 stopDist :: D.T Double String
152 stopDist = makeDist stopList
154 candDist :: D.T Double String
155 candDist = makeDist candList
157 ------------------------------------------------------------------------
158 sumProba :: Num a => D.T a String -> [Char] -> a
159 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
161 -- | Get probability according a distribution
162 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
163 (~?) ds x = (==x) ?? ds
165 ------------------------------------------------------------------------
166 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
168 ------------------------------------------------------------------------
170 candList = [ "france", "alexandre", "mael", "constitution"
171 , "etats-unis", "associes", "car", "train", "spam"]
175 stopList = map show ([0..9]::[Int]) <> [
176 "a","a's","able","about","above","apply","according","accordingly",
177 "across","actually","after","afterwards","again","against",
178 "ain't","all","allow","allows","almost","alone","along",
179 "involves", "already","also","although","always","am","among","amongst",
180 "an","and","another","any","anybody","anyhow","anyone","anything",
181 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
182 "are","aren't","around","as","aside","ask","asking","associated","at",
183 "available","away","awfully","based", "b","be","became","because","become",
184 "becomes","becoming","been","before","beforehand","behind","being",
185 "believe","below","beside","besides","best","better","between","beyond",
186 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
187 "cant","cause","causes","certain","certainly","changes","clearly","co",
188 "com","come","comes","common","concerning","consequently","consider","considering",
189 "contain","containing","contains","corresponding","could","couldn't","course",
190 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
191 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
192 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
193 "especially","et","etc","even","ever","every","everybody","everyone",
194 "everything","everywhere","ex","exactly","example","except","f","far",
195 "few","find","fifth","first","five","followed","following","follows","for",
196 "former","formerly","forth","four","from","further","furthermore","g",
197 "get","gets","getting","given","gives","go","goes","going","gone","got",
198 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
199 "have","haven't","having","he","he's","hello","help","hence","her","here",
200 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
201 "him","himself","his","hither","hopefully","how","howbeit","however","i",
202 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
203 "inc","indeed","indicate","indicated","indicates","inner","insofar",
204 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
205 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
206 "last","lately","later","latter","latterly","least","less","lest","let",
207 "let's","like","liked","likely","little","look","looking","looks","ltd",
208 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
209 "more","moreover","most","mostly","much","must","my","myself","n",
210 "name","namely","nd","near","nearly","necessary","need","needs","neither",
211 "never","nevertheless","new","next","nine","no","nobody","non","none",
212 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
213 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
214 "ones","only","onto","or","other","others","otherwise","ought","our",
215 "ours","ourselves","out","outside","over","overall","own","p","particular",
216 "particularly","per","perhaps","placed","please","plus","possible",
217 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
218 "rd","re","really","reasonably","regarding","regardless","regards",
219 "relatively","respectively","right","s","said","same","saw","say",
220 "saying","says","second","secondly","see","seeing","seem","seemed",
221 "seeming","seems","seen","self","selves","sensible","sent","serious",
222 "seriously","seven","several","shall","she","should","shouldn't","since",
223 "six","so","some","somebody","somehow","someone","something","sometime",
224 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
225 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
226 "tell","tends","th","than","thank","thanks","thanx","that","that's",
227 "thats","the","their","theirs","them","themselves","then","thence","there",
228 "there's","thereafter","thereby","therefore","therein","theres",
229 "thereupon","these","they","they'd","they'll","they're","they've",
230 "think","third","this","thorough","thoroughly","those","though","three",
231 "through","throughout","thru","thus","to","together","too","took","toward",
232 "towards","tried","tries","truly","try","trying","twice","two","u","un",
233 "under","unfortunately","unless","unlikely","until","unto","up","upon",
234 "us","use","used","useful","uses","using","usually","uucp","v","value",
235 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
236 "we","we'd","we'll","we're","we've","welcome","well","went","were",
237 "weren't","what","what's","whatever","when","whence","whenever","where",
238 "where's","whereafter","whereas","whereby","wherein","whereupon",
239 "wherever","whether","which","while","whither","who","who's","whoever",
240 "whole","whom","whose","why","will","willing","wish","with","within",
241 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
242 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",