2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Stop words and (how to learn it).
12 Main type here is String.
16 {-# LANGUAGE NoImplicitPrelude #-}
18 module Gargantext.Text.Terms.Stop
21 import Numeric.Probability.Distribution ((??))
22 import qualified Numeric.Probability.Distribution as D
24 import Data.Char (toLower)
25 import qualified Data.List as DL
27 import Data.Maybe (maybe)
28 import Data.Map.Strict (Map, toList)
29 import qualified Data.Map.Strict as DM
31 import Data.String (String)
33 import Data.Text (pack, unpack)
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..), allLangs)
37 import Gargantext.Text.Terms.Mono (words)
38 import Gargantext.Text.Metrics.Count (occurrencesWith)
40 import qualified Gargantext.Text.Samples.FR as FR
41 import qualified Gargantext.Text.Samples.EN as EN
42 --import qualified Gargantext.Text.Samples.DE as DE
43 --import qualified Gargantext.Text.Samples.SP as SP
44 --import qualified Gargantext.Text.Samples.CH as CH
46 ------------------------------------------------------------------------
47 data Candidate = Candidate { stop :: Double
51 -- * String preparation
54 blanks :: String -> String
56 blanks xs = [' '] <> xs <> [' ']
58 -- | Blocks increase the size of the word to ease computations
59 -- some border and unexepected effects can happen, need to be tested
60 blockOf :: Int -> String -> String
61 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
63 -- | Chunks is the same function as splitBy in Context but for Strings,
64 -- not Text (without pack and unpack operations that are not needed).
65 chunks :: Int -> Int -> String -> [String]
66 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
68 allChunks :: [Int] -> Int -> String -> [String]
69 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
71 allChunks' :: [Int] -> Int -> String -> [[String]]
72 allChunks' ns m st = map (\n -> chunks n m st) ns
74 ------------------------------------------------------------------------
75 -- * Analyze candidate
81 data LangWord = LangWord Lang Word
83 type LangProba = Map Lang Double
85 ------------------------------------------------------------------------
86 detectLangs :: String -> [(Lang, Double)]
87 detectLangs s = DL.reverse $ DL.sortOn snd
89 $ detect (wordsToBook [0..2] s) testEL
91 textMining :: Lang -> String
92 textMining EN = EN.textMining
93 textMining FR = FR.textMining
94 --textMining DE = DE.textMining
95 --textMining SP = SP.textMining
96 --textMining CH = CH.textMining
98 langWord :: Lang -> LangWord
99 langWord l = LangWord l (textMining l)
102 testEL = toEventLangs [0..2] [ langWord l | l <- allLangs ]
104 detect :: EventBook -> EventLang -> LangProba
105 detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq
107 ------------------------------------------------------------------------
109 type EventLang = Map Lang EventBook
110 toEventLangs :: [Int] -> [LangWord] -> EventLang
111 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
113 emptyEventLang :: [Int] -> EventLang
114 emptyEventLang ns = toLang ns (LangWord FR "")
116 toLang :: [Int] -> LangWord -> EventLang
117 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
119 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
120 opLang f = DM.unionWith (op f)
122 ------------------------------------------------------------------------
123 -- | TODO: monoids (but proba >= 0)
125 peb :: String -> EventBook -> Double
126 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
128 a = maybe 0 identity $ DM.lookup st mapFreq
129 b = maybe 1 identity $ DM.lookup (length st) mapN
131 data EventBook = EventBook { events_freq :: Map String Freq
132 , events_n :: Map StringSize TotalFreq
136 emptyEventBook :: [Int] -> EventBook
137 emptyEventBook ns = wordToBook ns " "
139 wordsToBook :: [Int] -> String -> EventBook
140 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
142 ws = map unpack $ words $ pack txt
143 eventsBook = map (wordToBook ns) ws
145 wordToBook :: [Int] -> Word -> EventBook
146 wordToBook ns txt = EventBook ef en
148 chks = allChunks' ns 10 txt
149 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
150 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
152 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
153 op f (EventBook ef1 en1)
154 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
155 (DM.unionWith f en1 en2)
158 ------------------------------------------------------------------------
159 ------------------------------------------------------------------------
160 -- * Make the distributions
161 makeDist :: [String] -> D.T Double String
162 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
164 stopDist :: D.T Double String
165 stopDist = makeDist stopList
167 candDist :: D.T Double String
168 candDist = makeDist candList
170 ------------------------------------------------------------------------
171 sumProba :: Num a => D.T a String -> [Char] -> a
172 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
174 -- | Get probability according a distribution
175 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
176 (~?) ds x = (==x) ?? ds
178 ------------------------------------------------------------------------
179 candidate :: [Char] -> Candidate
180 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
182 ------------------------------------------------------------------------
184 candList = [ "france", "alexandre", "mael", "constitution"
185 , "etats-unis", "associes", "car", "train", "spam"]
189 stopList = map show ([0..9]::[Int]) <> [
190 "a","a's","able","about","above","apply","according","accordingly",
191 "across","actually","after","afterwards","again","against",
192 "ain't","all","allow","allows","almost","alone","along",
193 "involves", "already","also","although","always","am","among","amongst",
194 "an","and","another","any","anybody","anyhow","anyone","anything",
195 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
196 "are","aren't","around","as","aside","ask","asking","associated","at",
197 "available","away","awfully","based", "b","be","became","because","become",
198 "becomes","becoming","been","before","beforehand","behind","being",
199 "believe","below","beside","besides","best","better","between","beyond",
200 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
201 "cant","cause","causes","certain","certainly","changes","clearly","co",
202 "com","come","comes","common","concerning","consequently","consider","considering",
203 "contain","containing","contains","corresponding","could","couldn't","course",
204 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
205 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
206 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
207 "especially","et","etc","even","ever","every","everybody","everyone",
208 "everything","everywhere","ex","exactly","example","except","f","far",
209 "few","find","fifth","first","five","followed","following","follows","for",
210 "former","formerly","forth","four","from","further","furthermore","g",
211 "get","gets","getting","given","gives","go","goes","going","gone","got",
212 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
213 "have","haven't","having","he","he's","hello","help","hence","her","here",
214 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
215 "him","himself","his","hither","hopefully","how","howbeit","however","i",
216 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
217 "inc","indeed","indicate","indicated","indicates","inner","insofar",
218 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
219 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
220 "last","lately","later","latter","latterly","least","less","lest","let",
221 "let's","like","liked","likely","little","look","looking","looks","ltd",
222 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
223 "more","moreover","most","mostly","much","must","my","myself","n",
224 "name","namely","nd","near","nearly","necessary","need","needs","neither",
225 "never","nevertheless","new","next","nine","no","nobody","non","none",
226 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
227 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
228 "ones","only","onto","or","other","others","otherwise","ought","our",
229 "ours","ourselves","out","outside","over","overall","own","p","particular",
230 "particularly","per","perhaps","placed","please","plus","possible",
231 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
232 "rd","re","really","reasonably","regarding","regardless","regards",
233 "relatively","respectively","right","s","said","same","saw","say",
234 "saying","says","second","secondly","see","seeing","seem","seemed",
235 "seeming","seems","seen","self","selves","sensible","sent","serious",
236 "seriously","seven","several","shall","she","should","shouldn't","since",
237 "six","so","some","somebody","somehow","someone","something","sometime",
238 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
239 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
240 "tell","tends","th","than","thank","thanks","thanx","that","that's",
241 "thats","the","their","theirs","them","themselves","then","thence","there",
242 "there's","thereafter","thereby","therefore","therein","theres",
243 "thereupon","these","they","they'd","they'll","they're","they've",
244 "think","third","this","thorough","thoroughly","those","though","three",
245 "through","throughout","thru","thus","to","together","too","took","toward",
246 "towards","tried","tries","truly","try","trying","twice","two","u","un",
247 "under","unfortunately","unless","unlikely","until","unto","up","upon",
248 "us","use","used","useful","uses","using","usually","uucp","v","value",
249 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
250 "we","we'd","we'll","we're","we've","welcome","well","went","were",
251 "weren't","what","what's","whatever","when","whence","whenever","where",
252 "where's","whereafter","whereas","whereby","wherein","whereupon",
253 "wherever","whether","which","while","whither","who","who's","whoever",
254 "whole","whom","whose","why","will","willing","wish","with","within",
255 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
256 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",