]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Stop.hs
[LISTS] Stop list updated, improved.
[gargantext.git] / src / Gargantext / Text / Terms / Stop.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Stop words and (how to learn it).
11
12 Main type here is String.
13
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17
18 module Gargantext.Text.Terms.Stop (detectLang, detectLangs, stopList)
19 where
20
21 import GHC.Base (Functor)
22 import Numeric.Probability.Distribution ((??))
23 import qualified Numeric.Probability.Distribution as D
24
25 import Data.Char (toLower)
26 import qualified Data.List as DL
27
28 import Data.Maybe (maybe)
29 import Data.Map.Strict (Map, toList)
30 import qualified Data.Map.Strict as DM
31
32 import Data.String (String)
33
34 import Data.Text (Text)
35 import Data.Text (pack, unpack)
36 import Data.Tuple.Extra (both)
37
38 import Gargantext.Prelude
39 import Gargantext.Core (Lang(..), allLangs)
40 import Gargantext.Text.Terms.Mono (words)
41 import Gargantext.Text.Metrics.Count (occurrencesWith)
42
43 import qualified Gargantext.Text.Samples.FR as FR
44 import qualified Gargantext.Text.Samples.EN as EN
45 --import qualified Gargantext.Text.Samples.DE as DE
46 --import qualified Gargantext.Text.Samples.SP as SP
47 --import qualified Gargantext.Text.Samples.CH as CH
48
49 ------------------------------------------------------------------------
50 data Candidate = Candidate { stop :: Double
51 , noStop :: Double
52 } deriving (Show)
53
54 -- * String preparation
55
56 -- | String prepare
57 blanks :: String -> String
58 blanks [] = []
59 blanks xs = [' '] <> xs <> [' ']
60
61 -- | Blocks increase the size of the word to ease computations
62 -- some border and unexepected effects can happen, need to be tested
63 blockOf :: Int -> String -> String
64 blockOf n = DL.concat . DL.take n . DL.repeat
65
66 -- | Chunks is the same function as splitBy in Context but for Strings,
67 -- not Text (without pack and unpack operations that are not needed).
68 chunks :: Int -> Int -> String -> [String]
69 chunks n m = DL.take m . filter (not . all (== ' '))
70 . chunkAlong (n+1) 1
71 . DL.concat
72 . DL.take 1000
73 . DL.repeat
74 . blanks
75
76 allChunks :: [Int] -> Int -> String -> [String]
77 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
78
79 allChunks' :: [Int] -> Int -> String -> [[String]]
80 allChunks' ns m st = map (\n -> chunks n m st) ns
81
82 ------------------------------------------------------------------------
83 -- * Analyze candidate
84 type StringSize = Int
85 type TotalFreq = Int
86 type Freq = Int
87 type Word = String
88
89 data LangWord = LangWord Lang Word
90
91 type LangProba = Map Lang Double
92
93 ------------------------------------------------------------------------
94 detectLang :: Text -> Maybe Lang
95 detectLang = head . map fst . detectLangs
96
97 detectLangs :: Text -> [(Lang, Double)]
98 detectLangs = detectLangs' . unpack
99
100 detectLangs' :: String -> [(Lang, Double)]
101 detectLangs' s = DL.reverse $ DL.sortOn snd
102 $ toList
103 $ detect (wordsToBook [0..2] s) eventLang
104
105 part :: (Eq p, Fractional p) => p -> p -> p
106 part 0 _ = 0
107 part _ 0 = 0
108 part x y = x / y
109
110 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
111 t (a, b) -> t (a, b)
112 toProba xs = map (\(a,b) -> (a, part b total)) xs
113 where
114 total = sum $ map snd xs
115
116 textSample :: Lang -> String
117 textSample EN = EN.textSample
118 textSample FR = FR.textSample
119 --textSample DE = DE.textSample
120 --textSample SP = SP.textSample
121 --textSample CH = CH.textSample
122
123 langWord :: Lang -> LangWord
124 langWord l = LangWord l (textSample l)
125
126 eventLang :: EventLang
127 eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
128
129 detect :: EventBook -> EventLang -> LangProba
130 detect (EventBook mapFreq _) el =
131 DM.unionsWith (+)
132 $ map DM.fromList
133 $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
134 $ filter (\x -> fst x /= " ")
135 $ DM.toList mapFreq
136
137 ------------------------------------------------------------------------
138 -- | TODO: monoids
139 type EventLang = Map Lang EventBook
140
141 toEventLangs :: [Int] -> [LangWord] -> EventLang
142 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
143
144 emptyEventLang :: [Int] -> EventLang
145 emptyEventLang ns = toLang ns (LangWord FR "")
146
147 toLang :: [Int] -> LangWord -> EventLang
148 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
149
150 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
151 opLang f = DM.unionWith (op f)
152
153 ------------------------------------------------------------------------
154 -- | TODO: monoids (but proba >= 0)
155
156 peb :: String -> EventBook -> Double
157 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
158 where
159 a = maybe 0 identity $ DM.lookup st mapFreq
160 b = maybe 1 identity $ DM.lookup (length st) mapN
161
162 peb' :: String -> EventBook -> (Freq, TotalFreq)
163 peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
164 where
165 a = maybe 0 identity $ DM.lookup st mapFreq
166 b = maybe 1 identity $ DM.lookup (length st) mapN
167
168 ------------------------------------------------------------------------
169 toPrior :: String -> EventLang -> [(Lang, Double)]
170 toPrior s el = prior $ pebLang s el
171
172 pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
173 pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList
174 ------------------------------------------------------------------------
175 prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
176 prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
177 (map (\(a,b) -> a / b) ps')
178 where
179
180 (ls, ps'') = DL.unzip ps
181 ps' = map (both fromIntegral) ps''
182 ------------------------------------------------------------------------
183 data EventBook = EventBook { events_freq :: Map String Freq
184 , events_n :: Map StringSize TotalFreq
185 }
186 deriving (Show)
187
188 emptyEventBook :: [Int] -> EventBook
189 emptyEventBook ns = wordToBook ns " "
190
191 wordsToBook :: [Int] -> String -> EventBook
192 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
193 where
194 ws = map unpack $ words $ pack txt
195 eventsBook = map (wordToBook ns) ws
196
197 wordToBook :: [Int] -> Word -> EventBook
198 wordToBook ns txt = EventBook ef en
199 where
200 chks = allChunks' ns 10 txt
201 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
202 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
203
204 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
205 op f (EventBook ef1 en1)
206 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
207 (DM.unionWith f en1 en2)
208
209 ------------------------------------------------------------------------
210 ------------------------------------------------------------------------
211 -- * Make the distributions
212 makeDist :: [String] -> D.T Double String
213 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
214
215 stopDist :: D.T Double String
216 stopDist = makeDist stopList
217
218 candDist :: D.T Double String
219 candDist = makeDist candList
220
221 ------------------------------------------------------------------------
222 sumProba :: Num a => D.T a String -> [Char] -> a
223 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
224
225 -- | Get probability according a distribution
226 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
227 (~?) ds x = (==x) ?? ds
228
229 ------------------------------------------------------------------------
230 candidate :: [Char] -> Candidate
231 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
232
233 ------------------------------------------------------------------------
234 candList :: [String]
235 candList = [ "france", "alexandre", "mael", "constitution"
236 , "etats-unis", "associes", "car", "train", "spam"]
237
238
239 stopList :: [String]
240 stopList = map show ([0..9]::[Int]) <> stopListWords
241
242 stopListWords :: [String]
243 stopListWords =
244 ["a", "a's", "able", "about", "above", "according", "accordingly"
245 , "across", "actually", "after", "afterwards", "again", "against"
246 , "ain't", "all", "allow", "allows", "almost", "alone", "along"
247 , "already", "also", "although", "always", "am", "among", "amongst", "an"
248 , "analyze", "and", "another", "any", "anybody", "anyhow", "anyone"
249 , "anything", "anyway", "anyways", "anywhere", "apart", "appear"
250 , "apply", "appreciate", "appropriate", "are", "aren't", "around"
251 , "as", "aside", "ask", "asking", "associated", "at", "available"
252 , "away", "awfully", "b", "based", "be", "became", "because", "become"
253 , "becomes", "becoming", "been", "before", "beforehand", "behind"
254 , "being", "believe", "below", "beside", "besides", "best", "better"
255 , "between", "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's"
256 , "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain"
257 , "certainly", "changes", "clearly", "co", "com", "come", "comes"
258 , "common", "concerning", "consequently", "consider", "considering"
259 , "contain", "containing", "contains", "corresponding", "could"
260 , "couldn't", "course", "currently", "d", "definitely", "described"
261 , "despite", "detecting", "detects", "did", "didn't", "different", "do"
262 , "does", "doesn't", "doing", "don't", "done", "down", "downwards"
263 , "during", "e", "each", "edu", "eg", "eight", "either", "else"
264 , "elsewhere", "enough", "entirely", "especially", "et", "etc", "even"
265 , "ever", "every", "everybody", "everyone", "everything", "everywhere"
266 , "ex", "exactly", "example", "except", "f", "far", "few", "fifth"
267 , "find", "first", "five", "followed", "following", "follows", "for"
268 , "former", "formerly", "forth", "four", "from", "further", "furthermore"
269 , "g", "get", "gets", "getting", "gif", "given", "gives", "go", "goes"
270 , "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't"
271 , "happens", "hardly", "has", "hasn't", "have", "haven't", "having"
272 , "he", "he'd", "he'll", "he's", "hello", "help", "hence", "her"
273 , "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers"
274 , "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how"
275 , "how's", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've"
276 , "identify", "ie", "if", "ignored", "immediate", "in", "inasmuch"
277 , "inc", "indeed", "indicate", "indicated", "indicates", "inner"
278 , "insofar", "instead", "into", "involves", "inward", "is", "isn't"
279 , "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k"
280 , "keep", "keeps", "kept", "know", "known", "knows", "l", "last"
281 , "late", "lately", "later", "latter", "latterly", "least", "less"
282 , "lest", "let", "let's", "like", "liked", "likely", "little", "look"
283 , "looking", "looks", "ltd", "m", "main", "mainly", "many", "may"
284 , "maybe", "me", "mean", "meanwhile", "merely", "might", "min", "more"
285 , "moreover", "most", "mostly", "much", "must", "mustn't", "my", "myself"
286 , "n", "name", "namely", "nd", "near", "nearly", "necessary", "need"
287 , "needs", "neither", "never", "nevertheless", "new", "next", "nine"
288 , "no", "nobody", "non", "none", "noone", "nor", "normally", "not"
289 , "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off"
290 , "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones"
291 , "only", "onto", "or", "other", "others", "otherwise", "ought", "our"
292 , "ours", "ourselves", "out", "outside", "over", "overall", "own", "p"
293 , "particular", "particularly", "per", "perhaps", "placed", "please"
294 , "plus", "possible", "presents", "presumably", "probably", "provides"
295 , "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really"
296 , "reasonably", "regarding", "regardless", "regards", "relatively"
297 , "respectively", "right", "s", "said", "same", "saw", "say", "saying"
298 , "says", "sds", "second", "secondly", "see", "seeing", "seem", "seemed"
299 , "seeming", "seems", "seen", "self", "selves", "sensible", "sent"
300 , "serious", "seriously", "seven", "several", "shall", "shan't"
301 , "she", "she'd", "she'll", "she's", "should", "shouldn't", "since"
302 , "six", "so", "some", "somebody", "somehow", "someone", "something"
303 , "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry"
304 , "specified", "specify", "specifying", "still", "sub", "such", "sup"
305 , "sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than"
306 , "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their"
307 , "theirs", "them", "themselves", "then", "thence", "there", "there's"
308 , "thereafter", "thereby", "therefore", "therein", "theres", "thereupon"
309 , "these", "they", "they'd", "they'll", "they're", "they've", "think"
310 , "third", "this", "thorough", "thoroughly", "those", "though", "three"
311 , "through", "throughout", "thru", "thus", "to", "together", "too"
312 , "took", "toward", "towards", "tried", "tries", "truly", "try"
313 , "trying", "twice", "two", "u", "un", "under", "unfortunately"
314 , "unless", "unlikely", "until", "unto", "up", "upon", "us", "use"
315 , "used", "useful", "uses", "using", "usually", "uucp", "v", "value"
316 , "various", "very", "via", "viz", "vs", "w", "want", "wants", "was"
317 , "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome"
318 , "well", "went", "were", "weren't", "what", "what's", "whatever", "when"
319 , "when's", "whence", "whenever", "where", "where's", "whereafter"
320 , "whereas", "whereby", "wherein", "whereupon", "wherever", "whether"
321 , "which", "while", "whither", "who", "who's", "whoever", "whole", "whom"
322 , "whose", "why", "why's", "will", "willing", "wish", "with", "within"
323 , "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes"
324 , "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours"
325 , "yourself", "yourselves", "z", "zero"]
326
327