]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Stop.hs
[WORKFLOW] Unsupervised ngrams extraction implemented.
[gargantext.git] / src / Gargantext / Text / Terms / Stop.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Stop words and (how to learn it).
11
12 Main type here is String.
13
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings #-}
18
19 module Gargantext.Text.Terms.Stop -- (detectLang, detectLangs, stopList)
20 where
21
22 import GHC.Base (Functor)
23 import Numeric.Probability.Distribution ((??))
24 import qualified Numeric.Probability.Distribution as D
25
26 import Data.Char (toLower)
27 import qualified Data.List as DL
28
29 import Data.Maybe (maybe)
30 import Data.Map.Strict (Map, toList)
31 import qualified Data.Map.Strict as DM
32
33 import Data.String (String)
34
35 import Data.Text (Text)
36 import Data.Text (pack, unpack)
37 import Data.Tuple.Extra (both)
38
39 import Gargantext.Prelude
40 import Gargantext.Core (Lang(..), allLangs)
41 import Gargantext.Text.Terms.Mono (words)
42 import Gargantext.Text.Metrics.Count (occurrencesWith)
43
44 import qualified Gargantext.Text.Samples.FR as FR
45 import qualified Gargantext.Text.Samples.EN as EN
46 --import qualified Gargantext.Text.Samples.DE as DE
47 --import qualified Gargantext.Text.Samples.SP as SP
48 --import qualified Gargantext.Text.Samples.CH as CH
49
50 ------------------------------------------------------------------------
51 data Candidate = Candidate { stop :: Double
52 , noStop :: Double
53 } deriving (Show)
54
55 -- * String preparation
56
57 -- | String prepare
58 blanks :: String -> String
59 blanks [] = []
60 blanks xs = [' '] <> xs <> [' ']
61
62 -- | Blocks increase the size of the word to ease computations
63 -- some border and unexepected effects can happen, need to be tested
64 blockOf :: Int -> String -> String
65 blockOf n = DL.concat . DL.take n . DL.repeat
66
67 -- | Chunks is the same function as splitBy in Context but for Strings,
68 -- not Text (without pack and unpack operations that are not needed).
69 chunks :: Int -> Int -> String -> [String]
70 chunks n m = DL.take m . filter (not . all (== ' '))
71 . chunkAlong (n+1) 1
72 . DL.concat
73 . DL.take 1000
74 . DL.repeat
75 . blanks
76
77 allChunks :: [Int] -> Int -> String -> [String]
78 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
79
80 allChunks' :: [Int] -> Int -> String -> [[String]]
81 allChunks' ns m st = map (\n -> chunks n m st) ns
82
83 ------------------------------------------------------------------------
84 -- * Analyze candidate
85 type StringSize = Int
86 type TotalFreq = Int
87 type Freq = Int
88 type Word = String
89
90 data LangWord = LangWord Lang Word
91
92 type LangProba = Map Lang Double
93
94 ------------------------------------------------------------------------
95 detectLang :: Text -> Maybe Lang
96 detectLang = head . map fst . detectLangs
97
98 detectLangs :: Text -> [(Lang, Double)]
99 detectLangs = detectLangs' . unpack
100
101 detectLangs' :: String -> [(Lang, Double)]
102 detectLangs' s = DL.reverse $ DL.sortOn snd
103 $ toList
104 $ detect (wordsToBook [0..2] s) eventLang
105
106 part :: (Eq p, Fractional p) => p -> p -> p
107 part 0 _ = 0
108 part _ 0 = 0
109 part x y = x / y
110
111 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
112 t (a, b) -> t (a, b)
113 toProba xs = map (\(a,b) -> (a, part b total)) xs
114 where
115 total = sum $ map snd xs
116
117 textSample :: Lang -> String
118 textSample EN = EN.textSample
119 textSample FR = FR.textSample
120 --textSample DE = DE.textSample
121 --textSample SP = SP.textSample
122 --textSample CH = CH.textSample
123
124 langWord :: Lang -> LangWord
125 langWord l = LangWord l (textSample l)
126
127 eventLang :: EventLang
128 eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
129
130 detect :: EventBook -> EventLang -> LangProba
131 detect (EventBook mapFreq _) el =
132 DM.unionsWith (+)
133 $ map DM.fromList
134 $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
135 $ filter (\x -> fst x /= " ")
136 $ DM.toList mapFreq
137
138 ------------------------------------------------------------------------
139 -- | TODO: monoids
140 type EventLang = Map Lang EventBook
141
142 toEventLangs :: [Int] -> [LangWord] -> EventLang
143 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
144
145 emptyEventLang :: [Int] -> EventLang
146 emptyEventLang ns = toLang ns (LangWord FR "")
147
148 toLang :: [Int] -> LangWord -> EventLang
149 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
150
151 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
152 opLang f = DM.unionWith (op f)
153
154 ------------------------------------------------------------------------
155 -- | TODO: monoids (but proba >= 0)
156
157 peb :: String -> EventBook -> Double
158 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
159 where
160 a = maybe 0 identity $ DM.lookup st mapFreq
161 b = maybe 1 identity $ DM.lookup (length st) mapN
162
163 peb' :: String -> EventBook -> (Freq, TotalFreq)
164 peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
165 where
166 a = maybe 0 identity $ DM.lookup st mapFreq
167 b = maybe 1 identity $ DM.lookup (length st) mapN
168
169 ------------------------------------------------------------------------
170 toPrior :: String -> EventLang -> [(Lang, Double)]
171 toPrior s el = prior $ pebLang s el
172
173 pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
174 pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList
175 ------------------------------------------------------------------------
176 prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
177 prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
178 (map (\(a,b) -> a / b) ps')
179 where
180
181 (ls, ps'') = DL.unzip ps
182 ps' = map (both fromIntegral) ps''
183 ------------------------------------------------------------------------
184 data EventBook = EventBook { events_freq :: Map String Freq
185 , events_n :: Map StringSize TotalFreq
186 }
187 deriving (Show)
188
189 emptyEventBook :: [Int] -> EventBook
190 emptyEventBook ns = wordToBook ns " "
191
192 wordsToBook :: [Int] -> String -> EventBook
193 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
194 where
195 ws = map unpack $ words $ pack txt
196 eventsBook = map (wordToBook ns) ws
197
198 wordToBook :: [Int] -> Word -> EventBook
199 wordToBook ns txt = EventBook ef en
200 where
201 chks = allChunks' ns 10 txt
202 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
203 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
204
205 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
206 op f (EventBook ef1 en1)
207 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
208 (DM.unionWith f en1 en2)
209
210 ------------------------------------------------------------------------
211 ------------------------------------------------------------------------
212 -- * Make the distributions
213 makeDist :: [String] -> D.T Double String
214 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
215
216 stopDist :: D.T Double String
217 stopDist = makeDist stopList
218
219 candDist :: D.T Double String
220 candDist = makeDist candList
221
222 ------------------------------------------------------------------------
223 sumProba :: Num a => D.T a String -> [Char] -> a
224 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
225
226 -- | Get probability according a distribution
227 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
228 (~?) ds x = (==x) ?? ds
229
230 ------------------------------------------------------------------------
231 candidate :: [Char] -> Candidate
232 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
233
234 ------------------------------------------------------------------------
235 candList :: [String]
236 candList = [ "france", "alexandre", "mael", "constitution"
237 , "etats-unis", "associes", "car", "train", "spam"]
238
239
240 stopList :: [String]
241 stopList = map show ([0..9]::[Int]) <> stopListWords
242
243 stopListWords :: [String]
244 stopListWords =
245 ["a", "a's", "able", "about", "above", "according", "accordingly"
246 , "across", "actually", "after", "afterwards", "again", "against"
247 , "ain't", "all", "allow", "allows", "almost", "alone", "along"
248 , "already", "also", "although", "always", "am", "among", "amongst", "an"
249 , "analyze", "and", "another", "any", "anybody", "anyhow", "anyone"
250 , "anything", "anyway", "anyways", "anywhere", "apart", "appear"
251 , "apply", "appreciate", "appropriate", "are", "aren't", "around"
252 , "as", "aside", "ask", "asking", "associated", "at", "available"
253 , "away", "awfully", "b", "based", "be", "became", "because", "become"
254 , "becomes", "becoming", "been", "before", "beforehand", "behind"
255 , "being", "believe", "below", "beside", "besides", "best", "better"
256 , "between", "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's"
257 , "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain"
258 , "certainly", "changes", "clearly", "co", "com", "come", "comes"
259 , "common", "concerning", "consequently", "consider", "considering"
260 , "contain", "containing", "contains", "corresponding", "could"
261 , "couldn't", "course", "currently", "d", "definitely", "described"
262 , "despite", "detecting", "detects", "did", "didn't", "different", "do"
263 , "does", "doesn't", "doing", "don't", "done", "down", "downwards"
264 , "during", "e", "each", "edu", "eg", "eight", "either", "else"
265 , "elsewhere", "enough", "entirely", "especially", "et", "etc", "even"
266 , "ever", "every", "everybody", "everyone", "everything", "everywhere"
267 , "ex", "exactly", "example", "except", "f", "far", "few", "fifth"
268 , "find", "first", "five", "followed", "following", "follows", "for"
269 , "former", "formerly", "forth", "four", "from", "further", "furthermore"
270 , "g", "get", "gets", "getting", "gif", "given", "gives", "go", "goes"
271 , "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't"
272 , "happens", "hardly", "has", "hasn't", "have", "haven't", "having"
273 , "he", "he'd", "he'll", "he's", "hello", "help", "hence", "her"
274 , "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers"
275 , "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how"
276 , "how's", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've"
277 , "identify", "ie", "if", "ignored", "immediate", "in", "inasmuch"
278 , "inc", "indeed", "indicate", "indicated", "indicates", "inner"
279 , "insofar", "instead", "into", "involves", "inward", "is", "isn't"
280 , "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k"
281 , "keep", "keeps", "kept", "know", "known", "knows", "l", "last"
282 , "late", "lately", "later", "latter", "latterly", "least", "less"
283 , "lest", "let", "let's", "like", "liked", "likely", "little", "look"
284 , "looking", "looks", "ltd", "m", "main", "mainly", "many", "may"
285 , "maybe", "me", "mean", "meanwhile", "merely", "might", "min", "more"
286 , "moreover", "most", "mostly", "much", "must", "mustn't", "my", "myself"
287 , "n", "name", "namely", "nd", "near", "nearly", "necessary", "need"
288 , "needs", "neither", "never", "nevertheless", "new", "next", "nine"
289 , "no", "nobody", "non", "none", "noone", "nor", "normally", "not"
290 , "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off"
291 , "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones"
292 , "only", "onto", "or", "other", "others", "otherwise", "ought", "our"
293 , "ours", "ourselves", "out", "outside", "over", "overall", "own", "p"
294 , "particular", "particularly", "per", "perhaps", "placed", "please"
295 , "plus", "possible", "presents", "presumably", "probably", "provides"
296 , "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really"
297 , "reasonably", "regarding", "regardless", "regards", "relatively"
298 , "respectively", "right", "s", "said", "same", "saw", "say", "saying"
299 , "says", "sds", "second", "secondly", "see", "seeing", "seem", "seemed"
300 , "seeming", "seems", "seen", "self", "selves", "sensible", "sent"
301 , "serious", "seriously", "seven", "several", "shall", "shan't"
302 , "she", "she'd", "she'll", "she's", "should", "shouldn't", "since"
303 , "six", "so", "some", "somebody", "somehow", "someone", "something"
304 , "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry"
305 , "specified", "specify", "specifying", "still", "sub", "such", "sup"
306 , "sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than"
307 , "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their"
308 , "theirs", "them", "themselves", "then", "thence", "there", "there's"
309 , "thereafter", "thereby", "therefore", "therein", "theres", "thereupon"
310 , "these", "they", "they'd", "they'll", "they're", "they've", "think"
311 , "third", "this", "thorough", "thoroughly", "those", "though", "three"
312 , "through", "throughout", "thru", "thus", "to", "together", "too"
313 , "took", "toward", "towards", "tried", "tries", "truly", "try"
314 , "trying", "twice", "two", "u", "un", "under", "unfortunately"
315 , "unless", "unlikely", "until", "unto", "up", "upon", "us", "use"
316 , "used", "useful", "uses", "using", "usually", "uucp", "v", "value"
317 , "various", "very", "via", "viz", "vs", "w", "want", "wants", "was"
318 , "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome"
319 , "well", "went", "were", "weren't", "what", "what's", "whatever", "when"
320 , "when's", "whence", "whenever", "where", "where's", "whereafter"
321 , "whereas", "whereby", "wherein", "whereupon", "wherever", "whether"
322 , "which", "while", "whither", "who", "who's", "whoever", "whole", "whom"
323 , "whose", "why", "why's", "will", "willing", "wish", "with", "within"
324 , "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes"
325 , "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours"
326 , "yourself", "yourselves", "z", "zero"]
327
328