]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Stop.hs
Merge branch 'rest'
[gargantext.git] / src / Gargantext / Text / Terms / Stop.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Stop words and (how to learn it).
11
12 Main type here is String.
13
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17
18 module Gargantext.Text.Terms.Stop
19 where
20
21 import GHC.Base (Functor)
22 import Numeric.Probability.Distribution ((??))
23 import qualified Numeric.Probability.Distribution as D
24
25 import Data.Char (toLower)
26 import qualified Data.List as DL
27
28 import Data.Maybe (maybe)
29 import Data.Map.Strict (Map, toList)
30 import qualified Data.Map.Strict as DM
31
32 import Data.String (String)
33
34 import Data.Text (pack, unpack)
35 import Data.Tuple.Extra (both)
36
37 import Gargantext.Prelude
38 import Gargantext.Core (Lang(..), allLangs)
39 import Gargantext.Text.Terms.Mono (words)
40 import Gargantext.Text.Metrics.Count (occurrencesWith)
41
42 import qualified Gargantext.Text.Samples.FR as FR
43 import qualified Gargantext.Text.Samples.EN as EN
44 --import qualified Gargantext.Text.Samples.DE as DE
45 --import qualified Gargantext.Text.Samples.SP as SP
46 --import qualified Gargantext.Text.Samples.CH as CH
47
48 ------------------------------------------------------------------------
49 data Candidate = Candidate { stop :: Double
50 , noStop :: Double
51 } deriving (Show)
52
53 -- * String preparation
54
55 -- | String prepare
56 blanks :: String -> String
57 blanks [] = []
58 blanks xs = [' '] <> xs <> [' ']
59
60 -- | Blocks increase the size of the word to ease computations
61 -- some border and unexepected effects can happen, need to be tested
62 blockOf :: Int -> String -> String
63 blockOf n = DL.concat . DL.take n . DL.repeat
64
65 -- | Chunks is the same function as splitBy in Context but for Strings,
66 -- not Text (without pack and unpack operations that are not needed).
67 chunks :: Int -> Int -> String -> [String]
68 chunks n m = DL.take m . filter (not . all (== ' '))
69 . chunkAlong (n+1) 1
70 . DL.concat
71 . DL.take 1000
72 . DL.repeat
73 . blanks
74
75 allChunks :: [Int] -> Int -> String -> [String]
76 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
77
78 allChunks' :: [Int] -> Int -> String -> [[String]]
79 allChunks' ns m st = map (\n -> chunks n m st) ns
80
81 ------------------------------------------------------------------------
82 -- * Analyze candidate
83 type StringSize = Int
84 type TotalFreq = Int
85 type Freq = Int
86 type Word = String
87
88 data LangWord = LangWord Lang Word
89
90 type LangProba = Map Lang Double
91
92 ------------------------------------------------------------------------
93 detectLangs :: String -> [(Lang, Double)]
94 detectLangs s = DL.reverse $ DL.sortOn snd
95 $ toList
96 $ detect (wordsToBook [0..2] s) eventLang
97
98 part :: (Eq p, Fractional p) => p -> p -> p
99 part 0 _ = 0
100 part _ 0 = 0
101 part x y = x / y
102
103 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
104 t (a, b) -> t (a, b)
105 toProba xs = map (\(a,b) -> (a, part b total)) xs
106 where
107 total = sum $ map snd xs
108
109 textSample :: Lang -> String
110 textSample EN = EN.textSample
111 textSample FR = FR.textSample
112 --textSample DE = DE.textSample
113 --textSample SP = SP.textSample
114 --textSample CH = CH.textSample
115
116 langWord :: Lang -> LangWord
117 langWord l = LangWord l (textSample l)
118
119 eventLang :: EventLang
120 eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
121
122 detect :: EventBook -> EventLang -> LangProba
123 detect (EventBook mapFreq _) el =
124 DM.unionsWith (+)
125 $ map DM.fromList
126 $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
127 $ filter (\x -> fst x /= " ")
128 $ DM.toList mapFreq
129
130 ------------------------------------------------------------------------
131 -- | TODO: monoids
132 type EventLang = Map Lang EventBook
133
134 toEventLangs :: [Int] -> [LangWord] -> EventLang
135 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
136
137 emptyEventLang :: [Int] -> EventLang
138 emptyEventLang ns = toLang ns (LangWord FR "")
139
140 toLang :: [Int] -> LangWord -> EventLang
141 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
142
143 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
144 opLang f = DM.unionWith (op f)
145
146 ------------------------------------------------------------------------
147 -- | TODO: monoids (but proba >= 0)
148
149 peb :: String -> EventBook -> Double
150 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
151 where
152 a = maybe 0 identity $ DM.lookup st mapFreq
153 b = maybe 1 identity $ DM.lookup (length st) mapN
154
155 peb' :: String -> EventBook -> (Freq, TotalFreq)
156 peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
157 where
158 a = maybe 0 identity $ DM.lookup st mapFreq
159 b = maybe 1 identity $ DM.lookup (length st) mapN
160
161 ------------------------------------------------------------------------
162 toPrior :: String -> EventLang -> [(Lang, Double)]
163 toPrior s el = prior $ pebLang s el
164
165 pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
166 pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList
167 ------------------------------------------------------------------------
168 prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
169 prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
170 (map (\(a,b) -> a / b) ps')
171 where
172
173 (ls, ps'') = DL.unzip ps
174 ps' = map (both fromIntegral) ps''
175 ------------------------------------------------------------------------
176 data EventBook = EventBook { events_freq :: Map String Freq
177 , events_n :: Map StringSize TotalFreq
178 }
179 deriving (Show)
180
181 emptyEventBook :: [Int] -> EventBook
182 emptyEventBook ns = wordToBook ns " "
183
184 wordsToBook :: [Int] -> String -> EventBook
185 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
186 where
187 ws = map unpack $ words $ pack txt
188 eventsBook = map (wordToBook ns) ws
189
190 wordToBook :: [Int] -> Word -> EventBook
191 wordToBook ns txt = EventBook ef en
192 where
193 chks = allChunks' ns 10 txt
194 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
195 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
196
197 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
198 op f (EventBook ef1 en1)
199 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
200 (DM.unionWith f en1 en2)
201
202 ------------------------------------------------------------------------
203 ------------------------------------------------------------------------
204 -- * Make the distributions
205 makeDist :: [String] -> D.T Double String
206 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
207
208 stopDist :: D.T Double String
209 stopDist = makeDist stopList
210
211 candDist :: D.T Double String
212 candDist = makeDist candList
213
214 ------------------------------------------------------------------------
215 sumProba :: Num a => D.T a String -> [Char] -> a
216 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
217
218 -- | Get probability according a distribution
219 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
220 (~?) ds x = (==x) ?? ds
221
222 ------------------------------------------------------------------------
223 candidate :: [Char] -> Candidate
224 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
225
226 ------------------------------------------------------------------------
227 candList :: [String]
228 candList = [ "france", "alexandre", "mael", "constitution"
229 , "etats-unis", "associes", "car", "train", "spam"]
230
231
232 stopList :: [String]
233 stopList = map show ([0..9]::[Int]) <> [
234 "a","a's","able","about","above","apply","according","accordingly",
235 "across","actually","after","afterwards","again","against",
236 "ain't","all","allow","allows","almost","alone","along",
237 "involves", "already","also","although","always","am","among","amongst",
238 "an","and","another","any","anybody","anyhow","anyone","anything",
239 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
240 "are","aren't","around","as","aside","ask","asking","associated","at",
241 "available","away","awfully","based", "b","be","became","because","become",
242 "becomes","becoming","been","before","beforehand","behind","being",
243 "believe","below","beside","besides","best","better","between","beyond",
244 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
245 "cant","cause","causes","certain","certainly","changes","clearly","co",
246 "com","come","comes","common","concerning","consequently","consider","considering",
247 "contain","containing","contains","corresponding","could","couldn't","course",
248 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
249 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
250 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
251 "especially","et","etc","even","ever","every","everybody","everyone",
252 "everything","everywhere","ex","exactly","example","except","f","far",
253 "few","find","fifth","first","five","followed","following","follows","for",
254 "former","formerly","forth","four","from","further","furthermore","g",
255 "get","gets","getting","given","gives","go","goes","going","gone","got",
256 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
257 "have","haven't","having","he","he's","hello","help","hence","her","here",
258 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
259 "him","himself","his","hither","hopefully","how","howbeit","however","i",
260 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
261 "inc","indeed","indicate","indicated","indicates","inner","insofar",
262 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
263 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
264 "last","lately","later","latter","latterly","least","less","lest","let",
265 "let's","like","liked","likely","little","look","looking","looks","ltd",
266 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
267 "more","moreover","most","mostly","much","must","my","myself","n",
268 "name","namely","nd","near","nearly","necessary","need","needs","neither",
269 "never","nevertheless","new","next","nine","no","nobody","non","none",
270 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
271 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
272 "ones","only","onto","or","other","others","otherwise","ought","our",
273 "ours","ourselves","out","outside","over","overall","own","p","particular",
274 "particularly","per","perhaps","placed","please","plus","possible",
275 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
276 "rd","re","really","reasonably","regarding","regardless","regards",
277 "relatively","respectively","right","s","said","same","saw","say",
278 "saying","says","second","secondly","see","seeing","seem","seemed",
279 "seeming","seems","seen","self","selves","sensible","sent","serious",
280 "seriously","seven","several","shall","she","should","shouldn't","since",
281 "six","so","some","somebody","somehow","someone","something","sometime",
282 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
283 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
284 "tell","tends","th","than","thank","thanks","thanx","that","that's",
285 "thats","the","their","theirs","them","themselves","then","thence","there",
286 "there's","thereafter","thereby","therefore","therein","theres",
287 "thereupon","these","they","they'd","they'll","they're","they've",
288 "think","third","this","thorough","thoroughly","those","though","three",
289 "through","throughout","thru","thus","to","together","too","took","toward",
290 "towards","tried","tries","truly","try","trying","twice","two","u","un",
291 "under","unfortunately","unless","unlikely","until","unto","up","upon",
292 "us","use","used","useful","uses","using","usually","uucp","v","value",
293 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
294 "we","we'd","we'll","we're","we've","welcome","well","went","were",
295 "weren't","what","what's","whatever","when","whence","whenever","where",
296 "where's","whereafter","whereas","whereby","wherein","whereupon",
297 "wherever","whether","which","while","whither","who","who's","whoever",
298 "whole","whom","whose","why","will","willing","wish","with","within",
299 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
300 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
301 "z","zero"]
302
303
304
305