]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Stop.hs
Merge remote-tracking branch 'origin/lang-parser'
[gargantext.git] / src / Gargantext / Text / Terms / Stop.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Stop words and (how to learn it).
11
12 Main type here is String.
13
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17
18 module Gargantext.Text.Terms.Stop
19 where
20
21 import GHC.Base (Functor)
22 import Numeric.Probability.Distribution ((??))
23 import qualified Numeric.Probability.Distribution as D
24
25 import Data.Char (toLower)
26 import qualified Data.List as DL
27
28 import Data.Maybe (maybe)
29 import Data.Map.Strict (Map, toList)
30 import qualified Data.Map.Strict as DM
31
32 import Data.String (String)
33
34 import Data.Text (pack, unpack)
35 import Data.Tuple.Extra (both)
36
37 import Gargantext.Prelude
38 import Gargantext.Core (Lang(..), allLangs)
39 import Gargantext.Text.Terms.Mono (words)
40 import Gargantext.Text.Metrics.Count (occurrencesWith)
41
42 import qualified Gargantext.Text.Samples.FR as FR
43 import qualified Gargantext.Text.Samples.EN as EN
44 --import qualified Gargantext.Text.Samples.DE as DE
45 --import qualified Gargantext.Text.Samples.SP as SP
46 --import qualified Gargantext.Text.Samples.CH as CH
47
48 ------------------------------------------------------------------------
49 data Candidate = Candidate { stop :: Double
50 , noStop :: Double
51 } deriving (Show)
52
53 -- * String preparation
54
55 -- | String prepare
56 blanks :: String -> String
57 blanks [] = []
58 blanks xs = [' '] <> xs <> [' ']
59
60 -- | Blocks increase the size of the word to ease computations
61 -- some border and unexepected effects can happen, need to be tested
62 blockOf :: Int -> String -> String
63 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
64
65 -- | Chunks is the same function as splitBy in Context but for Strings,
66 -- not Text (without pack and unpack operations that are not needed).
67 chunks :: Int -> Int -> String -> [String]
68 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
69
70 allChunks :: [Int] -> Int -> String -> [String]
71 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
72
73 allChunks' :: [Int] -> Int -> String -> [[String]]
74 allChunks' ns m st = map (\n -> chunks n m st) ns
75
76 ------------------------------------------------------------------------
77 -- * Analyze candidate
78 type StringSize = Int
79 type TotalFreq = Int
80 type Freq = Int
81 type Word = String
82
83 data LangWord = LangWord Lang Word
84
85 type LangProba = Map Lang Double
86
87 ------------------------------------------------------------------------
88 detectLangs :: String -> [(Lang, Double)]
89 detectLangs s = DL.reverse $ DL.sortOn snd
90 $ toList
91 $ detect (wordsToBook [0..2] s) eventLang
92
93 part :: (Eq p, Fractional p) => p -> p -> p
94 part 0 _ = 0
95 part _ 0 = 0
96 part x y = x / y
97
98 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
99 t (a, b) -> t (a, b)
100 toProba xs = map (\(a,b) -> (a, part b total)) xs
101 where
102 total = sum $ map snd xs
103
104 textSample :: Lang -> String
105 textSample EN = EN.textSample
106 textSample FR = FR.textSample
107 --textSample DE = DE.textSample
108 --textSample SP = SP.textSample
109 --textSample CH = CH.textSample
110
111 langWord :: Lang -> LangWord
112 langWord l = LangWord l (textSample l)
113
114 eventLang :: EventLang
115 eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
116
117 detect :: EventBook -> EventLang -> LangProba
118 detect (EventBook mapFreq _) el =
119 DM.unionsWith (+)
120 $ map DM.fromList
121 $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
122 $ filter (\x -> fst x /= " ")
123 $ DM.toList mapFreq
124
125 ------------------------------------------------------------------------
126 -- | TODO: monoids
127 type EventLang = Map Lang EventBook
128
129 toEventLangs :: [Int] -> [LangWord] -> EventLang
130 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
131
132 emptyEventLang :: [Int] -> EventLang
133 emptyEventLang ns = toLang ns (LangWord FR "")
134
135 toLang :: [Int] -> LangWord -> EventLang
136 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
137
138 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
139 opLang f = DM.unionWith (op f)
140
141 ------------------------------------------------------------------------
142 -- | TODO: monoids (but proba >= 0)
143
144 peb :: String -> EventBook -> Double
145 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
146 where
147 a = maybe 0 identity $ DM.lookup st mapFreq
148 b = maybe 1 identity $ DM.lookup (length st) mapN
149
150 peb' :: String -> EventBook -> (Freq, TotalFreq)
151 peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
152 where
153 a = maybe 0 identity $ DM.lookup st mapFreq
154 b = maybe 1 identity $ DM.lookup (length st) mapN
155
156 ------------------------------------------------------------------------
157 toPrior :: String -> EventLang -> [(Lang, Double)]
158 toPrior s el = prior $ pebLang s el
159
160 pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
161 pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList
162
163 ------------------------------------------------------------------------
164 prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
165 prior ps = zip ls $ zipWith (\x y -> x^99 * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
166 (map (\(a,b) -> a / b) ps')
167 where
168
169 (ls, ps'') = DL.unzip ps
170 ps' = map (both fromIntegral) ps''
171 ------------------------------------------------------------------------
172 data EventBook = EventBook { events_freq :: Map String Freq
173 , events_n :: Map StringSize TotalFreq
174 }
175 deriving (Show)
176
177 emptyEventBook :: [Int] -> EventBook
178 emptyEventBook ns = wordToBook ns " "
179
180 wordsToBook :: [Int] -> String -> EventBook
181 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
182 where
183 ws = map unpack $ words $ pack txt
184 eventsBook = map (wordToBook ns) ws
185
186 wordToBook :: [Int] -> Word -> EventBook
187 wordToBook ns txt = EventBook ef en
188 where
189 chks = allChunks' ns 10 txt
190 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
191 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
192
193 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
194 op f (EventBook ef1 en1)
195 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
196 (DM.unionWith f en1 en2)
197
198
199 ------------------------------------------------------------------------
200 ------------------------------------------------------------------------
201 -- * Make the distributions
202 makeDist :: [String] -> D.T Double String
203 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
204
205 stopDist :: D.T Double String
206 stopDist = makeDist stopList
207
208 candDist :: D.T Double String
209 candDist = makeDist candList
210
211 ------------------------------------------------------------------------
212 sumProba :: Num a => D.T a String -> [Char] -> a
213 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
214
215 -- | Get probability according a distribution
216 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
217 (~?) ds x = (==x) ?? ds
218
219 ------------------------------------------------------------------------
220 candidate :: [Char] -> Candidate
221 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
222
223 ------------------------------------------------------------------------
224 candList :: [String]
225 candList = [ "france", "alexandre", "mael", "constitution"
226 , "etats-unis", "associes", "car", "train", "spam"]
227
228
229 stopList :: [String]
230 stopList = map show ([0..9]::[Int]) <> [
231 "a","a's","able","about","above","apply","according","accordingly",
232 "across","actually","after","afterwards","again","against",
233 "ain't","all","allow","allows","almost","alone","along",
234 "involves", "already","also","although","always","am","among","amongst",
235 "an","and","another","any","anybody","anyhow","anyone","anything",
236 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
237 "are","aren't","around","as","aside","ask","asking","associated","at",
238 "available","away","awfully","based", "b","be","became","because","become",
239 "becomes","becoming","been","before","beforehand","behind","being",
240 "believe","below","beside","besides","best","better","between","beyond",
241 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
242 "cant","cause","causes","certain","certainly","changes","clearly","co",
243 "com","come","comes","common","concerning","consequently","consider","considering",
244 "contain","containing","contains","corresponding","could","couldn't","course",
245 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
246 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
247 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
248 "especially","et","etc","even","ever","every","everybody","everyone",
249 "everything","everywhere","ex","exactly","example","except","f","far",
250 "few","find","fifth","first","five","followed","following","follows","for",
251 "former","formerly","forth","four","from","further","furthermore","g",
252 "get","gets","getting","given","gives","go","goes","going","gone","got",
253 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
254 "have","haven't","having","he","he's","hello","help","hence","her","here",
255 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
256 "him","himself","his","hither","hopefully","how","howbeit","however","i",
257 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
258 "inc","indeed","indicate","indicated","indicates","inner","insofar",
259 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
260 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
261 "last","lately","later","latter","latterly","least","less","lest","let",
262 "let's","like","liked","likely","little","look","looking","looks","ltd",
263 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
264 "more","moreover","most","mostly","much","must","my","myself","n",
265 "name","namely","nd","near","nearly","necessary","need","needs","neither",
266 "never","nevertheless","new","next","nine","no","nobody","non","none",
267 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
268 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
269 "ones","only","onto","or","other","others","otherwise","ought","our",
270 "ours","ourselves","out","outside","over","overall","own","p","particular",
271 "particularly","per","perhaps","placed","please","plus","possible",
272 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
273 "rd","re","really","reasonably","regarding","regardless","regards",
274 "relatively","respectively","right","s","said","same","saw","say",
275 "saying","says","second","secondly","see","seeing","seem","seemed",
276 "seeming","seems","seen","self","selves","sensible","sent","serious",
277 "seriously","seven","several","shall","she","should","shouldn't","since",
278 "six","so","some","somebody","somehow","someone","something","sometime",
279 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
280 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
281 "tell","tends","th","than","thank","thanks","thanx","that","that's",
282 "thats","the","their","theirs","them","themselves","then","thence","there",
283 "there's","thereafter","thereby","therefore","therein","theres",
284 "thereupon","these","they","they'd","they'll","they're","they've",
285 "think","third","this","thorough","thoroughly","those","though","three",
286 "through","throughout","thru","thus","to","together","too","took","toward",
287 "towards","tried","tries","truly","try","trying","twice","two","u","un",
288 "under","unfortunately","unless","unlikely","until","unto","up","upon",
289 "us","use","used","useful","uses","using","usually","uucp","v","value",
290 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
291 "we","we'd","we'll","we're","we've","welcome","well","went","were",
292 "weren't","what","what's","whatever","when","whence","whenever","where",
293 "where's","whereafter","whereas","whereby","wherein","whereupon",
294 "wherever","whether","which","while","whither","who","who's","whoever",
295 "whole","whom","whose","why","will","willing","wish","with","within",
296 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
297 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
298 "z","zero"]
299
300
301
302