]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Stop.hs
Build only Langs which are fully supported
[gargantext.git] / src / Gargantext / Text / Terms / Stop.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Stop words and (how to learn it).
11
12 Main type here is String.
13
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17
18 module Gargantext.Text.Terms.Stop
19 where
20
21 import Numeric.Probability.Distribution ((??))
22 import qualified Numeric.Probability.Distribution as D
23
24 import Data.Char (toLower)
25 import qualified Data.List as DL
26
27 import Data.Maybe (maybe)
28 import Data.Map.Strict (Map, toList)
29 import qualified Data.Map.Strict as DM
30
31 import Data.String (String)
32
33 import Data.Text (pack, unpack)
34
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..), allLangs)
37 import Gargantext.Text.Terms.Mono (words)
38 import Gargantext.Text.Metrics.Count (occurrencesWith)
39
40 import qualified Gargantext.Text.Samples.FR as FR
41 import qualified Gargantext.Text.Samples.EN as EN
42 --import qualified Gargantext.Text.Samples.DE as DE
43 --import qualified Gargantext.Text.Samples.SP as SP
44 --import qualified Gargantext.Text.Samples.CH as CH
45
46 ------------------------------------------------------------------------
47 data Candidate = Candidate { stop :: Double
48 , noStop :: Double
49 } deriving (Show)
50
51 -- * String preparation
52
53 -- | String prepare
54 blanks :: String -> String
55 blanks [] = []
56 blanks xs = [' '] <> xs <> [' ']
57
58 -- | Blocks increase the size of the word to ease computations
59 -- some border and unexepected effects can happen, need to be tested
60 blockOf :: Int -> String -> String
61 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
62
63 -- | Chunks is the same function as splitBy in Context but for Strings,
64 -- not Text (without pack and unpack operations that are not needed).
65 chunks :: Int -> Int -> String -> [String]
66 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
67
68 allChunks :: [Int] -> Int -> String -> [String]
69 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
70
71 allChunks' :: [Int] -> Int -> String -> [[String]]
72 allChunks' ns m st = map (\n -> chunks n m st) ns
73
74 ------------------------------------------------------------------------
75 -- * Analyze candidate
76 type StringSize = Int
77 type TotalFreq = Int
78 type Freq = Int
79 type Word = String
80
81 data LangWord = LangWord Lang Word
82
83 type LangProba = Map Lang Double
84
85 ------------------------------------------------------------------------
86 detectLangs :: String -> [(Lang, Double)]
87 detectLangs s = DL.reverse $ DL.sortOn snd
88 $ toList
89 $ detect (wordsToBook [0..2] s) testEL
90
91 textMining :: Lang -> String
92 textMining EN = EN.textMining
93 textMining FR = FR.textMining
94 --textMining DE = DE.textMining
95 --textMining SP = SP.textMining
96 --textMining CH = CH.textMining
97
98 langWord :: Lang -> LangWord
99 langWord l = LangWord l (textMining l)
100
101 testEL :: EventLang
102 testEL = toEventLangs [0..2] [ langWord l | l <- allLangs ]
103
104 detect :: EventBook -> EventLang -> LangProba
105 detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq
106
107 ------------------------------------------------------------------------
108 -- | TODO: monoids
109 type EventLang = Map Lang EventBook
110 toEventLangs :: [Int] -> [LangWord] -> EventLang
111 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
112
113 emptyEventLang :: [Int] -> EventLang
114 emptyEventLang ns = toLang ns (LangWord FR "")
115
116 toLang :: [Int] -> LangWord -> EventLang
117 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
118
119 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
120 opLang f = DM.unionWith (op f)
121
122 ------------------------------------------------------------------------
123 -- | TODO: monoids (but proba >= 0)
124
125 peb :: String -> EventBook -> Double
126 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
127 where
128 a = maybe 0 identity $ DM.lookup st mapFreq
129 b = maybe 1 identity $ DM.lookup (length st) mapN
130
131 data EventBook = EventBook { events_freq :: Map String Freq
132 , events_n :: Map StringSize TotalFreq
133 }
134 deriving (Show)
135
136 emptyEventBook :: [Int] -> EventBook
137 emptyEventBook ns = wordToBook ns " "
138
139 wordsToBook :: [Int] -> String -> EventBook
140 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
141 where
142 ws = map unpack $ words $ pack txt
143 eventsBook = map (wordToBook ns) ws
144
145 wordToBook :: [Int] -> Word -> EventBook
146 wordToBook ns txt = EventBook ef en
147 where
148 chks = allChunks' ns 10 txt
149 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
150 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
151
152 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
153 op f (EventBook ef1 en1)
154 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
155 (DM.unionWith f en1 en2)
156
157
158 ------------------------------------------------------------------------
159 ------------------------------------------------------------------------
160 -- * Make the distributions
161 makeDist :: [String] -> D.T Double String
162 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
163
164 stopDist :: D.T Double String
165 stopDist = makeDist stopList
166
167 candDist :: D.T Double String
168 candDist = makeDist candList
169
170 ------------------------------------------------------------------------
171 sumProba :: Num a => D.T a String -> [Char] -> a
172 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
173
174 -- | Get probability according a distribution
175 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
176 (~?) ds x = (==x) ?? ds
177
178 ------------------------------------------------------------------------
179 candidate :: [Char] -> Candidate
180 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
181
182 ------------------------------------------------------------------------
183 candList :: [String]
184 candList = [ "france", "alexandre", "mael", "constitution"
185 , "etats-unis", "associes", "car", "train", "spam"]
186
187
188 stopList :: [String]
189 stopList = map show ([0..9]::[Int]) <> [
190 "a","a's","able","about","above","apply","according","accordingly",
191 "across","actually","after","afterwards","again","against",
192 "ain't","all","allow","allows","almost","alone","along",
193 "involves", "already","also","although","always","am","among","amongst",
194 "an","and","another","any","anybody","anyhow","anyone","anything",
195 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
196 "are","aren't","around","as","aside","ask","asking","associated","at",
197 "available","away","awfully","based", "b","be","became","because","become",
198 "becomes","becoming","been","before","beforehand","behind","being",
199 "believe","below","beside","besides","best","better","between","beyond",
200 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
201 "cant","cause","causes","certain","certainly","changes","clearly","co",
202 "com","come","comes","common","concerning","consequently","consider","considering",
203 "contain","containing","contains","corresponding","could","couldn't","course",
204 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
205 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
206 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
207 "especially","et","etc","even","ever","every","everybody","everyone",
208 "everything","everywhere","ex","exactly","example","except","f","far",
209 "few","find","fifth","first","five","followed","following","follows","for",
210 "former","formerly","forth","four","from","further","furthermore","g",
211 "get","gets","getting","given","gives","go","goes","going","gone","got",
212 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
213 "have","haven't","having","he","he's","hello","help","hence","her","here",
214 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
215 "him","himself","his","hither","hopefully","how","howbeit","however","i",
216 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
217 "inc","indeed","indicate","indicated","indicates","inner","insofar",
218 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
219 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
220 "last","lately","later","latter","latterly","least","less","lest","let",
221 "let's","like","liked","likely","little","look","looking","looks","ltd",
222 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
223 "more","moreover","most","mostly","much","must","my","myself","n",
224 "name","namely","nd","near","nearly","necessary","need","needs","neither",
225 "never","nevertheless","new","next","nine","no","nobody","non","none",
226 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
227 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
228 "ones","only","onto","or","other","others","otherwise","ought","our",
229 "ours","ourselves","out","outside","over","overall","own","p","particular",
230 "particularly","per","perhaps","placed","please","plus","possible",
231 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
232 "rd","re","really","reasonably","regarding","regardless","regards",
233 "relatively","respectively","right","s","said","same","saw","say",
234 "saying","says","second","secondly","see","seeing","seem","seemed",
235 "seeming","seems","seen","self","selves","sensible","sent","serious",
236 "seriously","seven","several","shall","she","should","shouldn't","since",
237 "six","so","some","somebody","somehow","someone","something","sometime",
238 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
239 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
240 "tell","tends","th","than","thank","thanks","thanx","that","that's",
241 "thats","the","their","theirs","them","themselves","then","thence","there",
242 "there's","thereafter","thereby","therefore","therein","theres",
243 "thereupon","these","they","they'd","they'll","they're","they've",
244 "think","third","this","thorough","thoroughly","those","though","three",
245 "through","throughout","thru","thus","to","together","too","took","toward",
246 "towards","tried","tries","truly","try","trying","twice","two","u","un",
247 "under","unfortunately","unless","unlikely","until","unto","up","upon",
248 "us","use","used","useful","uses","using","usually","uucp","v","value",
249 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
250 "we","we'd","we'll","we're","we've","welcome","well","went","were",
251 "weren't","what","what's","whatever","when","whence","whenever","where",
252 "where's","whereafter","whereas","whereby","wherein","whereupon",
253 "wherever","whether","which","while","whither","who","who's","whoever",
254 "whole","whom","whose","why","will","willing","wish","with","within",
255 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
256 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
257 "z","zero"]
258
259
260
261