]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Stop.hs
[FIX] adding missing files.
[gargantext.git] / src / Gargantext / Text / Terms / Stop.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Stop words and (how to learn it).
11
12 Main type here is String.
13
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17
18 module Gargantext.Text.Terms.Stop
19 where
20
21 import Numeric.Probability.Distribution ((??))
22 import qualified Numeric.Probability.Distribution as D
23
24 import Data.Char (toLower)
25 import qualified Data.List as DL
26
27 import Data.Maybe (maybe)
28 import Data.Map.Strict (Map, toList)
29 import qualified Data.Map.Strict as DM
30
31 import Data.String (String)
32
33 import Data.Text (pack, unpack)
34
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Text.Terms.Mono (words)
38 import Gargantext.Text.Metrics.Count (occurrencesWith)
39
40 import Gargantext.Text.Samples.FR as FR
41 import Gargantext.Text.Samples.EN as EN
42 import Gargantext.Text.Samples.DE as DE
43 import Gargantext.Text.Samples.SP as SP
44 import Gargantext.Text.Samples.CH as CH
45
46 ------------------------------------------------------------------------
47 data Candidate = Candidate { stop :: Double
48 , noStop :: Double
49 } deriving (Show)
50
51 -- * String preparation
52
53 -- | String prepare
54 blanks :: String -> String
55 blanks [] = []
56 blanks xs = [' '] <> xs <> [' ']
57
58 -- | Blocks increase the size of the word to ease computations
59 -- some border and unexepected effects can happen, need to be tested
60 blockOf :: Int -> String -> String
61 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
62
63 -- | Chunks is the same function as splitBy in Context but for Strings,
64 -- not Text (without pack and unpack operations that are not needed).
65 chunks :: Int -> Int -> String -> [String]
66 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
67
68 allChunks :: [Int] -> Int -> String -> [String]
69 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
70
71 allChunks' :: [Int] -> Int -> String -> [[String]]
72 allChunks' ns m st = map (\n -> chunks n m st) ns
73
74 ------------------------------------------------------------------------
75 -- * Analyze candidate
76 type StringSize = Int
77 type TotalFreq = Int
78 type Freq = Int
79 type Word = String
80
81 data LangWord = LangWord Lang Word
82
83 type LangProba = Map Lang Double
84
85 ------------------------------------------------------------------------
86 detectLangs :: String -> [(Lang, Double)]
87 detectLangs s = DL.reverse $ DL.sortOn snd
88 $ toList
89 $ detect (wordsToBook [0..2] s) testEL
90
91 testEL :: EventLang
92 testEL = toEventLangs [0..2] [ LangWord EN EN.textMining
93 , LangWord FR FR.textMining
94 , LangWord DE DE.textMining
95 , LangWord SP SP.textMining
96 , LangWord CH CH.textMining
97 ]
98
99 detect :: EventBook -> EventLang -> LangProba
100 detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq
101
102 ------------------------------------------------------------------------
103 -- | TODO: monoids
104 type EventLang = Map Lang EventBook
105 toEventLangs :: [Int] -> [LangWord] -> EventLang
106 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
107
108 emptyEventLang :: [Int] -> EventLang
109 emptyEventLang ns = toLang ns (LangWord FR "")
110
111 toLang :: [Int] -> LangWord -> EventLang
112 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
113
114 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
115 opLang f = DM.unionWith (op f)
116
117 ------------------------------------------------------------------------
118 -- | TODO: monoids (but proba >= 0)
119
120 peb :: String -> EventBook -> Double
121 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
122 where
123 a = maybe 0 identity $ DM.lookup st mapFreq
124 b = maybe 1 identity $ DM.lookup (length st) mapN
125
126 data EventBook = EventBook { events_freq :: Map String Freq
127 , events_n :: Map StringSize TotalFreq
128 }
129 deriving (Show)
130
131 emptyEventBook :: [Int] -> EventBook
132 emptyEventBook ns = wordToBook ns " "
133
134 wordsToBook :: [Int] -> String -> EventBook
135 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
136 where
137 ws = map unpack $ words $ pack txt
138 eventsBook = map (wordToBook ns) ws
139
140 wordToBook :: [Int] -> Word -> EventBook
141 wordToBook ns txt = EventBook ef en
142 where
143 chks = allChunks' ns 10 txt
144 en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
145 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
146
147 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
148 op f (EventBook ef1 en1)
149 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
150 (DM.unionWith f en1 en2)
151
152
153 ------------------------------------------------------------------------
154 ------------------------------------------------------------------------
155 -- * Make the distributions
156 makeDist :: [String] -> D.T Double String
157 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
158
159 stopDist :: D.T Double String
160 stopDist = makeDist stopList
161
162 candDist :: D.T Double String
163 candDist = makeDist candList
164
165 ------------------------------------------------------------------------
166 sumProba :: Num a => D.T a String -> [Char] -> a
167 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
168
169 -- | Get probability according a distribution
170 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
171 (~?) ds x = (==x) ?? ds
172
173 ------------------------------------------------------------------------
174 candidate :: [Char] -> Candidate
175 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
176
177 ------------------------------------------------------------------------
178 candList :: [String]
179 candList = [ "france", "alexandre", "mael", "constitution"
180 , "etats-unis", "associes", "car", "train", "spam"]
181
182
183 stopList :: [String]
184 stopList = map show ([0..9]::[Int]) <> [
185 "a","a's","able","about","above","apply","according","accordingly",
186 "across","actually","after","afterwards","again","against",
187 "ain't","all","allow","allows","almost","alone","along",
188 "involves", "already","also","although","always","am","among","amongst",
189 "an","and","another","any","anybody","anyhow","anyone","anything",
190 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
191 "are","aren't","around","as","aside","ask","asking","associated","at",
192 "available","away","awfully","based", "b","be","became","because","become",
193 "becomes","becoming","been","before","beforehand","behind","being",
194 "believe","below","beside","besides","best","better","between","beyond",
195 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
196 "cant","cause","causes","certain","certainly","changes","clearly","co",
197 "com","come","comes","common","concerning","consequently","consider","considering",
198 "contain","containing","contains","corresponding","could","couldn't","course",
199 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
200 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
201 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
202 "especially","et","etc","even","ever","every","everybody","everyone",
203 "everything","everywhere","ex","exactly","example","except","f","far",
204 "few","find","fifth","first","five","followed","following","follows","for",
205 "former","formerly","forth","four","from","further","furthermore","g",
206 "get","gets","getting","given","gives","go","goes","going","gone","got",
207 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
208 "have","haven't","having","he","he's","hello","help","hence","her","here",
209 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
210 "him","himself","his","hither","hopefully","how","howbeit","however","i",
211 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
212 "inc","indeed","indicate","indicated","indicates","inner","insofar",
213 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
214 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
215 "last","lately","later","latter","latterly","least","less","lest","let",
216 "let's","like","liked","likely","little","look","looking","looks","ltd",
217 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
218 "more","moreover","most","mostly","much","must","my","myself","n",
219 "name","namely","nd","near","nearly","necessary","need","needs","neither",
220 "never","nevertheless","new","next","nine","no","nobody","non","none",
221 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
222 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
223 "ones","only","onto","or","other","others","otherwise","ought","our",
224 "ours","ourselves","out","outside","over","overall","own","p","particular",
225 "particularly","per","perhaps","placed","please","plus","possible",
226 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
227 "rd","re","really","reasonably","regarding","regardless","regards",
228 "relatively","respectively","right","s","said","same","saw","say",
229 "saying","says","second","secondly","see","seeing","seem","seemed",
230 "seeming","seems","seen","self","selves","sensible","sent","serious",
231 "seriously","seven","several","shall","she","should","shouldn't","since",
232 "six","so","some","somebody","somehow","someone","something","sometime",
233 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
234 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
235 "tell","tends","th","than","thank","thanks","thanx","that","that's",
236 "thats","the","their","theirs","them","themselves","then","thence","there",
237 "there's","thereafter","thereby","therefore","therein","theres",
238 "thereupon","these","they","they'd","they'll","they're","they've",
239 "think","third","this","thorough","thoroughly","those","though","three",
240 "through","throughout","thru","thus","to","together","too","took","toward",
241 "towards","tried","tries","truly","try","trying","twice","two","u","un",
242 "under","unfortunately","unless","unlikely","until","unto","up","upon",
243 "us","use","used","useful","uses","using","usually","uucp","v","value",
244 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
245 "we","we'd","we'll","we're","we've","welcome","well","went","were",
246 "weren't","what","what's","whatever","when","whence","whenever","where",
247 "where's","whereafter","whereas","whereby","wherein","whereupon",
248 "wherever","whether","which","while","whither","who","who's","whoever",
249 "whole","whom","whose","why","will","willing","wish","with","within",
250 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
251 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
252 "z","zero"]
253
254
255
256