]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Stop.hs
[FEAT] Lang detect.
[gargantext.git] / src / Gargantext / Text / Terms / Stop.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Stop words and (how to learn it).
11
12 Main type here is String.
13
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17
18 module Gargantext.Text.Terms.Stop
19 where
20
21 import Numeric.Probability.Distribution ((??))
22 import qualified Numeric.Probability.Distribution as D
23
24 import Data.Char (toLower)
25 import qualified Data.List as DL
26
27 import Data.Maybe (maybe)
28 import Data.Map.Strict (Map)
29 import qualified Data.Map.Strict as DM
30
31 import Data.String (String)
32
33 import Data.Text (pack, unpack)
34
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..))
37 import Gargantext.Text.Terms.Mono (words)
38 import Gargantext.Text.Metrics.Count (occurrencesWith)
39
40 ------------------------------------------------------------------------
41 data Candidate = Candidate { stop :: Double
42 , noStop :: Double
43 } deriving (Show)
44
45 -- * String preparation
46
47 -- | String prepare
48 blanks :: String -> String
49 blanks [] = []
50 blanks xs = [' '] <> xs <> [' ']
51
52 -- | Blocks increase the size of the word to ease computations
53 -- some border and unexepected effects can happen, need to be tested
54 blockOf :: Int -> String -> String
55 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
56
57 -- | Chunks is the same function as splitBy in Context but for Strings,
58 -- not Text (without pack and unpack operations that are not needed).
59 chunks :: Int -> Int -> String -> [String]
60 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
61
62 allChunks :: [Int] -> Int -> String -> [String]
63 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
64
65 allChunks' :: [Int] -> Int -> String -> [[String]]
66 allChunks' ns m st = map (\n -> chunks n m st) ns
67
68 ------------------------------------------------------------------------
69 -- * Analyze candidate
70 type StringSize = Int
71 type TotalFreq = Int
72 type Freq = Int
73 type Word = String
74
75 data LangWord = LangWord Lang Word
76
77 type LangProba = Map Lang Double
78
79 ------------------------------------------------------------------------
80
81 estimeTest :: String -> LangProba
82 estimeTest s = estime (wordsToBook [0..2] s) testEL
83
84 testEL :: EventLang
85 testEL = toEventLangs [0,1,2] [ LangWord EN "Lovely day. This day."
86 , LangWord FR "Belle journée, j'y vais."
87 , LangWord EN "Hello Sir, how are you doing? I am fine thank you, good bye"
88 , LangWord FR "Bonjour Monsieur, comment allez-vous? Je vais bien merci."
89 ]
90
91 estime :: EventBook -> EventLang -> LangProba
92 estime (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq
93
94 ------------------------------------------------------------------------
95 -- | TODO: monoids
96 type EventLang = Map Lang EventBook
97 toEventLangs :: [Int] -> [LangWord] -> EventLang
98 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
99
100 emptyEventLang :: [Int] -> EventLang
101 emptyEventLang ns = toLang ns (LangWord FR "")
102
103 toLang :: [Int] -> LangWord -> EventLang
104 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
105
106 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
107 opLang f = DM.unionWith (op f)
108
109 ------------------------------------------------------------------------
110 -- | TODO: monoids (but proba >= 0)
111
112 peb :: String -> EventBook -> Double
113 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
114 where
115 a = maybe 0 identity $ DM.lookup st mapFreq
116 b = maybe 1 identity $ DM.lookup (length st) mapN
117
118 data EventBook = EventBook { events_freq :: Map String Freq
119 , events_n :: Map StringSize TotalFreq
120 }
121 deriving (Show)
122
123 emptyEventBook :: [Int] -> EventBook
124 emptyEventBook ns = wordToBook ns " "
125
126 wordsToBook :: [Int] -> String -> EventBook
127 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
128 where
129 ws = map unpack $ words $ pack txt
130 eventsBook = map (wordToBook ns) ws
131
132 wordToBook :: [Int] -> Word -> EventBook
133 wordToBook ns txt = EventBook ef en
134 where
135 chks = allChunks' ns 10 txt
136 en = DM.fromList $ map (\(n,ns) -> (n, length ns)) $ zip ns chks
137 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
138
139 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
140 op f (EventBook ef1 en1)
141 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
142 (DM.unionWith f en1 en2)
143
144
145 ------------------------------------------------------------------------
146 ------------------------------------------------------------------------
147 -- * Make the distributions
148 makeDist :: [String] -> D.T Double String
149 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
150
151 stopDist :: D.T Double String
152 stopDist = makeDist stopList
153
154 candDist :: D.T Double String
155 candDist = makeDist candList
156
157 ------------------------------------------------------------------------
158 sumProba :: Num a => D.T a String -> [Char] -> a
159 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
160
161 -- | Get probability according a distribution
162 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
163 (~?) ds x = (==x) ?? ds
164
165 ------------------------------------------------------------------------
166 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
167
168 ------------------------------------------------------------------------
169 candList :: [String]
170 candList = [ "france", "alexandre", "mael", "constitution"
171 , "etats-unis", "associes", "car", "train", "spam"]
172
173
174 stopList :: [String]
175 stopList = map show ([0..9]::[Int]) <> [
176 "a","a's","able","about","above","apply","according","accordingly",
177 "across","actually","after","afterwards","again","against",
178 "ain't","all","allow","allows","almost","alone","along",
179 "involves", "already","also","although","always","am","among","amongst",
180 "an","and","another","any","anybody","anyhow","anyone","anything",
181 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
182 "are","aren't","around","as","aside","ask","asking","associated","at",
183 "available","away","awfully","based", "b","be","became","because","become",
184 "becomes","becoming","been","before","beforehand","behind","being",
185 "believe","below","beside","besides","best","better","between","beyond",
186 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
187 "cant","cause","causes","certain","certainly","changes","clearly","co",
188 "com","come","comes","common","concerning","consequently","consider","considering",
189 "contain","containing","contains","corresponding","could","couldn't","course",
190 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
191 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
192 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
193 "especially","et","etc","even","ever","every","everybody","everyone",
194 "everything","everywhere","ex","exactly","example","except","f","far",
195 "few","find","fifth","first","five","followed","following","follows","for",
196 "former","formerly","forth","four","from","further","furthermore","g",
197 "get","gets","getting","given","gives","go","goes","going","gone","got",
198 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
199 "have","haven't","having","he","he's","hello","help","hence","her","here",
200 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
201 "him","himself","his","hither","hopefully","how","howbeit","however","i",
202 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
203 "inc","indeed","indicate","indicated","indicates","inner","insofar",
204 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
205 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
206 "last","lately","later","latter","latterly","least","less","lest","let",
207 "let's","like","liked","likely","little","look","looking","looks","ltd",
208 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
209 "more","moreover","most","mostly","much","must","my","myself","n",
210 "name","namely","nd","near","nearly","necessary","need","needs","neither",
211 "never","nevertheless","new","next","nine","no","nobody","non","none",
212 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
213 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
214 "ones","only","onto","or","other","others","otherwise","ought","our",
215 "ours","ourselves","out","outside","over","overall","own","p","particular",
216 "particularly","per","perhaps","placed","please","plus","possible",
217 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
218 "rd","re","really","reasonably","regarding","regardless","regards",
219 "relatively","respectively","right","s","said","same","saw","say",
220 "saying","says","second","secondly","see","seeing","seem","seemed",
221 "seeming","seems","seen","self","selves","sensible","sent","serious",
222 "seriously","seven","several","shall","she","should","shouldn't","since",
223 "six","so","some","somebody","somehow","someone","something","sometime",
224 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
225 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
226 "tell","tends","th","than","thank","thanks","thanx","that","that's",
227 "thats","the","their","theirs","them","themselves","then","thence","there",
228 "there's","thereafter","thereby","therefore","therein","theres",
229 "thereupon","these","they","they'd","they'll","they're","they've",
230 "think","third","this","thorough","thoroughly","those","though","three",
231 "through","throughout","thru","thus","to","together","too","took","toward",
232 "towards","tried","tries","truly","try","trying","twice","two","u","un",
233 "under","unfortunately","unless","unlikely","until","unto","up","upon",
234 "us","use","used","useful","uses","using","usually","uucp","v","value",
235 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
236 "we","we'd","we'll","we're","we've","welcome","well","went","were",
237 "weren't","what","what's","whatever","when","whence","whenever","where",
238 "where's","whereafter","whereas","whereby","wherein","whereupon",
239 "wherever","whether","which","while","whither","who","who's","whoever",
240 "whole","whom","whose","why","will","willing","wish","with","within",
241 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
242 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
243 "z","zero"]
244
245
246
247