]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Stop.hs
Merge branch 'master' into bayes
[gargantext.git] / src / Gargantext / Text / Terms / Stop.hs
1 {-|
2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Stop words and (how to learn it).
11
12 Main type here is String.
13
14 -}
15
16 {-# LANGUAGE NoImplicitPrelude #-}
17
18 module Gargantext.Text.Terms.Stop
19 where
20
21 import Numeric.Probability.Distribution ((??))
22 import qualified Numeric.Probability.Distribution as D
23
24 import Data.String (String)
25
26 import Data.Char (toLower)
27 import qualified Data.List as DL
28 -- import qualified Data.Map as M
29
30 import Gargantext.Prelude
31
32 data Candidate = Candidate { stop :: Double
33 , noStop :: Double
34 } deriving (Show)
35
36 -- * String preparation
37
38 -- | String prepare
39 blanks :: String -> String
40 blanks [] = []
41 blanks xs = [' '] <> xs <> [' ']
42
43 -- | Blocks increase the size of the word to ease computations
44 -- some border and unexepected effects can happen, need to be tested
45 blockOf :: Int -> String -> String
46 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
47
48 -- | Chunks is the same function as splitBy in Context but for Strings,
49 -- not Text (without pack and unpack operations that are not needed).
50 chunks :: Int -> Int -> String -> [String]
51 chunks n m = DL.take m . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
52
53 allChunks :: [Int] -> Int -> String -> [String]
54 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
55
56 ------------------------------------------------------------------------
57 -- * Make the distributions
58 makeDist :: [String] -> D.T Double String
59 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
60
61 stopDist :: D.T Double String
62 stopDist = makeDist stopList
63
64 candDist :: D.T Double String
65 candDist = makeDist candList
66
67 ------------------------------------------------------------------------
68 -- * Analyze candidate
69 sumProba :: Num a => D.T a String -> [Char] -> a
70 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
71
72 -- | Get probability according a distribution
73 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
74 (~?) ds x = (==x) ?? ds
75
76 ------------------------------------------------------------------------
77 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
78
79 ------------------------------------------------------------------------
80 candList :: [String]
81 candList = ["france", "alexandre", "mael", "constitution", "delanoe", "etats-unis", "associes", "car", "train", "spam"]
82
83
84 stopList :: [String]
85 stopList = map show ([0..9]::[Int]) <> [
86 "a","a's","able","about","above","apply","according","accordingly",
87 "across","actually","after","afterwards","again","against",
88 "ain't","all","allow","allows","almost","alone","along",
89 "involves", "already","also","although","always","am","among","amongst",
90 "an","and","another","any","anybody","anyhow","anyone","anything",
91 "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
92 "are","aren't","around","as","aside","ask","asking","associated","at",
93 "available","away","awfully","based", "b","be","became","because","become",
94 "becomes","becoming","been","before","beforehand","behind","being",
95 "believe","below","beside","besides","best","better","between","beyond",
96 "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
97 "cant","cause","causes","certain","certainly","changes","clearly","co",
98 "com","come","comes","common","concerning","consequently","consider","considering",
99 "contain","containing","contains","corresponding","could","couldn't","course",
100 "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
101 "do","does","doesn't","doing","don't","done","down","downwards","during","e",
102 "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
103 "especially","et","etc","even","ever","every","everybody","everyone",
104 "everything","everywhere","ex","exactly","example","except","f","far",
105 "few","find","fifth","first","five","followed","following","follows","for",
106 "former","formerly","forth","four","from","further","furthermore","g",
107 "get","gets","getting","given","gives","go","goes","going","gone","got",
108 "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
109 "have","haven't","having","he","he's","hello","help","hence","her","here",
110 "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
111 "him","himself","his","hither","hopefully","how","howbeit","however","i",
112 "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
113 "inc","indeed","indicate","indicated","indicates","inner","insofar",
114 "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
115 "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
116 "last","lately","later","latter","latterly","least","less","lest","let",
117 "let's","like","liked","likely","little","look","looking","looks","ltd",
118 "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
119 "more","moreover","most","mostly","much","must","my","myself","n",
120 "name","namely","nd","near","nearly","necessary","need","needs","neither",
121 "never","nevertheless","new","next","nine","no","nobody","non","none",
122 "noone","nor","normally","not","nothing","novel","now","nowhere","o",
123 "obviously","of","off","often","oh","ok","okay","old","on","once","one",
124 "ones","only","onto","or","other","others","otherwise","ought","our",
125 "ours","ourselves","out","outside","over","overall","own","p","particular",
126 "particularly","per","perhaps","placed","please","plus","possible",
127 "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
128 "rd","re","really","reasonably","regarding","regardless","regards",
129 "relatively","respectively","right","s","said","same","saw","say",
130 "saying","says","second","secondly","see","seeing","seem","seemed",
131 "seeming","seems","seen","self","selves","sensible","sent","serious",
132 "seriously","seven","several","shall","she","should","shouldn't","since",
133 "six","so","some","somebody","somehow","someone","something","sometime",
134 "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
135 "specifying","still","sub","such","sup","sure","t","t's","take","taken",
136 "tell","tends","th","than","thank","thanks","thanx","that","that's",
137 "thats","the","their","theirs","them","themselves","then","thence","there",
138 "there's","thereafter","thereby","therefore","therein","theres",
139 "thereupon","these","they","they'd","they'll","they're","they've",
140 "think","third","this","thorough","thoroughly","those","though","three",
141 "through","throughout","thru","thus","to","together","too","took","toward",
142 "towards","tried","tries","truly","try","trying","twice","two","u","un",
143 "under","unfortunately","unless","unlikely","until","unto","up","upon",
144 "us","use","used","useful","uses","using","usually","uucp","v","value",
145 "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
146 "we","we'd","we'll","we're","we've","welcome","well","went","were",
147 "weren't","what","what's","whatever","when","whence","whenever","where",
148 "where's","whereafter","whereas","whereby","wherein","whereupon",
149 "wherever","whether","which","while","whither","who","who's","whoever",
150 "whole","whom","whose","why","will","willing","wish","with","within",
151 "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
152 "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
153 "z","zero"]
154
155
156
157