src/Gargantext/Text/Terms/Stop.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Stop words and (how to learn it).
  11
  12 Main type here is String.
  13
  14 -}
  15
  16 {-# LANGUAGE NoImplicitPrelude #-}
  17
  18 module Gargantext.Text.Terms.Stop
  19   where
  20
  21 import GHC.Base (Functor)
  22 import Numeric.Probability.Distribution ((??))
  23 import qualified Numeric.Probability.Distribution as D
  24
  25 import Data.Char (toLower)
  26 import qualified Data.List as DL
  27
  28 import Data.Maybe (maybe)
  29 import Data.Map.Strict (Map, toList)
  30 import qualified Data.Map.Strict as DM
  31
  32 import Data.String (String)
  33
  34 import Data.Text (pack, unpack)
  35 import Data.Tuple.Extra (both)
  36
  37 import Gargantext.Prelude
  38 import Gargantext.Core (Lang(..), allLangs)
  39 import Gargantext.Text.Terms.Mono (words)
  40 import Gargantext.Text.Metrics.Count (occurrencesWith)
  41
  42 import qualified Gargantext.Text.Samples.FR as FR
  43 import qualified Gargantext.Text.Samples.EN as EN
  44 --import qualified Gargantext.Text.Samples.DE as DE
  45 --import qualified Gargantext.Text.Samples.SP as SP
  46 --import qualified Gargantext.Text.Samples.CH as CH
  47
  48 ------------------------------------------------------------------------
  49 data Candidate = Candidate { stop :: Double
  50                            , noStop :: Double
  51  } deriving (Show)
  52
  53 -- * String preparation
  54
  55 -- | String prepare
  56 blanks :: String -> String
  57 blanks [] = []
  58 blanks xs = [' '] <> xs <> [' ']
  59
  60 -- | Blocks increase the size of the word to ease computations
  61 -- some border and unexepected effects can happen, need to be tested
  62 blockOf :: Int -> String -> String
  63 blockOf n = DL.concat . DL.take n . DL.repeat
  64
  65 -- | Chunks is the same function as splitBy in Context but for Strings,
  66 -- not Text (without pack and unpack operations that are not needed).
  67 chunks :: Int -> Int -> String -> [String]
  68 chunks n m = DL.take m . filter (not . all (== ' '))
  69                        . chunkAlong (n+1) 1
  70                        . DL.concat
  71                        . DL.take 1000
  72                        . DL.repeat
  73                        . blanks
  74
  75 allChunks :: [Int] -> Int -> String -> [String]
  76 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
  77
  78 allChunks' :: [Int] -> Int -> String -> [[String]]
  79 allChunks' ns m st = map (\n -> chunks n m st) ns
  80
  81 ------------------------------------------------------------------------
  82 -- * Analyze candidate
  83 type StringSize = Int
  84 type TotalFreq  = Int
  85 type Freq       = Int
  86 type Word       = String
  87
  88 data LangWord = LangWord Lang Word
  89
  90 type LangProba = Map Lang Double
  91
  92 ------------------------------------------------------------------------
  93 detectLangs :: String -> [(Lang, Double)]
  94 detectLangs s =  DL.reverse $ DL.sortOn snd
  95                             $ toList
  96                             $ detect (wordsToBook [0..2] s) eventLang
  97
  98 part :: (Eq p, Fractional p) => p -> p -> p
  99 part 0 _ = 0
 100 part _ 0 = 0
 101 part x y = x / y
 102
 103 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
 104                  t (a, b) -> t (a, b)
 105 toProba xs = map (\(a,b) -> (a, part b total)) xs
 106   where
 107     total = sum $ map snd xs
 108
 109 textSample :: Lang -> String
 110 textSample EN = EN.textSample
 111 textSample FR = FR.textSample
 112 --textSample DE = DE.textSample
 113 --textSample SP = SP.textSample
 114 --textSample CH = CH.textSample
 115
 116 langWord :: Lang -> LangWord
 117 langWord l = LangWord l (textSample l)
 118
 119 eventLang :: EventLang
 120 eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
 121
 122 detect :: EventBook -> EventLang -> LangProba
 123 detect (EventBook mapFreq _) el =
 124   DM.unionsWith (+)
 125   $ map DM.fromList
 126   $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
 127   $ filter (\x -> fst x /= "  ")
 128   $ DM.toList mapFreq
 129
 130 ------------------------------------------------------------------------
 131 -- | TODO: monoids
 132 type EventLang = Map Lang EventBook
 133
 134 toEventLangs :: [Int] -> [LangWord] -> EventLang
 135 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
 136
 137 emptyEventLang :: [Int] -> EventLang
 138 emptyEventLang ns = toLang ns (LangWord FR "")
 139
 140 toLang :: [Int] -> LangWord -> EventLang
 141 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
 142
 143 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
 144 opLang f = DM.unionWith (op f)
 145
 146 ------------------------------------------------------------------------
 147 -- | TODO: monoids (but proba >= 0)
 148
 149 peb :: String -> EventBook -> Double
 150 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
 151   where
 152     a = maybe 0 identity $ DM.lookup st mapFreq
 153     b = maybe 1 identity $ DM.lookup (length st) mapN
 154
 155 peb' :: String -> EventBook -> (Freq, TotalFreq)
 156 peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
 157   where
 158     a = maybe 0 identity $ DM.lookup st mapFreq
 159     b = maybe 1 identity $ DM.lookup (length st) mapN
 160
 161 ------------------------------------------------------------------------
 162 toPrior :: String -> EventLang -> [(Lang, Double)]
 163 toPrior s el = prior $ pebLang s el
 164
 165 pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
 166 pebLang st = map (\(l,eb) -> (l, peb' st eb)) .  DM.toList
 167 ------------------------------------------------------------------------
 168 prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
 169 prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
 170                                 (map (\(a,b) -> a / b) ps')
 171   where
 172
 173     (ls, ps'') = DL.unzip ps
 174     ps' = map (both fromIntegral) ps''
 175 ------------------------------------------------------------------------
 176 data EventBook = EventBook { events_freq :: Map String     Freq
 177                            , events_n    :: Map StringSize TotalFreq
 178                            }
 179                              deriving (Show)
 180
 181 emptyEventBook :: [Int] -> EventBook
 182 emptyEventBook ns = wordToBook ns " "
 183
 184 wordsToBook :: [Int] -> String -> EventBook
 185 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
 186   where
 187     ws = map unpack $ words $ pack txt
 188     eventsBook = map (wordToBook ns) ws
 189
 190 wordToBook :: [Int] -> Word -> EventBook
 191 wordToBook ns txt = EventBook ef en
 192   where
 193     chks = allChunks' ns 10 txt
 194     en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
 195     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 196
 197 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 198 op f (EventBook ef1 en1)
 199      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 200                                      (DM.unionWith f en1 en2)
 201
 202 ------------------------------------------------------------------------
 203 ------------------------------------------------------------------------
 204 -- * Make the distributions
 205 makeDist :: [String] -> D.T Double String
 206 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
 207
 208 stopDist :: D.T Double String
 209 stopDist = makeDist stopList
 210
 211 candDist :: D.T Double String
 212 candDist = makeDist candList
 213
 214 ------------------------------------------------------------------------
 215 sumProba :: Num a => D.T a String -> [Char] -> a
 216 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
 217
 218 -- | Get probability according a distribution
 219 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 220 (~?) ds x = (==x) ?? ds
 221
 222 ------------------------------------------------------------------------
 223 candidate :: [Char] -> Candidate
 224 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 225
 226 ------------------------------------------------------------------------
 227 candList :: [String]
 228 candList = [ "france", "alexandre", "mael", "constitution"
 229            , "etats-unis", "associes", "car", "train", "spam"]
 230
 231
 232 stopList :: [String]
 233 stopList = map show ([0..9]::[Int]) <> [
 234     "a","a's","able","about","above","apply","according","accordingly",
 235     "across","actually","after","afterwards","again","against",
 236     "ain't","all","allow","allows","almost","alone","along",
 237     "involves", "already","also","although","always","am","among","amongst",
 238     "an","and","another","any","anybody","anyhow","anyone","anything",
 239     "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
 240     "are","aren't","around","as","aside","ask","asking","associated","at",
 241     "available","away","awfully","based", "b","be","became","because","become",
 242     "becomes","becoming","been","before","beforehand","behind","being",
 243     "believe","below","beside","besides","best","better","between","beyond",
 244     "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
 245     "cant","cause","causes","certain","certainly","changes","clearly","co",
 246     "com","come","comes","common","concerning","consequently","consider","considering",
 247     "contain","containing","contains","corresponding","could","couldn't","course",
 248     "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
 249     "do","does","doesn't","doing","don't","done","down","downwards","during","e",
 250     "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
 251     "especially","et","etc","even","ever","every","everybody","everyone",
 252     "everything","everywhere","ex","exactly","example","except","f","far",
 253     "few","find","fifth","first","five","followed","following","follows","for",
 254     "former","formerly","forth","four","from","further","furthermore","g",
 255     "get","gets","getting","given","gives","go","goes","going","gone","got",
 256     "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
 257     "have","haven't","having","he","he's","hello","help","hence","her","here",
 258     "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
 259     "him","himself","his","hither","hopefully","how","howbeit","however","i",
 260     "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
 261     "inc","indeed","indicate","indicated","indicates","inner","insofar",
 262     "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
 263     "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
 264     "last","lately","later","latter","latterly","least","less","lest","let",
 265     "let's","like","liked","likely","little","look","looking","looks","ltd",
 266     "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
 267     "more","moreover","most","mostly","much","must","my","myself","n",
 268     "name","namely","nd","near","nearly","necessary","need","needs","neither",
 269     "never","nevertheless","new","next","nine","no","nobody","non","none",
 270     "noone","nor","normally","not","nothing","novel","now","nowhere","o",
 271     "obviously","of","off","often","oh","ok","okay","old","on","once","one",
 272     "ones","only","onto","or","other","others","otherwise","ought","our",
 273     "ours","ourselves","out","outside","over","overall","own","p","particular",
 274     "particularly","per","perhaps","placed","please","plus","possible",
 275     "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
 276     "rd","re","really","reasonably","regarding","regardless","regards",
 277     "relatively","respectively","right","s","said","same","saw","say",
 278     "saying","says","second","secondly","see","seeing","seem","seemed",
 279     "seeming","seems","seen","self","selves","sensible","sent","serious",
 280     "seriously","seven","several","shall","she","should","shouldn't","since",
 281     "six","so","some","somebody","somehow","someone","something","sometime",
 282     "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
 283     "specifying","still","sub","such","sup","sure","t","t's","take","taken",
 284     "tell","tends","th","than","thank","thanks","thanx","that","that's",
 285     "thats","the","their","theirs","them","themselves","then","thence","there",
 286     "there's","thereafter","thereby","therefore","therein","theres",
 287     "thereupon","these","they","they'd","they'll","they're","they've",
 288     "think","third","this","thorough","thoroughly","those","though","three",
 289     "through","throughout","thru","thus","to","together","too","took","toward",
 290     "towards","tried","tries","truly","try","trying","twice","two","u","un",
 291     "under","unfortunately","unless","unlikely","until","unto","up","upon",
 292     "us","use","used","useful","uses","using","usually","uucp","v","value",
 293     "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
 294     "we","we'd","we'll","we're","we've","welcome","well","went","were",
 295     "weren't","what","what's","whatever","when","whence","whenever","where",
 296     "where's","whereafter","whereas","whereby","wherein","whereupon",
 297     "wherever","whether","which","while","whither","who","who's","whoever",
 298     "whole","whom","whose","why","will","willing","wish","with","within",
 299     "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
 300     "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
 301     "z","zero"]
 302
 303
 304
 305