src/Gargantext/Text/Terms/Stop.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Stop words and (how to learn it).
  11
  12 Main type here is String.
  13
  14 -}
  15
  16 {-# LANGUAGE NoImplicitPrelude #-}
  17
  18 module Gargantext.Text.Terms.Stop
  19   where
  20
  21 import GHC.Base (Functor)
  22 import Numeric.Probability.Distribution ((??))
  23 import qualified Numeric.Probability.Distribution as D
  24
  25 import Data.Char (toLower)
  26 import qualified Data.List as DL
  27
  28 import Data.Maybe (maybe)
  29 import Data.Map.Strict (Map, toList)
  30 import qualified Data.Map.Strict as DM
  31
  32 import Data.String (String)
  33
  34 import Data.Text (pack, unpack)
  35 import Data.Tuple.Extra (both)
  36
  37 import Gargantext.Prelude
  38 import Gargantext.Core (Lang(..), allLangs)
  39 import Gargantext.Text.Terms.Mono (words)
  40 import Gargantext.Text.Metrics.Count (occurrencesWith)
  41
  42 import qualified Gargantext.Text.Samples.FR as FR
  43 import qualified Gargantext.Text.Samples.EN as EN
  44 --import qualified Gargantext.Text.Samples.DE as DE
  45 --import qualified Gargantext.Text.Samples.SP as SP
  46 --import qualified Gargantext.Text.Samples.CH as CH
  47
  48 ------------------------------------------------------------------------
  49 data Candidate = Candidate { stop :: Double
  50                            , noStop :: Double
  51  } deriving (Show)
  52
  53 -- * String preparation
  54
  55 -- | String prepare
  56 blanks :: String -> String
  57 blanks [] = []
  58 blanks xs = [' '] <> xs <> [' ']
  59
  60 -- | Blocks increase the size of the word to ease computations
  61 -- some border and unexepected effects can happen, need to be tested
  62 blockOf :: Int -> String -> String
  63 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
  64
  65 -- | Chunks is the same function as splitBy in Context but for Strings,
  66 -- not Text (without pack and unpack operations that are not needed).
  67 chunks :: Int -> Int -> String -> [String]
  68 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
  69
  70 allChunks :: [Int] -> Int -> String -> [String]
  71 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
  72
  73 allChunks' :: [Int] -> Int -> String -> [[String]]
  74 allChunks' ns m st = map (\n -> chunks n m st) ns
  75
  76 ------------------------------------------------------------------------
  77 -- * Analyze candidate
  78 type StringSize = Int
  79 type TotalFreq  = Int
  80 type Freq       = Int
  81 type Word       = String
  82
  83 data LangWord = LangWord Lang Word
  84
  85 type LangProba = Map Lang Double
  86
  87 ------------------------------------------------------------------------
  88 detectLangs :: String -> [(Lang, Double)]
  89 detectLangs s =  DL.reverse $ DL.sortOn snd
  90                             $ toList
  91                             $ detect (wordsToBook [0..2] s) eventLang
  92
  93 part :: (Eq p, Fractional p) => p -> p -> p
  94 part 0 _ = 0
  95 part _ 0 = 0
  96 part x y = x / y
  97
  98 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
  99                  t (a, b) -> t (a, b)
 100 toProba xs = map (\(a,b) -> (a, part b total)) xs
 101   where
 102     total = sum $ map snd xs
 103
 104 textSample :: Lang -> String
 105 textSample EN = EN.textSample
 106 textSample FR = FR.textSample
 107 --textSample DE = DE.textSample
 108 --textSample SP = SP.textSample
 109 --textSample CH = CH.textSample
 110
 111 langWord :: Lang -> LangWord
 112 langWord l = LangWord l (textSample l)
 113
 114 eventLang :: EventLang
 115 eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
 116
 117 detect :: EventBook -> EventLang -> LangProba
 118 detect (EventBook mapFreq _) el =
 119   DM.unionsWith (+)
 120   $ map DM.fromList
 121   $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
 122   $ filter (\x -> fst x /= "  ")
 123   $ DM.toList mapFreq
 124
 125 ------------------------------------------------------------------------
 126 -- | TODO: monoids
 127 type EventLang = Map Lang EventBook
 128
 129 toEventLangs :: [Int] -> [LangWord] -> EventLang
 130 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
 131
 132 emptyEventLang :: [Int] -> EventLang
 133 emptyEventLang ns = toLang ns (LangWord FR "")
 134
 135 toLang :: [Int] -> LangWord -> EventLang
 136 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
 137
 138 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
 139 opLang f = DM.unionWith (op f)
 140
 141 ------------------------------------------------------------------------
 142 -- | TODO: monoids (but proba >= 0)
 143
 144 peb :: String -> EventBook -> Double
 145 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
 146   where
 147     a = maybe 0 identity $ DM.lookup st mapFreq
 148     b = maybe 1 identity $ DM.lookup (length st) mapN
 149
 150 peb' :: String -> EventBook -> (Freq, TotalFreq)
 151 peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
 152   where
 153     a = maybe 0 identity $ DM.lookup st mapFreq
 154     b = maybe 1 identity $ DM.lookup (length st) mapN
 155
 156 ------------------------------------------------------------------------
 157 toPrior :: String -> EventLang -> [(Lang, Double)]
 158 toPrior s el = prior $ pebLang s el
 159
 160 pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
 161 pebLang st = map (\(l,eb) -> (l, peb' st eb)) .  DM.toList
 162
 163 ------------------------------------------------------------------------
 164 prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
 165 prior ps = zip ls $ zipWith (\x y -> x^99 * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
 166                                 (map (\(a,b) -> a / b) ps')
 167   where
 168
 169     (ls, ps'') = DL.unzip ps
 170     ps' = map (both fromIntegral) ps''
 171 ------------------------------------------------------------------------
 172 data EventBook = EventBook { events_freq :: Map String     Freq
 173                            , events_n    :: Map StringSize TotalFreq
 174                            }
 175                              deriving (Show)
 176
 177 emptyEventBook :: [Int] -> EventBook
 178 emptyEventBook ns = wordToBook ns " "
 179
 180 wordsToBook :: [Int] -> String -> EventBook
 181 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
 182   where
 183     ws = map unpack $ words $ pack txt
 184     eventsBook = map (wordToBook ns) ws
 185
 186 wordToBook :: [Int] -> Word -> EventBook
 187 wordToBook ns txt = EventBook ef en
 188   where
 189     chks = allChunks' ns 10 txt
 190     en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
 191     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 192
 193 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 194 op f (EventBook ef1 en1)
 195      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 196                                      (DM.unionWith f en1 en2)
 197
 198
 199 ------------------------------------------------------------------------
 200 ------------------------------------------------------------------------
 201 -- * Make the distributions
 202 makeDist :: [String] -> D.T Double String
 203 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
 204
 205 stopDist :: D.T Double String
 206 stopDist = makeDist stopList
 207
 208 candDist :: D.T Double String
 209 candDist = makeDist candList
 210
 211 ------------------------------------------------------------------------
 212 sumProba :: Num a => D.T a String -> [Char] -> a
 213 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
 214
 215 -- | Get probability according a distribution
 216 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 217 (~?) ds x = (==x) ?? ds
 218
 219 ------------------------------------------------------------------------
 220 candidate :: [Char] -> Candidate
 221 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 222
 223 ------------------------------------------------------------------------
 224 candList :: [String]
 225 candList = [ "france", "alexandre", "mael", "constitution"
 226            , "etats-unis", "associes", "car", "train", "spam"]
 227
 228
 229 stopList :: [String]
 230 stopList = map show ([0..9]::[Int]) <> [
 231     "a","a's","able","about","above","apply","according","accordingly",
 232     "across","actually","after","afterwards","again","against",
 233     "ain't","all","allow","allows","almost","alone","along",
 234     "involves", "already","also","although","always","am","among","amongst",
 235     "an","and","another","any","anybody","anyhow","anyone","anything",
 236     "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
 237     "are","aren't","around","as","aside","ask","asking","associated","at",
 238     "available","away","awfully","based", "b","be","became","because","become",
 239     "becomes","becoming","been","before","beforehand","behind","being",
 240     "believe","below","beside","besides","best","better","between","beyond",
 241     "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
 242     "cant","cause","causes","certain","certainly","changes","clearly","co",
 243     "com","come","comes","common","concerning","consequently","consider","considering",
 244     "contain","containing","contains","corresponding","could","couldn't","course",
 245     "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
 246     "do","does","doesn't","doing","don't","done","down","downwards","during","e",
 247     "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
 248     "especially","et","etc","even","ever","every","everybody","everyone",
 249     "everything","everywhere","ex","exactly","example","except","f","far",
 250     "few","find","fifth","first","five","followed","following","follows","for",
 251     "former","formerly","forth","four","from","further","furthermore","g",
 252     "get","gets","getting","given","gives","go","goes","going","gone","got",
 253     "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
 254     "have","haven't","having","he","he's","hello","help","hence","her","here",
 255     "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
 256     "him","himself","his","hither","hopefully","how","howbeit","however","i",
 257     "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
 258     "inc","indeed","indicate","indicated","indicates","inner","insofar",
 259     "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
 260     "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
 261     "last","lately","later","latter","latterly","least","less","lest","let",
 262     "let's","like","liked","likely","little","look","looking","looks","ltd",
 263     "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
 264     "more","moreover","most","mostly","much","must","my","myself","n",
 265     "name","namely","nd","near","nearly","necessary","need","needs","neither",
 266     "never","nevertheless","new","next","nine","no","nobody","non","none",
 267     "noone","nor","normally","not","nothing","novel","now","nowhere","o",
 268     "obviously","of","off","often","oh","ok","okay","old","on","once","one",
 269     "ones","only","onto","or","other","others","otherwise","ought","our",
 270     "ours","ourselves","out","outside","over","overall","own","p","particular",
 271     "particularly","per","perhaps","placed","please","plus","possible",
 272     "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
 273     "rd","re","really","reasonably","regarding","regardless","regards",
 274     "relatively","respectively","right","s","said","same","saw","say",
 275     "saying","says","second","secondly","see","seeing","seem","seemed",
 276     "seeming","seems","seen","self","selves","sensible","sent","serious",
 277     "seriously","seven","several","shall","she","should","shouldn't","since",
 278     "six","so","some","somebody","somehow","someone","something","sometime",
 279     "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
 280     "specifying","still","sub","such","sup","sure","t","t's","take","taken",
 281     "tell","tends","th","than","thank","thanks","thanx","that","that's",
 282     "thats","the","their","theirs","them","themselves","then","thence","there",
 283     "there's","thereafter","thereby","therefore","therein","theres",
 284     "thereupon","these","they","they'd","they'll","they're","they've",
 285     "think","third","this","thorough","thoroughly","those","though","three",
 286     "through","throughout","thru","thus","to","together","too","took","toward",
 287     "towards","tried","tries","truly","try","trying","twice","two","u","un",
 288     "under","unfortunately","unless","unlikely","until","unto","up","upon",
 289     "us","use","used","useful","uses","using","usually","uucp","v","value",
 290     "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
 291     "we","we'd","we'll","we're","we've","welcome","well","went","were",
 292     "weren't","what","what's","whatever","when","whence","whenever","where",
 293     "where's","whereafter","whereas","whereby","wherein","whereupon",
 294     "wherever","whether","which","while","whither","who","who's","whoever",
 295     "whole","whom","whose","why","will","willing","wish","with","within",
 296     "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
 297     "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
 298     "z","zero"]
 299
 300
 301
 302