src/Gargantext/Text/Terms/Stop.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Stop words and (how to learn it).
  11
  12 Main type here is String.
  13
  14 -}
  15
  16 {-# LANGUAGE NoImplicitPrelude #-}
  17
  18 module Gargantext.Text.Terms.Stop
  19   where
  20
  21 import Numeric.Probability.Distribution ((??))
  22 import qualified Numeric.Probability.Distribution as D
  23
  24 import Data.Char (toLower)
  25 import qualified Data.List as DL
  26
  27 import Data.Maybe (maybe)
  28 import Data.Map.Strict (Map, toList)
  29 import qualified Data.Map.Strict as DM
  30
  31 import Data.String (String)
  32
  33 import Data.Text (pack, unpack)
  34
  35 import Gargantext.Prelude
  36 import Gargantext.Core (Lang(..), allLangs)
  37 import Gargantext.Text.Terms.Mono (words)
  38 import Gargantext.Text.Metrics.Count (occurrencesWith)
  39
  40 import qualified Gargantext.Text.Samples.FR as FR
  41 import qualified Gargantext.Text.Samples.EN as EN
  42 --import qualified Gargantext.Text.Samples.DE as DE
  43 --import qualified Gargantext.Text.Samples.SP as SP
  44 --import qualified Gargantext.Text.Samples.CH as CH
  45
  46 ------------------------------------------------------------------------
  47 data Candidate = Candidate { stop :: Double
  48                            , noStop :: Double
  49  } deriving (Show)
  50
  51 -- * String preparation
  52
  53 -- | String prepare
  54 blanks :: String -> String
  55 blanks [] = []
  56 blanks xs = [' '] <> xs <> [' ']
  57
  58 -- | Blocks increase the size of the word to ease computations
  59 -- some border and unexepected effects can happen, need to be tested
  60 blockOf :: Int -> String -> String
  61 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
  62
  63 -- | Chunks is the same function as splitBy in Context but for Strings,
  64 -- not Text (without pack and unpack operations that are not needed).
  65 chunks :: Int -> Int -> String -> [String]
  66 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
  67
  68 allChunks :: [Int] -> Int -> String -> [String]
  69 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
  70
  71 allChunks' :: [Int] -> Int -> String -> [[String]]
  72 allChunks' ns m st = map (\n -> chunks n m st) ns
  73
  74 ------------------------------------------------------------------------
  75 -- * Analyze candidate
  76 type StringSize = Int
  77 type TotalFreq  = Int
  78 type Freq       = Int
  79 type Word       = String
  80
  81 data LangWord = LangWord Lang Word
  82
  83 type LangProba = Map Lang Double
  84
  85 ------------------------------------------------------------------------
  86 detectLangs :: String -> [(Lang, Double)]
  87 detectLangs s = DL.reverse $ DL.sortOn snd
  88                            $ toList
  89                            $ detect (wordsToBook [0..2] s) testEL
  90
  91 textMining :: Lang -> String
  92 textMining EN = EN.textMining
  93 textMining FR = FR.textMining
  94 --textMining DE = DE.textMining
  95 --textMining SP = SP.textMining
  96 --textMining CH = CH.textMining
  97
  98 langWord :: Lang -> LangWord
  99 langWord l = LangWord l (textMining l)
 100
 101 testEL :: EventLang
 102 testEL = toEventLangs [0..2] [ langWord l | l <- allLangs ]
 103
 104 detect :: EventBook -> EventLang -> LangProba
 105 detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= "  ") $ DM.toList mapFreq
 106
 107 ------------------------------------------------------------------------
 108 -- | TODO: monoids
 109 type EventLang = Map Lang EventBook
 110 toEventLangs :: [Int] -> [LangWord] -> EventLang
 111 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
 112
 113 emptyEventLang :: [Int] -> EventLang
 114 emptyEventLang ns = toLang ns (LangWord FR "")
 115
 116 toLang :: [Int] -> LangWord -> EventLang
 117 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
 118
 119 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
 120 opLang f = DM.unionWith (op f)
 121
 122 ------------------------------------------------------------------------
 123 -- | TODO: monoids (but proba >= 0)
 124
 125 peb :: String -> EventBook -> Double
 126 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
 127   where
 128     a = maybe 0 identity $ DM.lookup st mapFreq
 129     b = maybe 1 identity $ DM.lookup (length st) mapN
 130
 131 data EventBook = EventBook { events_freq :: Map String     Freq
 132                            , events_n    :: Map StringSize TotalFreq
 133                            }
 134                              deriving (Show)
 135
 136 emptyEventBook :: [Int] -> EventBook
 137 emptyEventBook ns = wordToBook ns " "
 138
 139 wordsToBook :: [Int] -> String -> EventBook
 140 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
 141   where
 142     ws = map unpack $ words $ pack txt
 143     eventsBook = map (wordToBook ns) ws
 144
 145 wordToBook :: [Int] -> Word -> EventBook
 146 wordToBook ns txt = EventBook ef en
 147   where
 148     chks = allChunks' ns 10 txt
 149     en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
 150     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 151
 152 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 153 op f (EventBook ef1 en1)
 154      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 155                                      (DM.unionWith f en1 en2)
 156
 157
 158 ------------------------------------------------------------------------
 159 ------------------------------------------------------------------------
 160 -- * Make the distributions
 161 makeDist :: [String] -> D.T Double String
 162 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
 163
 164 stopDist :: D.T Double String
 165 stopDist = makeDist stopList
 166
 167 candDist :: D.T Double String
 168 candDist = makeDist candList
 169
 170 ------------------------------------------------------------------------
 171 sumProba :: Num a => D.T a String -> [Char] -> a
 172 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
 173
 174 -- | Get probability according a distribution
 175 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 176 (~?) ds x = (==x) ?? ds
 177
 178 ------------------------------------------------------------------------
 179 candidate :: [Char] -> Candidate
 180 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 181
 182 ------------------------------------------------------------------------
 183 candList :: [String]
 184 candList = [ "france", "alexandre", "mael", "constitution"
 185            , "etats-unis", "associes", "car", "train", "spam"]
 186
 187
 188 stopList :: [String]
 189 stopList = map show ([0..9]::[Int]) <> [
 190     "a","a's","able","about","above","apply","according","accordingly",
 191     "across","actually","after","afterwards","again","against",
 192     "ain't","all","allow","allows","almost","alone","along",
 193     "involves", "already","also","although","always","am","among","amongst",
 194     "an","and","another","any","anybody","anyhow","anyone","anything",
 195     "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
 196     "are","aren't","around","as","aside","ask","asking","associated","at",
 197     "available","away","awfully","based", "b","be","became","because","become",
 198     "becomes","becoming","been","before","beforehand","behind","being",
 199     "believe","below","beside","besides","best","better","between","beyond",
 200     "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
 201     "cant","cause","causes","certain","certainly","changes","clearly","co",
 202     "com","come","comes","common","concerning","consequently","consider","considering",
 203     "contain","containing","contains","corresponding","could","couldn't","course",
 204     "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
 205     "do","does","doesn't","doing","don't","done","down","downwards","during","e",
 206     "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
 207     "especially","et","etc","even","ever","every","everybody","everyone",
 208     "everything","everywhere","ex","exactly","example","except","f","far",
 209     "few","find","fifth","first","five","followed","following","follows","for",
 210     "former","formerly","forth","four","from","further","furthermore","g",
 211     "get","gets","getting","given","gives","go","goes","going","gone","got",
 212     "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
 213     "have","haven't","having","he","he's","hello","help","hence","her","here",
 214     "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
 215     "him","himself","his","hither","hopefully","how","howbeit","however","i",
 216     "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
 217     "inc","indeed","indicate","indicated","indicates","inner","insofar",
 218     "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
 219     "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
 220     "last","lately","later","latter","latterly","least","less","lest","let",
 221     "let's","like","liked","likely","little","look","looking","looks","ltd",
 222     "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
 223     "more","moreover","most","mostly","much","must","my","myself","n",
 224     "name","namely","nd","near","nearly","necessary","need","needs","neither",
 225     "never","nevertheless","new","next","nine","no","nobody","non","none",
 226     "noone","nor","normally","not","nothing","novel","now","nowhere","o",
 227     "obviously","of","off","often","oh","ok","okay","old","on","once","one",
 228     "ones","only","onto","or","other","others","otherwise","ought","our",
 229     "ours","ourselves","out","outside","over","overall","own","p","particular",
 230     "particularly","per","perhaps","placed","please","plus","possible",
 231     "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
 232     "rd","re","really","reasonably","regarding","regardless","regards",
 233     "relatively","respectively","right","s","said","same","saw","say",
 234     "saying","says","second","secondly","see","seeing","seem","seemed",
 235     "seeming","seems","seen","self","selves","sensible","sent","serious",
 236     "seriously","seven","several","shall","she","should","shouldn't","since",
 237     "six","so","some","somebody","somehow","someone","something","sometime",
 238     "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
 239     "specifying","still","sub","such","sup","sure","t","t's","take","taken",
 240     "tell","tends","th","than","thank","thanks","thanx","that","that's",
 241     "thats","the","their","theirs","them","themselves","then","thence","there",
 242     "there's","thereafter","thereby","therefore","therein","theres",
 243     "thereupon","these","they","they'd","they'll","they're","they've",
 244     "think","third","this","thorough","thoroughly","those","though","three",
 245     "through","throughout","thru","thus","to","together","too","took","toward",
 246     "towards","tried","tries","truly","try","trying","twice","two","u","un",
 247     "under","unfortunately","unless","unlikely","until","unto","up","upon",
 248     "us","use","used","useful","uses","using","usually","uucp","v","value",
 249     "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
 250     "we","we'd","we'll","we're","we've","welcome","well","went","were",
 251     "weren't","what","what's","whatever","when","whence","whenever","where",
 252     "where's","whereafter","whereas","whereby","wherein","whereupon",
 253     "wherever","whether","which","while","whither","who","who's","whoever",
 254     "whole","whom","whose","why","will","willing","wish","with","within",
 255     "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
 256     "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
 257     "z","zero"]
 258
 259
 260
 261