src/Gargantext/Text/Terms/Stop.hs

   1 {-|
   2 Module      : Gargantext.Text.Terms.Stop
   3 Description : Mono Terms module
   4 Copyright   : (c) CNRS, 2017 - present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 Stop words and (how to learn it).
  11
  12 Main type here is String.
  13
  14 -}
  15
  16 {-# LANGUAGE NoImplicitPrelude #-}
  17
  18 module Gargantext.Text.Terms.Stop
  19   where
  20
  21 import Numeric.Probability.Distribution ((??))
  22 import qualified Numeric.Probability.Distribution as D
  23
  24 import Data.Char (toLower)
  25 import qualified Data.List as DL
  26
  27 import Data.Maybe (maybe)
  28 import Data.Map.Strict (Map, toList)
  29 import qualified Data.Map.Strict as DM
  30
  31 import Data.String (String)
  32
  33 import Data.Text (pack, unpack)
  34
  35 import Gargantext.Prelude
  36 import Gargantext.Core (Lang(..))
  37 import Gargantext.Text.Terms.Mono (words)
  38 import Gargantext.Text.Metrics.Count (occurrencesWith)
  39
  40 import Gargantext.Text.Samples.FR as FR
  41 import Gargantext.Text.Samples.EN as EN
  42 import Gargantext.Text.Samples.DE as DE
  43 import Gargantext.Text.Samples.SP as SP
  44 import Gargantext.Text.Samples.CH as CH
  45
  46 ------------------------------------------------------------------------
  47 data Candidate = Candidate { stop :: Double
  48                            , noStop :: Double
  49  } deriving (Show)
  50
  51 -- * String preparation
  52
  53 -- | String prepare
  54 blanks :: String -> String
  55 blanks [] = []
  56 blanks xs = [' '] <> xs <> [' ']
  57
  58 -- | Blocks increase the size of the word to ease computations
  59 -- some border and unexepected effects can happen, need to be tested
  60 blockOf :: Int -> String -> String
  61 blockOf n st = DL.concat $ DL.take n $ DL.repeat st
  62
  63 -- | Chunks is the same function as splitBy in Context but for Strings,
  64 -- not Text (without pack and unpack operations that are not needed).
  65 chunks :: Int -> Int -> String -> [String]
  66 chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
  67
  68 allChunks :: [Int] -> Int -> String -> [String]
  69 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
  70
  71 allChunks' :: [Int] -> Int -> String -> [[String]]
  72 allChunks' ns m st = map (\n -> chunks n m st) ns
  73
  74 ------------------------------------------------------------------------
  75 -- * Analyze candidate
  76 type StringSize = Int
  77 type TotalFreq  = Int
  78 type Freq       = Int
  79 type Word       = String
  80
  81 data LangWord = LangWord Lang Word
  82
  83 type LangProba = Map Lang Double
  84
  85 ------------------------------------------------------------------------
  86 detectLangs :: String -> [(Lang, Double)]
  87 detectLangs s = DL.reverse $ DL.sortOn snd
  88                            $ toList
  89                            $ detect (wordsToBook [0..2] s) testEL
  90
  91 testEL :: EventLang
  92 testEL = toEventLangs [0..2] [ LangWord EN EN.textMining
  93                               , LangWord FR FR.textMining
  94                               , LangWord DE DE.textMining
  95                               , LangWord SP SP.textMining
  96                               , LangWord CH CH.textMining
  97                               ]
  98
  99 detect :: EventBook -> EventLang -> LangProba
 100 detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= "  ") $ DM.toList mapFreq
 101
 102 ------------------------------------------------------------------------
 103 -- | TODO: monoids
 104 type EventLang = Map Lang EventBook
 105 toEventLangs :: [Int] -> [LangWord] -> EventLang
 106 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
 107
 108 emptyEventLang :: [Int] -> EventLang
 109 emptyEventLang ns = toLang ns (LangWord FR "")
 110
 111 toLang :: [Int] -> LangWord -> EventLang
 112 toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)]
 113
 114 opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang
 115 opLang f = DM.unionWith (op f)
 116
 117 ------------------------------------------------------------------------
 118 -- | TODO: monoids (but proba >= 0)
 119
 120 peb :: String -> EventBook -> Double
 121 peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
 122   where
 123     a = maybe 0 identity $ DM.lookup st mapFreq
 124     b = maybe 1 identity $ DM.lookup (length st) mapN
 125
 126 data EventBook = EventBook { events_freq :: Map String     Freq
 127                            , events_n    :: Map StringSize TotalFreq
 128                            }
 129                              deriving (Show)
 130
 131 emptyEventBook :: [Int] -> EventBook
 132 emptyEventBook ns = wordToBook ns " "
 133
 134 wordsToBook :: [Int] -> String -> EventBook
 135 wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook
 136   where
 137     ws = map unpack $ words $ pack txt
 138     eventsBook = map (wordToBook ns) ws
 139
 140 wordToBook :: [Int] -> Word -> EventBook
 141 wordToBook ns txt = EventBook ef en
 142   where
 143     chks = allChunks' ns 10 txt
 144     en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
 145     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 146
 147 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
 148 op f (EventBook ef1 en1)
 149      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
 150                                      (DM.unionWith f en1 en2)
 151
 152
 153 ------------------------------------------------------------------------
 154 ------------------------------------------------------------------------
 155 -- * Make the distributions
 156 makeDist :: [String] -> D.T Double String
 157 makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10)
 158
 159 stopDist :: D.T Double String
 160 stopDist = makeDist stopList
 161
 162 candDist :: D.T Double String
 163 candDist = makeDist candList
 164
 165 ------------------------------------------------------------------------
 166 sumProba :: Num a => D.T a String -> [Char] -> a
 167 sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
 168
 169 -- | Get probability according a distribution
 170 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
 171 (~?) ds x = (==x) ?? ds
 172
 173 ------------------------------------------------------------------------
 174 candidate :: [Char] -> Candidate
 175 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 176
 177 ------------------------------------------------------------------------
 178 candList :: [String]
 179 candList = [ "france", "alexandre", "mael", "constitution"
 180            , "etats-unis", "associes", "car", "train", "spam"]
 181
 182
 183 stopList :: [String]
 184 stopList = map show ([0..9]::[Int]) <> [
 185     "a","a's","able","about","above","apply","according","accordingly",
 186     "across","actually","after","afterwards","again","against",
 187     "ain't","all","allow","allows","almost","alone","along",
 188     "involves", "already","also","although","always","am","among","amongst",
 189     "an","and","another","any","anybody","anyhow","anyone","anything",
 190     "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
 191     "are","aren't","around","as","aside","ask","asking","associated","at",
 192     "available","away","awfully","based", "b","be","became","because","become",
 193     "becomes","becoming","been","before","beforehand","behind","being",
 194     "believe","below","beside","besides","best","better","between","beyond",
 195     "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
 196     "cant","cause","causes","certain","certainly","changes","clearly","co",
 197     "com","come","comes","common","concerning","consequently","consider","considering",
 198     "contain","containing","contains","corresponding","could","couldn't","course",
 199     "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
 200     "do","does","doesn't","doing","don't","done","down","downwards","during","e",
 201     "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
 202     "especially","et","etc","even","ever","every","everybody","everyone",
 203     "everything","everywhere","ex","exactly","example","except","f","far",
 204     "few","find","fifth","first","five","followed","following","follows","for",
 205     "former","formerly","forth","four","from","further","furthermore","g",
 206     "get","gets","getting","given","gives","go","goes","going","gone","got",
 207     "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
 208     "have","haven't","having","he","he's","hello","help","hence","her","here",
 209     "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
 210     "him","himself","his","hither","hopefully","how","howbeit","however","i",
 211     "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
 212     "inc","indeed","indicate","indicated","indicates","inner","insofar",
 213     "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
 214     "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
 215     "last","lately","later","latter","latterly","least","less","lest","let",
 216     "let's","like","liked","likely","little","look","looking","looks","ltd",
 217     "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
 218     "more","moreover","most","mostly","much","must","my","myself","n",
 219     "name","namely","nd","near","nearly","necessary","need","needs","neither",
 220     "never","nevertheless","new","next","nine","no","nobody","non","none",
 221     "noone","nor","normally","not","nothing","novel","now","nowhere","o",
 222     "obviously","of","off","often","oh","ok","okay","old","on","once","one",
 223     "ones","only","onto","or","other","others","otherwise","ought","our",
 224     "ours","ourselves","out","outside","over","overall","own","p","particular",
 225     "particularly","per","perhaps","placed","please","plus","possible",
 226     "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
 227     "rd","re","really","reasonably","regarding","regardless","regards",
 228     "relatively","respectively","right","s","said","same","saw","say",
 229     "saying","says","second","secondly","see","seeing","seem","seemed",
 230     "seeming","seems","seen","self","selves","sensible","sent","serious",
 231     "seriously","seven","several","shall","she","should","shouldn't","since",
 232     "six","so","some","somebody","somehow","someone","something","sometime",
 233     "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
 234     "specifying","still","sub","such","sup","sure","t","t's","take","taken",
 235     "tell","tends","th","than","thank","thanks","thanx","that","that's",
 236     "thats","the","their","theirs","them","themselves","then","thence","there",
 237     "there's","thereafter","thereby","therefore","therein","theres",
 238     "thereupon","these","they","they'd","they'll","they're","they've",
 239     "think","third","this","thorough","thoroughly","those","though","three",
 240     "through","throughout","thru","thus","to","together","too","took","toward",
 241     "towards","tried","tries","truly","try","trying","twice","two","u","un",
 242     "under","unfortunately","unless","unlikely","until","unto","up","upon",
 243     "us","use","used","useful","uses","using","usually","uucp","v","value",
 244     "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
 245     "we","we'd","we'll","we're","we've","welcome","well","went","were",
 246     "weren't","what","what's","whatever","when","whence","whenever","where",
 247     "where's","whereafter","whereas","whereby","wherein","whereupon",
 248     "wherever","whether","which","while","whither","who","who's","whoever",
 249     "whole","whom","whose","why","will","willing","wish","with","within",
 250     "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
 251     "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
 252     "z","zero"]
 253
 254
 255
 256