{-| Module : Gargantext.Text.Terms.Stop Description : Mono Terms module Copyright : (c) CNRS, 2017 - present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX Stop words and (how to learn it). Main type here is String. -} {-# LANGUAGE NoImplicitPrelude #-} module Gargantext.Text.Terms.Stop (detectLang, detectLangs, stopList) where import GHC.Base (Functor) import Numeric.Probability.Distribution ((??)) import qualified Numeric.Probability.Distribution as D import Data.Char (toLower) import qualified Data.List as DL import Data.Maybe (maybe) import Data.Map.Strict (Map, toList) import qualified Data.Map.Strict as DM import Data.String (String) import Data.Text (Text) import Data.Text (pack, unpack) import Data.Tuple.Extra (both) import Gargantext.Prelude import Gargantext.Core (Lang(..), allLangs) import Gargantext.Text.Terms.Mono (words) import Gargantext.Text.Metrics.Count (occurrencesWith) import qualified Gargantext.Text.Samples.FR as FR import qualified Gargantext.Text.Samples.EN as EN --import qualified Gargantext.Text.Samples.DE as DE --import qualified Gargantext.Text.Samples.SP as SP --import qualified Gargantext.Text.Samples.CH as CH ------------------------------------------------------------------------ data Candidate = Candidate { stop :: Double , noStop :: Double } deriving (Show) -- * String preparation -- | String prepare blanks :: String -> String blanks [] = [] blanks xs = [' '] <> xs <> [' '] -- | Blocks increase the size of the word to ease computations -- some border and unexepected effects can happen, need to be tested blockOf :: Int -> String -> String blockOf n = DL.concat . DL.take n . DL.repeat -- | Chunks is the same function as splitBy in Context but for Strings, -- not Text (without pack and unpack operations that are not needed). chunks :: Int -> Int -> String -> [String] chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks allChunks :: [Int] -> Int -> String -> [String] allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns allChunks' :: [Int] -> Int -> String -> [[String]] allChunks' ns m st = map (\n -> chunks n m st) ns ------------------------------------------------------------------------ -- * Analyze candidate type StringSize = Int type TotalFreq = Int type Freq = Int type Word = String data LangWord = LangWord Lang Word type LangProba = Map Lang Double ------------------------------------------------------------------------ detectLang :: Text -> Maybe Lang detectLang = head . map fst . detectLangs detectLangs :: Text -> [(Lang, Double)] detectLangs = detectLangs' . unpack detectLangs' :: String -> [(Lang, Double)] detectLangs' s = DL.reverse $ DL.sortOn snd $ toList $ detect (wordsToBook [0..2] s) eventLang part :: (Eq p, Fractional p) => p -> p -> p part 0 _ = 0 part _ 0 = 0 part x y = x / y toProba :: (Eq b, Fractional b, Functor t, Foldable t) => t (a, b) -> t (a, b) toProba xs = map (\(a,b) -> (a, part b total)) xs where total = sum $ map snd xs textSample :: Lang -> String textSample EN = EN.textSample textSample FR = FR.textSample --textSample DE = DE.textSample --textSample SP = SP.textSample --textSample CH = CH.textSample langWord :: Lang -> LangWord langWord l = LangWord l (textSample l) eventLang :: EventLang eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ] detect :: EventBook -> EventLang -> LangProba detect (EventBook mapFreq _) el = DM.unionsWith (+) $ map DM.fromList $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el) $ filter (\x -> fst x /= " ") $ DM.toList mapFreq ------------------------------------------------------------------------ -- | TODO: monoids type EventLang = Map Lang EventBook toEventLangs :: [Int] -> [LangWord] -> EventLang toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns) emptyEventLang :: [Int] -> EventLang emptyEventLang ns = toLang ns (LangWord FR "") toLang :: [Int] -> LangWord -> EventLang toLang ns (LangWord l txt) = DM.fromList [(l, wordsToBook ns txt)] opLang :: (Freq -> Freq -> Freq) -> EventLang -> EventLang -> EventLang opLang f = DM.unionWith (op f) ------------------------------------------------------------------------ -- | TODO: monoids (but proba >= 0) peb :: String -> EventBook -> Double peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b) where a = maybe 0 identity $ DM.lookup st mapFreq b = maybe 1 identity $ DM.lookup (length st) mapN peb' :: String -> EventBook -> (Freq, TotalFreq) peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b) where a = maybe 0 identity $ DM.lookup st mapFreq b = maybe 1 identity $ DM.lookup (length st) mapN ------------------------------------------------------------------------ toPrior :: String -> EventLang -> [(Lang, Double)] toPrior s el = prior $ pebLang s el pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))] pebLang st = map (\(l,eb) -> (l, peb' st eb)) . DM.toList ------------------------------------------------------------------------ prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)] prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps') (map (\(a,b) -> a / b) ps') where (ls, ps'') = DL.unzip ps ps' = map (both fromIntegral) ps'' ------------------------------------------------------------------------ data EventBook = EventBook { events_freq :: Map String Freq , events_n :: Map StringSize TotalFreq } deriving (Show) emptyEventBook :: [Int] -> EventBook emptyEventBook ns = wordToBook ns " " wordsToBook :: [Int] -> String -> EventBook wordsToBook ns txt = foldl' (op (+)) (emptyEventBook ns) eventsBook where ws = map unpack $ words $ pack txt eventsBook = map (wordToBook ns) ws wordToBook :: [Int] -> Word -> EventBook wordToBook ns txt = EventBook ef en where chks = allChunks' ns 10 txt en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook op f (EventBook ef1 en1) (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2) (DM.unionWith f en1 en2) ------------------------------------------------------------------------ ------------------------------------------------------------------------ -- * Make the distributions makeDist :: [String] -> D.T Double String makeDist = D.uniform . DL.concat . map (allChunks [0,2] 10) stopDist :: D.T Double String stopDist = makeDist stopList candDist :: D.T Double String candDist = makeDist candList ------------------------------------------------------------------------ sumProba :: Num a => D.T a String -> [Char] -> a sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x -- | Get probability according a distribution (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob (~?) ds x = (==x) ?? ds ------------------------------------------------------------------------ candidate :: [Char] -> Candidate candidate x = Candidate (sumProba stopDist x) (sumProba candDist x) ------------------------------------------------------------------------ candList :: [String] candList = [ "france", "alexandre", "mael", "constitution" , "etats-unis", "associes", "car", "train", "spam"] stopList :: [String] stopList = map show ([0..9]::[Int]) <> stopListWords stopListWords :: [String] stopListWords = ["a", "a's", "able", "about", "above", "according", "accordingly" , "across", "actually", "after", "afterwards", "again", "against" , "ain't", "all", "allow", "allows", "almost", "alone", "along" , "already", "also", "although", "always", "am", "among", "amongst", "an" , "analyze", "and", "another", "any", "anybody", "anyhow", "anyone" , "anything", "anyway", "anyways", "anywhere", "apart", "appear" , "apply", "appreciate", "appropriate", "are", "aren't", "around" , "as", "aside", "ask", "asking", "associated", "at", "available" , "away", "awfully", "b", "based", "be", "became", "because", "become" , "becomes", "becoming", "been", "before", "beforehand", "behind" , "being", "believe", "below", "beside", "besides", "best", "better" , "between", "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's" , "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain" , "certainly", "changes", "clearly", "co", "com", "come", "comes" , "common", "concerning", "consequently", "consider", "considering" , "contain", "containing", "contains", "corresponding", "could" , "couldn't", "course", "currently", "d", "definitely", "described" , "despite", "detecting", "detects", "did", "didn't", "different", "do" , "does", "doesn't", "doing", "don't", "done", "down", "downwards" , "during", "e", "each", "edu", "eg", "eight", "either", "else" , "elsewhere", "enough", "entirely", "especially", "et", "etc", "even" , "ever", "every", "everybody", "everyone", "everything", "everywhere" , "ex", "exactly", "example", "except", "f", "far", "few", "fifth" , "find", "first", "five", "followed", "following", "follows", "for" , "former", "formerly", "forth", "four", "from", "further", "furthermore" , "g", "get", "gets", "getting", "gif", "given", "gives", "go", "goes" , "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't" , "happens", "hardly", "has", "hasn't", "have", "haven't", "having" , "he", "he'd", "he'll", "he's", "hello", "help", "hence", "her" , "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers" , "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how" , "how's", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've" , "identify", "ie", "if", "ignored", "immediate", "in", "inasmuch" , "inc", "indeed", "indicate", "indicated", "indicates", "inner" , "insofar", "instead", "into", "involves", "inward", "is", "isn't" , "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k" , "keep", "keeps", "kept", "know", "known", "knows", "l", "last" , "late", "lately", "later", "latter", "latterly", "least", "less" , "lest", "let", "let's", "like", "liked", "likely", "little", "look" , "looking", "looks", "ltd", "m", "main", "mainly", "many", "may" , "maybe", "me", "mean", "meanwhile", "merely", "might", "min", "more" , "moreover", "most", "mostly", "much", "must", "mustn't", "my", "myself" , "n", "name", "namely", "nd", "near", "nearly", "necessary", "need" , "needs", "neither", "never", "nevertheless", "new", "next", "nine" , "no", "nobody", "non", "none", "noone", "nor", "normally", "not" , "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off" , "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones" , "only", "onto", "or", "other", "others", "otherwise", "ought", "our" , "ours", "ourselves", "out", "outside", "over", "overall", "own", "p" , "particular", "particularly", "per", "perhaps", "placed", "please" , "plus", "possible", "presents", "presumably", "probably", "provides" , "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really" , "reasonably", "regarding", "regardless", "regards", "relatively" , "respectively", "right", "s", "said", "same", "saw", "say", "saying" , "says", "sds", "second", "secondly", "see", "seeing", "seem", "seemed" , "seeming", "seems", "seen", "self", "selves", "sensible", "sent" , "serious", "seriously", "seven", "several", "shall", "shan't" , "she", "she'd", "she'll", "she's", "should", "shouldn't", "since" , "six", "so", "some", "somebody", "somehow", "someone", "something" , "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry" , "specified", "specify", "specifying", "still", "sub", "such", "sup" , "sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than" , "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their" , "theirs", "them", "themselves", "then", "thence", "there", "there's" , "thereafter", "thereby", "therefore", "therein", "theres", "thereupon" , "these", "they", "they'd", "they'll", "they're", "they've", "think" , "third", "this", "thorough", "thoroughly", "those", "though", "three" , "through", "throughout", "thru", "thus", "to", "together", "too" , "took", "toward", "towards", "tried", "tries", "truly", "try" , "trying", "twice", "two", "u", "un", "under", "unfortunately" , "unless", "unlikely", "until", "unto", "up", "upon", "us", "use" , "used", "useful", "uses", "using", "usually", "uucp", "v", "value" , "various", "very", "via", "viz", "vs", "w", "want", "wants", "was" , "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome" , "well", "went", "were", "weren't", "what", "what's", "whatever", "when" , "when's", "whence", "whenever", "where", "where's", "whereafter" , "whereas", "whereby", "wherein", "whereupon", "wherever", "whether" , "which", "while", "whither", "who", "who's", "whoever", "whole", "whom" , "whose", "why", "why's", "will", "willing", "wish", "with", "within" , "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes" , "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours" , "yourself", "yourselves", "z", "zero"]