Eleve n + 1
[gargantext.git] / src / Gargantext / Text / Terms / Stop.hs
index 313a277a96623f71915b450c71b9509e58c73acc..88cb745ad027539d06ad99dbc7d4041b5107e182 100644 (file)
@@ -14,10 +14,12 @@ Main type here is String.
 -}
 
 {-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE OverloadedStrings #-}
 
-module Gargantext.Text.Terms.Stop
+module Gargantext.Text.Terms.Stop -- (detectLang, detectLangs, stopList)
   where
 
+import GHC.Base (Functor)
 import Numeric.Probability.Distribution ((??))
 import qualified Numeric.Probability.Distribution as D
 
@@ -25,18 +27,26 @@ import Data.Char (toLower)
 import qualified Data.List as DL
 
 import Data.Maybe (maybe)
-import Data.Map.Strict (Map)
+import Data.Map.Strict (Map, toList)
 import qualified Data.Map.Strict as DM
 
 import Data.String (String)
 
+import Data.Text (Text)
 import Data.Text (pack, unpack)
+import Data.Tuple.Extra (both)
 
 import Gargantext.Prelude
-import Gargantext.Core (Lang(..))
+import Gargantext.Core (Lang(..), allLangs)
 import Gargantext.Text.Terms.Mono (words)
 import Gargantext.Text.Metrics.Count (occurrencesWith)
 
+import qualified Gargantext.Text.Samples.FR as FR
+import qualified Gargantext.Text.Samples.EN as EN
+--import qualified Gargantext.Text.Samples.DE as DE
+--import qualified Gargantext.Text.Samples.SP as SP
+--import qualified Gargantext.Text.Samples.CH as CH
+
 ------------------------------------------------------------------------
 data Candidate = Candidate { stop :: Double
                            , noStop :: Double
@@ -52,12 +62,17 @@ blanks xs = [' '] <> xs <> [' ']
 -- | Blocks increase the size of the word to ease computations
 -- some border and unexepected effects can happen, need to be tested
 blockOf :: Int -> String -> String
-blockOf n st = DL.concat $ DL.take n $ DL.repeat st
+blockOf n = DL.concat . DL.take n . DL.repeat
 
 -- | Chunks is the same function as splitBy in Context but for Strings,
 -- not Text (without pack and unpack operations that are not needed).
 chunks :: Int -> Int -> String -> [String]
-chunks n m = DL.take m . filter (not . all (== ' ')) . chunkAlong (n+1) 1 . DL.concat . DL.take 1000 . DL.repeat . blanks
+chunks n m = DL.take m . filter (not . all (== ' '))
+                       . chunkAlong (n+1) 1
+                       . DL.concat
+                       . DL.take 1000
+                       . DL.repeat
+                       . blanks
 
 allChunks :: [Int] -> Int -> String -> [String]
 allChunks ns m st = DL.concat $ map (\n -> chunks n m st) ns
@@ -77,23 +92,53 @@ data LangWord = LangWord Lang Word
 type LangProba = Map Lang Double
 
 ------------------------------------------------------------------------
+detectLang :: Text -> Maybe Lang
+detectLang = head . map fst . detectLangs
+
+detectLangs :: Text -> [(Lang, Double)]
+detectLangs = detectLangs' . unpack
+
+detectLangs' :: String -> [(Lang, Double)]
+detectLangs' s =  DL.reverse $ DL.sortOn snd
+                            $ toList
+                            $ detect (wordsToBook [0..2] s) eventLang
+
+part :: (Eq p, Fractional p) => p -> p -> p
+part 0 _ = 0
+part _ 0 = 0
+part x y = x / y
+
+toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
+                 t (a, b) -> t (a, b)
+toProba xs = map (\(a,b) -> (a, part b total)) xs
+  where
+    total = sum $ map snd xs
+
+textSample :: Lang -> String
+textSample EN = EN.textSample
+textSample FR = FR.textSample
+--textSample DE = DE.textSample
+--textSample SP = SP.textSample
+--textSample CH = CH.textSample
 
-estimeTest :: String -> LangProba
-estimeTest s = estime (wordsToBook [0..2] s) testEL
+langWord :: Lang -> LangWord
+langWord l = LangWord l (textSample l)
 
-testEL :: EventLang
-testEL = toEventLangs [0,1,2] [ LangWord EN "Lovely day. This day."
-                              , LangWord FR "Belle journée, j'y vais."
-                              , LangWord EN "Hello Sir, how are you doing? I am fine thank you, good bye"
-                              , LangWord FR "Bonjour Monsieur, comment allez-vous? Je vais bien merci."
-                              ]
+eventLang :: EventLang
+eventLang = toEventLangs [0..2] [ langWord l | l <- allLangs ]
 
-estime :: EventBook -> EventLang -> LangProba
-estime (EventBook mapFreq _) el = DM.unionsWith (+) $ map (\(s,n) -> DM.map (\eb -> (fromIntegral n) * peb s eb) el) $ filter (\x -> fst x /= "  ") $ DM.toList mapFreq
+detect :: EventBook -> EventLang -> LangProba
+detect (EventBook mapFreq _) el = 
+  DM.unionsWith (+)
+  $ map DM.fromList
+  $ map (\(s,n) -> map (\(l,f) -> (l, (fromIntegral n) * f)) $ toPrior s el)
+  $ filter (\x -> fst x /= "  ")
+  $ DM.toList mapFreq
 
 ------------------------------------------------------------------------
 -- | TODO: monoids
 type EventLang = Map Lang EventBook
+
 toEventLangs :: [Int] -> [LangWord] -> EventLang
 toEventLangs ns = foldl' (opLang (+)) (emptyEventLang ns) . map (toLang ns)
 
@@ -115,6 +160,27 @@ peb st (EventBook mapFreq mapN) = (fromIntegral a) / (fromIntegral b)
     a = maybe 0 identity $ DM.lookup st mapFreq
     b = maybe 1 identity $ DM.lookup (length st) mapN
 
+peb' :: String -> EventBook -> (Freq, TotalFreq)
+peb' st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
+  where
+    a = maybe 0 identity $ DM.lookup st mapFreq
+    b = maybe 1 identity $ DM.lookup (length st) mapN
+
+------------------------------------------------------------------------
+toPrior :: String -> EventLang -> [(Lang, Double)]
+toPrior s el = prior $ pebLang s el
+
+pebLang :: String -> EventLang -> [(Lang, (Freq,TotalFreq))]
+pebLang st = map (\(l,eb) -> (l, peb' st eb)) .  DM.toList
+------------------------------------------------------------------------
+prior :: [(Lang, (Freq, TotalFreq))] -> [(Lang, Double)]
+prior ps = zip ls $ zipWith (\x y -> x^(99::Int) * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps') 
+                                (map (\(a,b) -> a / b) ps')
+  where
+   
+    (ls, ps'') = DL.unzip ps
+    ps' = map (both fromIntegral) ps''
+------------------------------------------------------------------------
 data EventBook = EventBook { events_freq :: Map String     Freq
                            , events_n    :: Map StringSize TotalFreq
                            }
@@ -133,7 +199,7 @@ wordToBook :: [Int] -> Word -> EventBook
 wordToBook ns txt = EventBook ef en
   where
     chks = allChunks' ns 10 txt
-    en = DM.fromList $ map (\(n,ns) -> (n, length ns)) $ zip ns chks
+    en = DM.fromList $ map (\(n,ns') -> (n, length ns')) $ zip ns chks
     ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
 
 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
@@ -141,7 +207,6 @@ op f (EventBook ef1 en1)
      (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
                                      (DM.unionWith f en1 en2)
 
-
 ------------------------------------------------------------------------
 ------------------------------------------------------------------------
 -- * Make the distributions
@@ -163,6 +228,7 @@ sumProba ds x = sum $ map ((~?) ds) $ allChunks [0,2] 10 $ map toLower x
 (~?) ds x = (==x) ?? ds
 
 ------------------------------------------------------------------------
+candidate :: [Char] -> Candidate
 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
 
 ------------------------------------------------------------------------
@@ -172,76 +238,91 @@ candList = [ "france", "alexandre", "mael", "constitution"
 
 
 stopList :: [String]
-stopList = map show ([0..9]::[Int]) <> [
-    "a","a's","able","about","above","apply","according","accordingly",
-    "across","actually","after","afterwards","again","against",
-    "ain't","all","allow","allows","almost","alone","along",
-    "involves", "already","also","although","always","am","among","amongst",
-    "an","and","another","any","anybody","anyhow","anyone","anything",
-    "anyway","anyways","anywhere","analyze","apart","appear","appreciate","appropriate",
-    "are","aren't","around","as","aside","ask","asking","associated","at",
-    "available","away","awfully","based", "b","be","became","because","become",
-    "becomes","becoming","been","before","beforehand","behind","being",
-    "believe","below","beside","besides","best","better","between","beyond",
-    "both","brief","but","by","c","c'mon","c's","came","can","can't","cannot",
-    "cant","cause","causes","certain","certainly","changes","clearly","co",
-    "com","come","comes","common","concerning","consequently","consider","considering",
-    "contain","containing","contains","corresponding","could","couldn't","course",
-    "currently","d","definitely","described","detects","detecting","despite","did","didn't","different",
-    "do","does","doesn't","doing","don't","done","down","downwards","during","e",
-    "each","edu","eg","eight","either","else","elsewhere","enough","entirely",
-    "especially","et","etc","even","ever","every","everybody","everyone",
-    "everything","everywhere","ex","exactly","example","except","f","far",
-    "few","find","fifth","first","five","followed","following","follows","for",
-    "former","formerly","forth","four","from","further","furthermore","g",
-    "get","gets","getting","given","gives","go","goes","going","gone","got",
-    "gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't",
-    "have","haven't","having","he","he's","hello","help","hence","her","here",
-    "here's","hereafter","hereby","herein","hereupon","hers","herself","hi",
-    "him","himself","his","hither","hopefully","how","howbeit","however","i",
-    "i'd","identify","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch",
-    "inc","indeed","indicate","indicated","indicates","inner","insofar",
-    "instead","into","inward","is","isn't","it","it'd","it'll","it's","its",
-    "itself","j","just","k","keep","keeps","kept","know","known","knows","l",
-    "last","lately","later","latter","latterly","least","less","lest","let",
-    "let's","like","liked","likely","little","look","looking","looks","ltd",
-    "m","mainly","many","may","maybe","me","mean","meanwhile","merely","might",
-    "more","moreover","most","mostly","much","must","my","myself","n",
-    "name","namely","nd","near","nearly","necessary","need","needs","neither",
-    "never","nevertheless","new","next","nine","no","nobody","non","none",
-    "noone","nor","normally","not","nothing","novel","now","nowhere","o",
-    "obviously","of","off","often","oh","ok","okay","old","on","once","one",
-    "ones","only","onto","or","other","others","otherwise","ought","our",
-    "ours","ourselves","out","outside","over","overall","own","p","particular",
-    "particularly","per","perhaps","placed","please","plus","possible",
-    "presents","presumably","probably","provides","q","que","quite","qv","r","rather",
-    "rd","re","really","reasonably","regarding","regardless","regards",
-    "relatively","respectively","right","s","said","same","saw","say",
-    "saying","says","second","secondly","see","seeing","seem","seemed",
-    "seeming","seems","seen","self","selves","sensible","sent","serious",
-    "seriously","seven","several","shall","she","should","shouldn't","since",
-    "six","so","some","somebody","somehow","someone","something","sometime",
-    "sometimes","somewhat","somewhere","soon","sorry","specified","specify",
-    "specifying","still","sub","such","sup","sure","t","t's","take","taken",
-    "tell","tends","th","than","thank","thanks","thanx","that","that's",
-    "thats","the","their","theirs","them","themselves","then","thence","there",
-    "there's","thereafter","thereby","therefore","therein","theres",
-    "thereupon","these","they","they'd","they'll","they're","they've",
-    "think","third","this","thorough","thoroughly","those","though","three",
-    "through","throughout","thru","thus","to","together","too","took","toward",
-    "towards","tried","tries","truly","try","trying","twice","two","u","un",
-    "under","unfortunately","unless","unlikely","until","unto","up","upon",
-    "us","use","used","useful","uses","using","usually","uucp","v","value",
-    "various","very","via","viz","vs","w","want","wants","was","wasn't","way",
-    "we","we'd","we'll","we're","we've","welcome","well","went","were",
-    "weren't","what","what's","whatever","when","whence","whenever","where",
-    "where's","whereafter","whereas","whereby","wherein","whereupon",
-    "wherever","whether","which","while","whither","who","who's","whoever",
-    "whole","whom","whose","why","will","willing","wish","with","within",
-    "without","won't","wonder","would","wouldn't","x","y","yes","yet","you",
-    "you'd","you'll","you're","you've","your","yours","yourself","yourselves",
-    "z","zero"]
-
-
+stopList = map show ([0..9]::[Int]) <> stopListWords
+
+stopListWords :: [String]
+stopListWords = 
+  ["a", "a's", "able", "about", "above", "according", "accordingly"
+  , "across", "actually", "after", "afterwards", "again", "against"
+  , "ain't", "all", "allow", "allows", "almost", "alone", "along"
+  , "already", "also", "although", "always", "am", "among", "amongst", "an"
+  , "analyze", "and", "another", "any", "anybody", "anyhow", "anyone"
+  , "anything", "anyway", "anyways", "anywhere", "apart", "appear"
+  , "apply", "appreciate", "appropriate", "are", "aren't", "around"
+  , "as", "aside", "ask", "asking", "associated", "at", "available"
+  , "away", "awfully", "b", "based", "be", "became", "because", "become"
+  , "becomes", "becoming", "been", "before", "beforehand", "behind"
+  , "being", "believe", "below", "beside", "besides", "best", "better"
+  , "between", "beyond", "both", "brief", "but", "by", "c", "c'mon", "c's"
+  , "came", "can", "can't", "cannot", "cant", "cause", "causes", "certain"
+  , "certainly", "changes", "clearly", "co", "com", "come", "comes"
+  , "common", "concerning", "consequently", "consider", "considering"
+  , "contain", "containing", "contains", "corresponding", "could"
+  , "couldn't", "course", "currently", "d", "definitely", "described"
+  , "despite", "detecting", "detects", "did", "didn't", "different", "do"
+  , "does", "doesn't", "doing", "don't", "done", "down", "downwards"
+  , "during", "e", "each", "edu", "eg", "eight", "either", "else"
+  , "elsewhere", "enough", "entirely", "especially", "et", "etc", "even"
+  , "ever", "every", "everybody", "everyone", "everything", "everywhere"
+  , "ex", "exactly", "example", "except", "f", "far", "few", "fifth"
+  , "find", "first", "five", "followed", "following", "follows", "for"
+  , "former", "formerly", "forth", "four", "from", "further", "furthermore"
+  , "g", "get", "gets", "getting", "gif", "given", "gives", "go", "goes"
+  , "going", "gone", "got", "gotten", "greetings", "h", "had", "hadn't"
+  , "happens", "hardly", "has", "hasn't", "have", "haven't", "having"
+  , "he", "he'd", "he'll", "he's", "hello", "help", "hence", "her"
+  , "here", "here's", "hereafter", "hereby", "herein", "hereupon", "hers"
+  , "herself", "hi", "him", "himself", "his", "hither", "hopefully", "how"
+  , "how's", "howbeit", "however", "i", "i'd", "i'll", "i'm", "i've"
+  , "identify", "ie", "if", "ignored", "immediate", "in", "inasmuch"
+  , "inc", "indeed", "indicate", "indicated", "indicates", "inner"
+  , "insofar", "instead", "into", "involves", "inward", "is", "isn't"
+  , "it", "it'd", "it'll", "it's", "its", "itself", "j", "just", "k"
+  , "keep", "keeps", "kept", "know", "known", "knows", "l", "last"
+  , "late", "lately", "later", "latter", "latterly", "least", "less"
+  , "lest", "let", "let's", "like", "liked", "likely", "little", "look"
+  , "looking", "looks", "ltd", "m", "main", "mainly", "many", "may"
+  , "maybe", "me", "mean", "meanwhile", "merely", "might", "min", "more"
+  , "moreover", "most", "mostly", "much", "must", "mustn't", "my", "myself"
+  , "n", "name", "namely", "nd", "near", "nearly", "necessary", "need"
+  , "needs", "neither", "never", "nevertheless", "new", "next", "nine"
+  , "no", "nobody", "non", "none", "noone", "nor", "normally", "not"
+  , "nothing", "novel", "now", "nowhere", "o", "obviously", "of", "off"
+  , "often", "oh", "ok", "okay", "old", "on", "once", "one", "ones"
+  , "only", "onto", "or", "other", "others", "otherwise", "ought", "our"
+  , "ours", "ourselves", "out", "outside", "over", "overall", "own", "p"
+  , "particular", "particularly", "per", "perhaps", "placed", "please"
+  , "plus", "possible", "presents", "presumably", "probably", "provides"
+  , "q", "que", "quite", "qv", "r", "rather", "rd", "re", "really"
+  , "reasonably", "regarding", "regardless", "regards", "relatively"
+  , "respectively", "right", "s", "said", "same", "saw", "say", "saying"
+  , "says", "sds", "second", "secondly", "see", "seeing", "seem", "seemed"
+  , "seeming", "seems", "seen", "self", "selves", "sensible", "sent"
+  , "serious", "seriously", "seven", "several", "shall", "shan't"
+  , "she", "she'd", "she'll", "she's", "should", "shouldn't", "since"
+  , "six", "so", "some", "somebody", "somehow", "someone", "something"
+  , "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry"
+  , "specified", "specify", "specifying", "still", "sub", "such", "sup"
+  , "sure", "t", "t's", "take", "taken", "tell", "tends", "th", "than"
+  , "thank", "thanks", "thanx", "that", "that's", "thats", "the", "their"
+  , "theirs", "them", "themselves", "then", "thence", "there", "there's"
+  , "thereafter", "thereby", "therefore", "therein", "theres", "thereupon"
+  , "these", "they", "they'd", "they'll", "they're", "they've", "think"
+  , "third", "this", "thorough", "thoroughly", "those", "though", "three"
+  , "through", "throughout", "thru", "thus", "to", "together", "too"
+  , "took", "toward", "towards", "tried", "tries", "truly", "try"
+  , "trying", "twice", "two", "u", "un", "under", "unfortunately"
+  , "unless", "unlikely", "until", "unto", "up", "upon", "us", "use"
+  , "used", "useful", "uses", "using", "usually", "uucp", "v", "value"
+  , "various", "very", "via", "viz", "vs", "w", "want", "wants", "was"
+  , "wasn't", "way", "we", "we'd", "we'll", "we're", "we've", "welcome"
+  , "well", "went", "were", "weren't", "what", "what's", "whatever", "when"
+  , "when's", "whence", "whenever", "where", "where's", "whereafter"
+  , "whereas", "whereby", "wherein", "whereupon", "wherever", "whether"
+  , "which", "while", "whither", "who", "who's", "whoever", "whole", "whom"
+  , "whose", "why", "why's", "will", "willing", "wish", "with", "within"
+  , "without", "won't", "wonder", "would", "wouldn't", "x", "y", "yes"
+  , "yet", "you", "you'd", "you'll", "you're", "you've", "your", "yours"
+  , "yourself", "yourselves", "z", "zero"]