2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 - generalize to byteString
13 Stop words and (how to learn it).
15 Main type here is String.
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
22 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
25 --import Data.Char (toLower)
26 import qualified Data.List as DL
28 import Data.Maybe (maybe)
29 import Data.Map.Strict (Map, toList)
30 import qualified Data.Map.Strict as DM
32 import Data.String (String)
34 import Data.Text (Text)
35 import Data.Text (pack, unpack, toLower)
36 import Data.Tuple.Extra (both, second)
38 import Gargantext.Prelude
39 import Gargantext.Core (Lang(..), allLangs)
40 import Gargantext.Text.Terms.Mono (words)
41 import Gargantext.Text.Metrics.Count (occurrencesWith)
43 import qualified Gargantext.Text.Samples.FR as FR
44 import qualified Gargantext.Text.Samples.EN as EN
45 --import qualified Gargantext.Text.Samples.DE as DE
46 --import qualified Gargantext.Text.Samples.SP as SP
47 --import qualified Gargantext.Text.Samples.CH as CH
49 ------------------------------------------------------------------------
50 data Candidate = Candidate { stop :: Double
54 ------------------------------------------------------------------------
55 -- * Analyze candidate
61 data CatWord a = CatWord a Word
62 type CatProb a = Map a Double
64 type Events a = Map a EventBook
66 ------------------------------------------------------------------------
67 detectStopDefault :: Text -> Maybe Bool
68 detectStopDefault = undefined
70 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
71 detectBool events = detectDefault False events
73 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
74 detectDefault = detectDefaultWith identity
76 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
77 detectDefaultWith f d events = detectDefaultWithPriors f ps
79 ps = priorEventsWith f d events
81 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
82 detectDefaultWithPriors f priors = detectCat 99 priors . f
84 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
85 priorEventsWith f d e = toEvents d [0..2] 10 es
87 es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
90 ------------------------------------------------------------------------
91 detectLangDefault :: Text -> Maybe Lang
92 detectLangDefault = detectCat 99 eventLang
94 eventLang :: Events Lang
95 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
97 langWord :: Lang -> CatWord Lang
98 langWord l = CatWord l (textSample l)
100 textSample :: Lang -> String
101 textSample EN = EN.textSample
102 textSample FR = FR.textSample
103 --textSample DE = DE.textSample
104 --textSample SP = SP.textSample
105 --textSample CH = CH.textSample
106 ------------------------------------------------------------------------
107 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
108 detectCat n es = head . map fst . (detectCat' n es) . unpack
110 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
111 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
113 $ detectWith n' es' (wordsToBook [0..2] n' s)
116 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
117 detectWith n'' el (EventBook mapFreq _) =
120 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
121 $ filter (\x -> fst x /= " ")
124 -- | TODO: monoids (but proba >= 0)
125 toPrior :: Int -> String -> Events a -> [(a, Double)]
126 toPrior n'' s el = prior n'' $ pebLang s el
128 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
129 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
131 peb :: String -> EventBook -> (Freq, TotalFreq)
132 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
134 a = maybe 0 identity $ DM.lookup st mapFreq
135 b = maybe 1 identity $ DM.lookup (length st) mapN
138 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
139 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
140 (map (\(a,b) -> a / b) ps')
142 (ls, ps'') = DL.unzip ps
143 ps' = map (both fromIntegral) ps''
145 part :: (Eq p, Fractional p) => p -> p -> p
151 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
153 toProba xs = map (\(a,b) -> (a, part b total)) xs
155 total = sum $ map snd xs
158 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
159 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
161 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
162 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
164 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
165 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
167 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
168 opEvent f = DM.unionWith (op f)
170 ------------------------------------------------------------------------
171 ------------------------------------------------------------------------
172 data EventBook = EventBook { events_freq :: Map String Freq
173 , events_n :: Map StringSize TotalFreq
177 emptyEventBook :: [Int] -> Int -> EventBook
178 emptyEventBook ns n = wordToBook ns n " "
180 wordsToBook :: [Int] -> Int -> String -> EventBook
181 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
183 ws = map unpack $ words $ pack txt
184 eventsBook = map (wordToBook ns n) ws
186 wordToBook :: [Int] -> Int -> Word -> EventBook
187 wordToBook ns n txt = EventBook ef en
189 chks = allChunks ns n txt
190 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
191 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
193 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
194 op f (EventBook ef1 en1)
195 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
196 (DM.unionWith f en1 en2)
198 ------------------------------------------------------------------------
199 ------------------------------------------------------------------------
200 allChunks :: [Int] -> Int -> String -> [[String]]
201 allChunks ns m st = map (\n -> chunks n m st) ns
203 -- | Chunks is the same function as splitBy in Context but for Strings,
204 -- not Text (without pack and unpack operations that are not needed).
205 chunks :: Int -> Int -> String -> [String]
206 chunks n m = DL.take m . filter (not . all (== ' '))
213 -- | String preparation
214 blanks :: String -> String
216 blanks xs = [' '] <> xs <> [' ']
220 -- Some previous tests to be removed
221 --import GHC.Base (Functor)
222 --import Numeric.Probability.Distribution ((??))
223 --import qualified Numeric.Probability.Distribution as D
225 -- | Blocks increase the size of the word to ease computations
226 -- some border and unexepected effects can happen, need to be tested
227 blockOf :: Int -> String -> String
228 blockOf n = DL.concat . DL.take n . DL.repeat
230 -- * Make the distributions
231 makeDist :: [String] -> D.T Double String
232 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
234 stopDist :: D.T Double String
235 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
237 candDist :: D.T Double String
238 candDist = makeDist candList
240 ------------------------------------------------------------------------
241 sumProba :: Num a => D.T a String -> [Char] -> a
242 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
244 -- | Get probability according a distribution
245 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
246 (~?) ds x = (==x) ?? ds
248 ------------------------------------------------------------------------
249 candidate :: [Char] -> Candidate
250 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
252 ------------------------------------------------------------------------
254 candList = [ "france", "alexandre", "mael", "constitution"
255 , "etats-unis", "associes", "car", "train", "spam"]