2 Module : Gargantext.Core.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 - generalize to byteString
12 - Stop words and (how to learn it).
13 - Main type here is String check if Chars on Text would be optimized
17 {-# LANGUAGE TypeSynonymInstances #-}
19 module Gargantext.Core.Text.Learn -- (detectLang, detectLangs, stopList)
22 import Codec.Serialise
23 import qualified Data.List as DL
25 import Data.Maybe (maybe)
26 import Data.Map.Strict (Map, toList)
27 import qualified Data.Map.Strict as DM
30 import Data.String (String)
32 import Data.Text (Text)
33 import Data.Text (pack, unpack, toLower)
34 import Data.Tuple.Extra (both)
35 import qualified Data.ByteString.Lazy as BSL
37 import Gargantext.Prelude
38 import Gargantext.Prelude.Utils
39 import Gargantext.Core (Lang(..), allLangs)
40 import Gargantext.Core.Text.Terms.Mono (words)
41 import Gargantext.Core.Text.Metrics.Count (occurrencesWith)
43 import qualified Gargantext.Core.Text.Samples.FR as FR
44 import qualified Gargantext.Core.Text.Samples.EN as EN
45 --import qualified Gargantext.Core.Text.Samples.DE as DE
46 --import qualified Gargantext.Core.Text.Samples.SP as SP
47 --import qualified Gargantext.Core.Text.Samples.CH as CH
49 ------------------------------------------------------------------------
50 data Candidate = Candidate { stop :: Double
54 ------------------------------------------------------------------------
55 -- * Analyze candidate
61 data CatWord a = CatWord a Word
62 type CatProb a = Map a Double
64 type Events a = Map a EventBook
65 ------------------------------------------------------------------------
66 data EventBook = EventBook { events_freq :: Map String Freq
67 , events_n :: Map StringSize TotalFreq
69 deriving (Show, Generic)
71 instance Serialise EventBook
73 instance (Serialise a, Ord a) => SaveFile (Events a) where
74 saveFile' f d = BSL.writeFile f (serialise d)
76 instance (Serialise a, Ord a) => ReadFile (Events a) where
77 readFile' filepath = deserialise <$> BSL.readFile filepath
79 ------------------------------------------------------------------------
80 detectStopDefault :: Text -> Maybe Bool
81 detectStopDefault = undefined
83 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
84 detectBool events = detectDefault False events
86 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
87 detectDefault = detectDefaultWith identity
89 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
90 detectDefaultWith f d events = detectDefaultWithPriors f ps
92 ps = priorEventsWith f d events
94 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
95 detectDefaultWithPriors f priors = detectCat 99 priors . f
97 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
98 priorEventsWith f d e = toEvents d [0..2] 10 es
100 es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
103 ------------------------------------------------------------------------
104 detectLangDefault :: Text -> Maybe Lang
105 detectLangDefault = detectCat 99 eventLang
107 eventLang :: Events Lang
108 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
110 langWord :: Lang -> CatWord Lang
111 langWord l = CatWord l (textSample l)
113 textSample :: Lang -> String
114 textSample EN = EN.textSample
115 textSample FR = FR.textSample
116 textSample _ = panic "textSample: not impl yet"
117 --textSample DE = DE.textSample
118 --textSample SP = SP.textSample
119 --textSample CH = CH.textSample
120 ------------------------------------------------------------------------
121 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
122 detectCat n es = head . map fst . (detectCat' n es) . unpack
124 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
125 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
127 $ detectWith n' es' (wordsToBook [0..2] n' s)
130 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
131 detectWith n'' el (EventBook mapFreq _) =
134 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
135 $ filter (\x -> fst x /= " ")
138 -- | TODO: monoids (but proba >= 0)
139 toPrior :: Int -> String -> Events a -> [(a, Double)]
140 toPrior n'' s el = prior n'' $ pebLang s el
142 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
143 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
145 peb :: String -> EventBook -> (Freq, TotalFreq)
146 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
148 a = maybe 0 identity $ DM.lookup st mapFreq
149 b = maybe 1 identity $ DM.lookup (length st) mapN
152 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
153 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
154 (map (\(a,b) -> a / b) ps')
156 (ls, ps'') = DL.unzip ps
157 ps' = map (both fromIntegral) ps''
159 part :: (Eq p, Fractional p) => p -> p -> p
165 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
167 toProba xs = map (\(a,b) -> (a, part b total)) xs
169 total = sum $ map snd xs
172 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
173 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
175 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
176 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
178 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
179 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
181 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
182 opEvent f = DM.unionWith (op f)
184 ------------------------------------------------------------------------
186 emptyEventBook :: [Int] -> Int -> EventBook
187 emptyEventBook ns n = wordToBook ns n " "
189 wordsToBook :: [Int] -> Int -> String -> EventBook
190 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
192 ws = map unpack $ words $ pack txt
193 eventsBook = map (wordToBook ns n) ws
195 wordToBook :: [Int] -> Int -> Word -> EventBook
196 wordToBook ns n txt = EventBook ef en
198 chks = allChunks ns n txt
199 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
200 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
202 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
203 op f (EventBook ef1 en1)
204 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
205 (DM.unionWith f en1 en2)
207 ------------------------------------------------------------------------
208 ------------------------------------------------------------------------
209 allChunks :: [Int] -> Int -> String -> [[String]]
210 allChunks ns m st = map (\n -> chunks n m st) ns
212 -- | Chunks is the same function as splitBy in Context but for Strings,
213 -- not Text (without pack and unpack operations that are not needed).
214 chunks :: Int -> Int -> String -> [String]
215 chunks n m = DL.take m . filter (not . all (== ' '))
222 -- | String preparation
223 blanks :: String -> String
225 blanks xs = [' '] <> xs <> [' ']
229 -- Some previous tests to be removed
230 --import GHC.Base (Functor)
231 --import Numeric.Probability.Distribution ((??))
232 --import qualified Numeric.Probability.Distribution as D
234 -- | Blocks increase the size of the word to ease computations
235 -- some border and unexepected effects can happen, need to be tested
236 blockOf :: Int -> String -> String
237 blockOf n = DL.concat . DL.take n . DL.repeat
239 -- * Make the distributions
240 makeDist :: [String] -> D.T Double String
241 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
243 stopDist :: D.T Double String
244 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
246 candDist :: D.T Double String
247 candDist = makeDist candList
249 ------------------------------------------------------------------------
250 sumProba :: Num a => D.T a String -> [Char] -> a
251 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
253 -- | Get probability according a distribution
254 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
255 (~?) ds x = (==x) ?? ds
257 ------------------------------------------------------------------------
258 candidate :: [Char] -> Candidate
259 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
261 ------------------------------------------------------------------------
263 candList = [ "france", "alexandre", "mael", "constitution"
264 , "etats-unis", "associes", "car", "train", "spam"]