2 Module : Gargantext.Core.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 - generalize to byteString
12 - Stop words and (how to learn it).
13 - Main type here is String check if Chars on Text would be optimized
17 {-# LANGUAGE TypeSynonymInstances #-}
19 module Gargantext.Core.Text.Learn -- (detectLang, detectLangs, stopList)
22 import Codec.Serialise
23 import qualified Data.List as DL
25 import Data.Map.Strict (Map, toList)
26 import qualified Data.Map.Strict as DM
29 import Data.String (String)
31 import Data.Text (Text)
32 import Data.Text (pack, unpack, toLower)
33 import Data.Tuple.Extra (both)
34 import qualified Data.ByteString.Lazy as BSL
36 import Gargantext.Prelude
37 import Gargantext.Database.GargDB
38 import Gargantext.Core (Lang(..), allLangs)
39 import Gargantext.Core.Text.Terms.Mono (words)
40 import Gargantext.Core.Text.Metrics.Count (occurrencesWith)
42 import qualified Gargantext.Core.Text.Samples.ZH as ZH
43 import qualified Gargantext.Core.Text.Samples.DE as DE
44 import qualified Gargantext.Core.Text.Samples.EN as EN
45 import qualified Gargantext.Core.Text.Samples.ES as ES
46 import qualified Gargantext.Core.Text.Samples.FR as FR
47 import qualified Gargantext.Core.Text.Samples.PL as PL
49 ------------------------------------------------------------------------
50 data Candidate = Candidate { stop :: Double
54 ------------------------------------------------------------------------
55 -- * Analyze candidate
61 data CatWord a = CatWord a Word
62 type CatProb a = Map a Double
64 type Events a = Map a EventBook
65 ------------------------------------------------------------------------
66 data EventBook = EventBook { events_freq :: Map String Freq
67 , events_n :: Map StringSize TotalFreq
69 deriving (Show, Generic)
71 instance Serialise EventBook
73 instance (Serialise a, Ord a) => SaveFile (Events a) where
74 saveFile' f d = BSL.writeFile f (serialise d)
76 instance (Serialise a, Ord a) => ReadFile (Events a) where
77 readFile' filepath = deserialise <$> BSL.readFile filepath
79 ------------------------------------------------------------------------
80 detectStopDefault :: Text -> Maybe Bool
81 detectStopDefault = undefined
83 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
84 detectBool events = detectDefault False events
86 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
87 detectDefault = detectDefaultWith identity
89 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
90 detectDefaultWith f d events = detectDefaultWithPriors f ps
92 ps = priorEventsWith f d events
94 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
95 detectDefaultWithPriors f priors = detectCat 99 priors . f
97 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
98 priorEventsWith f d e = toEvents d [0..2] 10 es
100 es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
103 ------------------------------------------------------------------------
104 detectLangDefault :: Text -> Maybe Lang
105 detectLangDefault = detectCat 99 eventLang
107 eventLang :: Events Lang
108 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
110 langWord :: Lang -> CatWord Lang
111 langWord l = CatWord l (textSample l)
113 textSample :: Lang -> String
114 textSample EN = EN.textSample
115 textSample FR = FR.textSample
116 textSample DE = DE.textSample
117 textSample ES = ES.textSample
118 textSample ZH = ZH.textSample
119 textSample PL = PL.textSample
120 textSample _ = panic "[G.C.T.L:detectLangDefault] not impl yet"
121 --textSample DE = DE.textSample
122 --textSample SP = SP.textSample
123 --textSample CH = CH.textSample
124 ------------------------------------------------------------------------
125 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
126 detectCat n es = head . map fst . (detectCat' n es) . unpack
128 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
129 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
131 $ detectWith n' es' (wordsToBook [0..2] n' s)
134 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
135 detectWith n'' el (EventBook mapFreq _) =
138 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
139 $ filter (\x -> fst x /= " ")
142 -- | TODO: monoids (but proba >= 0)
143 toPrior :: Int -> String -> Events a -> [(a, Double)]
144 toPrior n'' s el = prior n'' $ pebLang s el
146 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
147 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
149 peb :: String -> EventBook -> (Freq, TotalFreq)
150 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
152 a = maybe 0 identity $ DM.lookup st mapFreq
153 b = maybe 1 identity $ DM.lookup (length st) mapN
156 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
157 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
158 (map (\(a,b) -> a / b) ps')
160 (ls, ps'') = DL.unzip ps
161 ps' = map (both fromIntegral) ps''
163 part :: (Eq p, Fractional p) => p -> p -> p
169 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
171 toProba xs = map (\(a,b) -> (a, part b total)) xs
173 total = sum $ map snd xs
176 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
177 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
179 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
180 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
182 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
183 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
185 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
186 opEvent f = DM.unionWith (op f)
188 ------------------------------------------------------------------------
190 emptyEventBook :: [Int] -> Int -> EventBook
191 emptyEventBook ns n = wordToBook ns n " "
193 wordsToBook :: [Int] -> Int -> String -> EventBook
194 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
196 ws = map unpack $ words $ pack txt
197 eventsBook = map (wordToBook ns n) ws
199 wordToBook :: [Int] -> Int -> Word -> EventBook
200 wordToBook ns n txt = EventBook ef en
202 chks = allChunks ns n txt
203 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
204 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
206 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
207 op f (EventBook ef1 en1)
208 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
209 (DM.unionWith f en1 en2)
211 ------------------------------------------------------------------------
212 ------------------------------------------------------------------------
213 allChunks :: [Int] -> Int -> String -> [[String]]
214 allChunks ns m st = map (\n -> chunks n m st) ns
216 -- | Chunks is the same function as splitBy in Context but for Strings,
217 -- not Text (without pack and unpack operations that are not needed).
218 chunks :: Int -> Int -> String -> [String]
219 chunks n m = DL.take m . filter (not . all (== ' '))
226 -- | String preparation
227 blanks :: String -> String
229 blanks xs = [' '] <> xs <> [' ']
233 -- Some previous tests to be removed
234 --import GHC.Base (Functor)
235 --import Numeric.Probability.Distribution ((??))
236 --import qualified Numeric.Probability.Distribution as D
238 -- | Blocks increase the size of the word to ease computations
239 -- some border and unexepected effects can happen, need to be tested
240 blockOf :: Int -> String -> String
241 blockOf n = DL.concat . DL.take n . DL.repeat
243 -- * Make the distributions
244 makeDist :: [String] -> D.T Double String
245 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
247 stopDist :: D.T Double String
248 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
250 candDist :: D.T Double String
251 candDist = makeDist candList
253 ------------------------------------------------------------------------
254 sumProba :: Num a => D.T a String -> [Char] -> a
255 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
257 -- | Get probability according a distribution
258 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
259 (~?) ds x = (==x) ?? ds
261 ------------------------------------------------------------------------
262 candidate :: [Char] -> Candidate
263 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
265 ------------------------------------------------------------------------
267 candList = [ "france", "alexandre", "mael", "constitution"
268 , "etats-unis", "associes", "car", "train", "spam"]