2 Module : Gargantext.Core.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 - generalize to byteString
12 - Stop words and (how to learn it).
13 - Main type here is String check if Chars on Text would be optimized
17 {-# LANGUAGE TypeSynonymInstances #-}
19 module Gargantext.Core.Text.Learn -- (detectLang, detectLangs, stopList)
22 import Codec.Serialise
23 import qualified Data.List as DL
25 import Data.Map.Strict (Map, toList)
26 import qualified Data.Map.Strict as DM
29 import Data.String (String)
31 import Data.Text (Text)
32 import Data.Text (pack, unpack, toLower)
33 import Data.Tuple.Extra (both)
34 import qualified Data.ByteString.Lazy as BSL
36 import Gargantext.Prelude
37 import Gargantext.Prelude.GargDB
38 import Gargantext.Core (Lang(..), allLangs)
39 import Gargantext.Core.Text.Terms.Mono (words)
40 import Gargantext.Core.Text.Metrics.Count (occurrencesWith)
42 import qualified Gargantext.Core.Text.Samples.FR as FR
43 import qualified Gargantext.Core.Text.Samples.EN as EN
44 --import qualified Gargantext.Core.Text.Samples.DE as DE
45 --import qualified Gargantext.Core.Text.Samples.SP as SP
46 --import qualified Gargantext.Core.Text.Samples.CH as CH
48 ------------------------------------------------------------------------
49 data Candidate = Candidate { stop :: Double
53 ------------------------------------------------------------------------
54 -- * Analyze candidate
60 data CatWord a = CatWord a Word
61 type CatProb a = Map a Double
63 type Events a = Map a EventBook
64 ------------------------------------------------------------------------
65 data EventBook = EventBook { events_freq :: Map String Freq
66 , events_n :: Map StringSize TotalFreq
68 deriving (Show, Generic)
70 instance Serialise EventBook
72 instance (Serialise a, Ord a) => SaveFile (Events a) where
73 saveFile' f d = BSL.writeFile f (serialise d)
75 instance (Serialise a, Ord a) => ReadFile (Events a) where
76 readFile' filepath = deserialise <$> BSL.readFile filepath
78 ------------------------------------------------------------------------
79 detectStopDefault :: Text -> Maybe Bool
80 detectStopDefault = undefined
82 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
83 detectBool events = detectDefault False events
85 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
86 detectDefault = detectDefaultWith identity
88 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
89 detectDefaultWith f d events = detectDefaultWithPriors f ps
91 ps = priorEventsWith f d events
93 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
94 detectDefaultWithPriors f priors = detectCat 99 priors . f
96 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
97 priorEventsWith f d e = toEvents d [0..2] 10 es
99 es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
102 ------------------------------------------------------------------------
103 detectLangDefault :: Text -> Maybe Lang
104 detectLangDefault = detectCat 99 eventLang
106 eventLang :: Events Lang
107 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
109 langWord :: Lang -> CatWord Lang
110 langWord l = CatWord l (textSample l)
112 textSample :: Lang -> String
113 textSample EN = EN.textSample
114 textSample FR = FR.textSample
115 textSample _ = panic "textSample: not impl yet"
116 --textSample DE = DE.textSample
117 --textSample SP = SP.textSample
118 --textSample CH = CH.textSample
119 ------------------------------------------------------------------------
120 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
121 detectCat n es = head . map fst . (detectCat' n es) . unpack
123 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
124 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
126 $ detectWith n' es' (wordsToBook [0..2] n' s)
129 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
130 detectWith n'' el (EventBook mapFreq _) =
133 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
134 $ filter (\x -> fst x /= " ")
137 -- | TODO: monoids (but proba >= 0)
138 toPrior :: Int -> String -> Events a -> [(a, Double)]
139 toPrior n'' s el = prior n'' $ pebLang s el
141 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
142 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
144 peb :: String -> EventBook -> (Freq, TotalFreq)
145 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
147 a = maybe 0 identity $ DM.lookup st mapFreq
148 b = maybe 1 identity $ DM.lookup (length st) mapN
151 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
152 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
153 (map (\(a,b) -> a / b) ps')
155 (ls, ps'') = DL.unzip ps
156 ps' = map (both fromIntegral) ps''
158 part :: (Eq p, Fractional p) => p -> p -> p
164 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
166 toProba xs = map (\(a,b) -> (a, part b total)) xs
168 total = sum $ map snd xs
171 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
172 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
174 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
175 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
177 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
178 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
180 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
181 opEvent f = DM.unionWith (op f)
183 ------------------------------------------------------------------------
185 emptyEventBook :: [Int] -> Int -> EventBook
186 emptyEventBook ns n = wordToBook ns n " "
188 wordsToBook :: [Int] -> Int -> String -> EventBook
189 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
191 ws = map unpack $ words $ pack txt
192 eventsBook = map (wordToBook ns n) ws
194 wordToBook :: [Int] -> Int -> Word -> EventBook
195 wordToBook ns n txt = EventBook ef en
197 chks = allChunks ns n txt
198 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
199 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
201 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
202 op f (EventBook ef1 en1)
203 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
204 (DM.unionWith f en1 en2)
206 ------------------------------------------------------------------------
207 ------------------------------------------------------------------------
208 allChunks :: [Int] -> Int -> String -> [[String]]
209 allChunks ns m st = map (\n -> chunks n m st) ns
211 -- | Chunks is the same function as splitBy in Context but for Strings,
212 -- not Text (without pack and unpack operations that are not needed).
213 chunks :: Int -> Int -> String -> [String]
214 chunks n m = DL.take m . filter (not . all (== ' '))
221 -- | String preparation
222 blanks :: String -> String
224 blanks xs = [' '] <> xs <> [' ']
228 -- Some previous tests to be removed
229 --import GHC.Base (Functor)
230 --import Numeric.Probability.Distribution ((??))
231 --import qualified Numeric.Probability.Distribution as D
233 -- | Blocks increase the size of the word to ease computations
234 -- some border and unexepected effects can happen, need to be tested
235 blockOf :: Int -> String -> String
236 blockOf n = DL.concat . DL.take n . DL.repeat
238 -- * Make the distributions
239 makeDist :: [String] -> D.T Double String
240 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
242 stopDist :: D.T Double String
243 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
245 candDist :: D.T Double String
246 candDist = makeDist candList
248 ------------------------------------------------------------------------
249 sumProba :: Num a => D.T a String -> [Char] -> a
250 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
252 -- | Get probability according a distribution
253 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
254 (~?) ds x = (==x) ?? ds
256 ------------------------------------------------------------------------
257 candidate :: [Char] -> Candidate
258 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
260 ------------------------------------------------------------------------
262 candList = [ "france", "alexandre", "mael", "constitution"
263 , "etats-unis", "associes", "car", "train", "spam"]