2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 - generalize to byteString
12 - Stop words and (how to learn it).
13 - Main type here is String check if Chars on Text would be optimized
17 {-# LANGUAGE NoImplicitPrelude #-}
18 {-# LANGUAGE OverloadedStrings #-}
19 {-# LANGUAGE DeriveGeneric #-}
20 {-# LANGUAGE TypeSynonymInstances #-}
21 {-# LANGUAGE FlexibleInstances #-}
23 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
26 import Codec.Serialise
27 import qualified Data.List as DL
29 import Data.Maybe (maybe)
30 import Data.Map.Strict (Map, toList)
31 import qualified Data.Map.Strict as DM
34 import Data.String (String)
36 import Data.Text (Text)
37 import Data.Text (pack, unpack, toLower)
38 import Data.Tuple.Extra (both)
39 import qualified Data.ByteString.Lazy as BSL
41 import Gargantext.Prelude
42 import Gargantext.Prelude.Utils
43 import Gargantext.Core (Lang(..), allLangs)
44 import Gargantext.Text.Terms.Mono (words)
45 import Gargantext.Text.Metrics.Count (occurrencesWith)
47 import qualified Gargantext.Text.Samples.FR as FR
48 import qualified Gargantext.Text.Samples.EN as EN
49 --import qualified Gargantext.Text.Samples.DE as DE
50 --import qualified Gargantext.Text.Samples.SP as SP
51 --import qualified Gargantext.Text.Samples.CH as CH
53 ------------------------------------------------------------------------
54 data Candidate = Candidate { stop :: Double
58 ------------------------------------------------------------------------
59 -- * Analyze candidate
65 data CatWord a = CatWord a Word
66 type CatProb a = Map a Double
68 type Events a = Map a EventBook
69 ------------------------------------------------------------------------
70 data EventBook = EventBook { events_freq :: Map String Freq
71 , events_n :: Map StringSize TotalFreq
73 deriving (Show, Generic)
75 instance Serialise EventBook
77 instance (Serialise a, Ord a) => SaveFile (Events a) where
78 saveFile' f d = BSL.writeFile f (serialise d)
80 instance (Serialise a, Ord a) => ReadFile (Events a) where
81 readFile' filepath = deserialise <$> BSL.readFile filepath
83 ------------------------------------------------------------------------
84 detectStopDefault :: Text -> Maybe Bool
85 detectStopDefault = undefined
87 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
88 detectBool events = detectDefault False events
90 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
91 detectDefault = detectDefaultWith identity
93 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
94 detectDefaultWith f d events = detectDefaultWithPriors f ps
96 ps = priorEventsWith f d events
98 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
99 detectDefaultWithPriors f priors = detectCat 99 priors . f
101 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
102 priorEventsWith f d e = toEvents d [0..2] 10 es
104 es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
107 ------------------------------------------------------------------------
108 detectLangDefault :: Text -> Maybe Lang
109 detectLangDefault = detectCat 99 eventLang
111 eventLang :: Events Lang
112 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
114 langWord :: Lang -> CatWord Lang
115 langWord l = CatWord l (textSample l)
117 textSample :: Lang -> String
118 textSample EN = EN.textSample
119 textSample FR = FR.textSample
120 textSample _ = panic "textSample: not impl yet"
121 --textSample DE = DE.textSample
122 --textSample SP = SP.textSample
123 --textSample CH = CH.textSample
124 ------------------------------------------------------------------------
125 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
126 detectCat n es = head . map fst . (detectCat' n es) . unpack
128 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
129 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
131 $ detectWith n' es' (wordsToBook [0..2] n' s)
134 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
135 detectWith n'' el (EventBook mapFreq _) =
138 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
139 $ filter (\x -> fst x /= " ")
142 -- | TODO: monoids (but proba >= 0)
143 toPrior :: Int -> String -> Events a -> [(a, Double)]
144 toPrior n'' s el = prior n'' $ pebLang s el
146 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
147 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
149 peb :: String -> EventBook -> (Freq, TotalFreq)
150 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
152 a = maybe 0 identity $ DM.lookup st mapFreq
153 b = maybe 1 identity $ DM.lookup (length st) mapN
156 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
157 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
158 (map (\(a,b) -> a / b) ps')
160 (ls, ps'') = DL.unzip ps
161 ps' = map (both fromIntegral) ps''
163 part :: (Eq p, Fractional p) => p -> p -> p
169 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
171 toProba xs = map (\(a,b) -> (a, part b total)) xs
173 total = sum $ map snd xs
176 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
177 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
179 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
180 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
182 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
183 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
185 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
186 opEvent f = DM.unionWith (op f)
188 ------------------------------------------------------------------------
190 emptyEventBook :: [Int] -> Int -> EventBook
191 emptyEventBook ns n = wordToBook ns n " "
193 wordsToBook :: [Int] -> Int -> String -> EventBook
194 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
196 ws = map unpack $ words $ pack txt
197 eventsBook = map (wordToBook ns n) ws
199 wordToBook :: [Int] -> Int -> Word -> EventBook
200 wordToBook ns n txt = EventBook ef en
202 chks = allChunks ns n txt
203 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
204 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
206 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
207 op f (EventBook ef1 en1)
208 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
209 (DM.unionWith f en1 en2)
211 ------------------------------------------------------------------------
212 ------------------------------------------------------------------------
213 allChunks :: [Int] -> Int -> String -> [[String]]
214 allChunks ns m st = map (\n -> chunks n m st) ns
216 -- | Chunks is the same function as splitBy in Context but for Strings,
217 -- not Text (without pack and unpack operations that are not needed).
218 chunks :: Int -> Int -> String -> [String]
219 chunks n m = DL.take m . filter (not . all (== ' '))
226 -- | String preparation
227 blanks :: String -> String
229 blanks xs = [' '] <> xs <> [' ']
233 -- Some previous tests to be removed
234 --import GHC.Base (Functor)
235 --import Numeric.Probability.Distribution ((??))
236 --import qualified Numeric.Probability.Distribution as D
238 -- | Blocks increase the size of the word to ease computations
239 -- some border and unexepected effects can happen, need to be tested
240 blockOf :: Int -> String -> String
241 blockOf n = DL.concat . DL.take n . DL.repeat
243 -- * Make the distributions
244 makeDist :: [String] -> D.T Double String
245 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
247 stopDist :: D.T Double String
248 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
250 candDist :: D.T Double String
251 candDist = makeDist candList
253 ------------------------------------------------------------------------
254 sumProba :: Num a => D.T a String -> [Char] -> a
255 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
257 -- | Get probability according a distribution
258 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
259 (~?) ds x = (==x) ?? ds
261 ------------------------------------------------------------------------
262 candidate :: [Char] -> Candidate
263 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
265 ------------------------------------------------------------------------
267 candList = [ "france", "alexandre", "mael", "constitution"
268 , "etats-unis", "associes", "car", "train", "spam"]