2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 - generalize to byteString
13 Stop words and (how to learn it).
15 Main type here is String.
19 {-# LANGUAGE NoImplicitPrelude #-}
20 {-# LANGUAGE OverloadedStrings #-}
21 {-# LANGUAGE DeriveGeneric #-}
22 {-# LANGUAGE TypeSynonymInstances #-}
23 {-# LANGUAGE FlexibleInstances #-}
25 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
28 import Codec.Serialise
29 import qualified Data.List as DL
31 import Data.Maybe (maybe)
32 import Data.Map.Strict (Map, toList)
33 import qualified Data.Map.Strict as DM
36 import Data.String (String)
38 import Data.Text (Text)
39 import Data.Text (pack, unpack, toLower)
40 import Data.Tuple.Extra (both)
41 import qualified Data.ByteString.Lazy as BSL
43 import Gargantext.Prelude
44 import Gargantext.Prelude.Utils
45 import Gargantext.Core (Lang(..), allLangs)
46 import Gargantext.Text.Terms.Mono (words)
47 import Gargantext.Text.Metrics.Count (occurrencesWith)
49 import qualified Gargantext.Text.Samples.FR as FR
50 import qualified Gargantext.Text.Samples.EN as EN
51 --import qualified Gargantext.Text.Samples.DE as DE
52 --import qualified Gargantext.Text.Samples.SP as SP
53 --import qualified Gargantext.Text.Samples.CH as CH
55 ------------------------------------------------------------------------
56 data Candidate = Candidate { stop :: Double
60 ------------------------------------------------------------------------
61 -- * Analyze candidate
67 data CatWord a = CatWord a Word
68 type CatProb a = Map a Double
70 type Events a = Map a EventBook
72 ------------------------------------------------------------------------
73 detectStopDefault :: Text -> Maybe Bool
74 detectStopDefault = undefined
76 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
77 detectBool events = detectDefault False events
79 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
80 detectDefault = detectDefaultWith identity
82 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
83 detectDefaultWith f d events = detectDefaultWithPriors f ps
85 ps = priorEventsWith f d events
87 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
88 detectDefaultWithPriors f priors = detectCat 99 priors . f
90 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
91 priorEventsWith f d e = toEvents d [0..2] 10 es
93 es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
96 ------------------------------------------------------------------------
97 detectLangDefault :: Text -> Maybe Lang
98 detectLangDefault = detectCat 99 eventLang
100 eventLang :: Events Lang
101 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
103 langWord :: Lang -> CatWord Lang
104 langWord l = CatWord l (textSample l)
106 textSample :: Lang -> String
107 textSample EN = EN.textSample
108 textSample FR = FR.textSample
109 --textSample DE = DE.textSample
110 --textSample SP = SP.textSample
111 --textSample CH = CH.textSample
112 ------------------------------------------------------------------------
113 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
114 detectCat n es = head . map fst . (detectCat' n es) . unpack
116 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
117 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
119 $ detectWith n' es' (wordsToBook [0..2] n' s)
122 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
123 detectWith n'' el (EventBook mapFreq _) =
126 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
127 $ filter (\x -> fst x /= " ")
130 -- | TODO: monoids (but proba >= 0)
131 toPrior :: Int -> String -> Events a -> [(a, Double)]
132 toPrior n'' s el = prior n'' $ pebLang s el
134 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
135 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
137 peb :: String -> EventBook -> (Freq, TotalFreq)
138 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
140 a = maybe 0 identity $ DM.lookup st mapFreq
141 b = maybe 1 identity $ DM.lookup (length st) mapN
144 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
145 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
146 (map (\(a,b) -> a / b) ps')
148 (ls, ps'') = DL.unzip ps
149 ps' = map (both fromIntegral) ps''
151 part :: (Eq p, Fractional p) => p -> p -> p
157 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
159 toProba xs = map (\(a,b) -> (a, part b total)) xs
161 total = sum $ map snd xs
164 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
165 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
167 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
168 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
170 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
171 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
173 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
174 opEvent f = DM.unionWith (op f)
176 ------------------------------------------------------------------------
177 ------------------------------------------------------------------------
178 data EventBook = EventBook { events_freq :: Map String Freq
179 , events_n :: Map StringSize TotalFreq
181 deriving (Show, Generic)
183 instance Serialise EventBook
185 instance (Serialise a, Ord a) => SaveFile (Events a) where
186 saveFile' f d = BSL.writeFile f (serialise d)
188 instance (Serialise a, Ord a) => ReadFile (Events a) where
189 readFile' filepath = deserialise <$> BSL.readFile filepath
192 emptyEventBook :: [Int] -> Int -> EventBook
193 emptyEventBook ns n = wordToBook ns n " "
195 wordsToBook :: [Int] -> Int -> String -> EventBook
196 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
198 ws = map unpack $ words $ pack txt
199 eventsBook = map (wordToBook ns n) ws
201 wordToBook :: [Int] -> Int -> Word -> EventBook
202 wordToBook ns n txt = EventBook ef en
204 chks = allChunks ns n txt
205 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
206 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
208 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
209 op f (EventBook ef1 en1)
210 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
211 (DM.unionWith f en1 en2)
213 ------------------------------------------------------------------------
214 ------------------------------------------------------------------------
215 allChunks :: [Int] -> Int -> String -> [[String]]
216 allChunks ns m st = map (\n -> chunks n m st) ns
218 -- | Chunks is the same function as splitBy in Context but for Strings,
219 -- not Text (without pack and unpack operations that are not needed).
220 chunks :: Int -> Int -> String -> [String]
221 chunks n m = DL.take m . filter (not . all (== ' '))
228 -- | String preparation
229 blanks :: String -> String
231 blanks xs = [' '] <> xs <> [' ']
235 -- Some previous tests to be removed
236 --import GHC.Base (Functor)
237 --import Numeric.Probability.Distribution ((??))
238 --import qualified Numeric.Probability.Distribution as D
240 -- | Blocks increase the size of the word to ease computations
241 -- some border and unexepected effects can happen, need to be tested
242 blockOf :: Int -> String -> String
243 blockOf n = DL.concat . DL.take n . DL.repeat
245 -- * Make the distributions
246 makeDist :: [String] -> D.T Double String
247 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
249 stopDist :: D.T Double String
250 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
252 candDist :: D.T Double String
253 candDist = makeDist candList
255 ------------------------------------------------------------------------
256 sumProba :: Num a => D.T a String -> [Char] -> a
257 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
259 -- | Get probability according a distribution
260 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
261 (~?) ds x = (==x) ?? ds
263 ------------------------------------------------------------------------
264 candidate :: [Char] -> Candidate
265 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
267 ------------------------------------------------------------------------
269 candList = [ "france", "alexandre", "mael", "constitution"
270 , "etats-unis", "associes", "car", "train", "spam"]