2 Module : Gargantext.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Stop words and (how to learn it).
12 Main type here is String.
16 {-# LANGUAGE NoImplicitPrelude #-}
17 {-# LANGUAGE OverloadedStrings #-}
19 module Gargantext.Text.Learn -- (detectLang, detectLangs, stopList)
22 --import Data.Char (toLower)
23 import qualified Data.List as DL
25 import Data.Maybe (maybe)
26 import Data.Map.Strict (Map, toList)
27 import qualified Data.Map.Strict as DM
29 import Data.String (String)
31 import Data.Text (Text)
32 import Data.Text (pack, unpack, toLower)
33 import Data.Tuple.Extra (both)
35 import Gargantext.Prelude
36 import Gargantext.Core (Lang(..), allLangs)
37 import Gargantext.Text.Terms.Mono (words)
38 import Gargantext.Text.Metrics.Count (occurrencesWith)
40 import qualified Gargantext.Text.Samples.FR as FR
41 import qualified Gargantext.Text.Samples.EN as EN
42 --import qualified Gargantext.Text.Samples.DE as DE
43 --import qualified Gargantext.Text.Samples.SP as SP
44 --import qualified Gargantext.Text.Samples.CH as CH
46 ------------------------------------------------------------------------
47 data Candidate = Candidate { stop :: Double
51 ------------------------------------------------------------------------
52 -- * Analyze candidate
58 data CatWord a = CatWord a Word
59 type CatProb a = Map a Double
61 type Events a = Map a EventBook
63 ------------------------------------------------------------------------
64 detectLangDefault :: Text -> Maybe Lang
65 detectLangDefault = detectCat 99 eventLang
67 eventLang :: Events Lang
68 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
70 langWord :: Lang -> CatWord Lang
71 langWord l = CatWord l (textSample l)
73 textSample :: Lang -> String
74 textSample EN = EN.textSample
75 textSample FR = FR.textSample
76 --textSample DE = DE.textSample
77 --textSample SP = SP.textSample
78 --textSample CH = CH.textSample
80 detectStopDefault :: Text -> Maybe Bool
81 detectStopDefault = undefined
83 detectDefault :: [(Bool, Text)] -> Text -> Maybe Bool
84 detectDefault events = detectCat 99 (priorEvents events)
86 priorEvents events' = toEvents True [0..2] 10 (map (\(a,b) -> CatWord a (unpack $ toLower b)) events')
88 ------------------------------------------------------------------------
89 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
90 detectCat n es = head . map fst . (detectCat' n es) . unpack
92 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
93 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
95 $ detectWith n' es' (wordsToBook [0..2] n' s)
98 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
99 detectWith n'' el (EventBook mapFreq _) =
102 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
103 $ filter (\x -> fst x /= " ")
106 -- | TODO: monoids (but proba >= 0)
107 toPrior :: Int -> String -> Events a -> [(a, Double)]
108 toPrior n'' s el = prior n'' $ pebLang s el
110 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
111 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
113 peb :: String -> EventBook -> (Freq, TotalFreq)
114 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
116 a = maybe 0 identity $ DM.lookup st mapFreq
117 b = maybe 1 identity $ DM.lookup (length st) mapN
120 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
121 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
122 (map (\(a,b) -> a / b) ps')
124 (ls, ps'') = DL.unzip ps
125 ps' = map (both fromIntegral) ps''
127 part :: (Eq p, Fractional p) => p -> p -> p
133 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
135 toProba xs = map (\(a,b) -> (a, part b total)) xs
137 total = sum $ map snd xs
140 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
141 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
143 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
144 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
146 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
147 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
149 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
150 opEvent f = DM.unionWith (op f)
152 ------------------------------------------------------------------------
153 ------------------------------------------------------------------------
154 data EventBook = EventBook { events_freq :: Map String Freq
155 , events_n :: Map StringSize TotalFreq
159 emptyEventBook :: [Int] -> Int -> EventBook
160 emptyEventBook ns n = wordToBook ns n " "
162 wordsToBook :: [Int] -> Int -> String -> EventBook
163 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
165 ws = map unpack $ words $ pack txt
166 eventsBook = map (wordToBook ns n) ws
168 wordToBook :: [Int] -> Int -> Word -> EventBook
169 wordToBook ns n txt = EventBook ef en
171 chks = allChunks ns n txt
172 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
173 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
175 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
176 op f (EventBook ef1 en1)
177 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
178 (DM.unionWith f en1 en2)
180 ------------------------------------------------------------------------
181 ------------------------------------------------------------------------
182 allChunks :: [Int] -> Int -> String -> [[String]]
183 allChunks ns m st = map (\n -> chunks n m st) ns
185 -- | Chunks is the same function as splitBy in Context but for Strings,
186 -- not Text (without pack and unpack operations that are not needed).
187 chunks :: Int -> Int -> String -> [String]
188 chunks n m = DL.take m . filter (not . all (== ' '))
195 -- | String preparation
196 blanks :: String -> String
198 blanks xs = [' '] <> xs <> [' ']
202 -- Some previous tests to be removed
203 --import GHC.Base (Functor)
204 --import Numeric.Probability.Distribution ((??))
205 --import qualified Numeric.Probability.Distribution as D
207 -- | Blocks increase the size of the word to ease computations
208 -- some border and unexepected effects can happen, need to be tested
209 blockOf :: Int -> String -> String
210 blockOf n = DL.concat . DL.take n . DL.repeat
212 -- * Make the distributions
213 makeDist :: [String] -> D.T Double String
214 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
216 stopDist :: D.T Double String
217 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
219 candDist :: D.T Double String
220 candDist = makeDist candList
222 ------------------------------------------------------------------------
223 sumProba :: Num a => D.T a String -> [Char] -> a
224 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
226 -- | Get probability according a distribution
227 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
228 (~?) ds x = (==x) ?? ds
230 ------------------------------------------------------------------------
231 candidate :: [Char] -> Candidate
232 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
234 ------------------------------------------------------------------------
236 candList = [ "france", "alexandre", "mael", "constitution"
237 , "etats-unis", "associes", "car", "train", "spam"]