]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Learn.hs
[FIX/FEAT] Langs
[gargantext.git] / src / Gargantext / Core / Text / Learn.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.Stop
3 Description : Mono Terms module
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 TODO:
11 - generalize to byteString
12 - Stop words and (how to learn it).
13 - Main type here is String check if Chars on Text would be optimized
14
15 -}
16
17 {-# LANGUAGE TypeSynonymInstances #-}
18
19 module Gargantext.Core.Text.Learn -- (detectLang, detectLangs, stopList)
20 where
21
22 import Codec.Serialise
23 import qualified Data.List as DL
24
25 import Data.Map.Strict (Map, toList)
26 import qualified Data.Map.Strict as DM
27
28 import GHC.Generics
29 import Data.String (String)
30
31 import Data.Text (Text)
32 import Data.Text (pack, unpack, toLower)
33 import Data.Tuple.Extra (both)
34 import qualified Data.ByteString.Lazy as BSL
35
36 import Gargantext.Prelude
37 import Gargantext.Database.GargDB
38 import Gargantext.Core (Lang(..), allLangs)
39 import Gargantext.Core.Text.Terms.Mono (words)
40 import Gargantext.Core.Text.Metrics.Count (occurrencesWith)
41
42 import qualified Gargantext.Core.Text.Samples.ZH as ZH
43 import qualified Gargantext.Core.Text.Samples.DE as DE
44 import qualified Gargantext.Core.Text.Samples.EN as EN
45 import qualified Gargantext.Core.Text.Samples.ES as ES
46 import qualified Gargantext.Core.Text.Samples.FR as FR
47 import qualified Gargantext.Core.Text.Samples.PL as PL
48
49 ------------------------------------------------------------------------
50 data Candidate = Candidate { stop :: Double
51 , noStop :: Double
52 } deriving (Show)
53
54 ------------------------------------------------------------------------
55 -- * Analyze candidate
56 type StringSize = Int
57 type TotalFreq = Int
58 type Freq = Int
59 type Word = String
60
61 data CatWord a = CatWord a Word
62 type CatProb a = Map a Double
63
64 type Events a = Map a EventBook
65 ------------------------------------------------------------------------
66 data EventBook = EventBook { events_freq :: Map String Freq
67 , events_n :: Map StringSize TotalFreq
68 }
69 deriving (Show, Generic)
70
71 instance Serialise EventBook
72
73 instance (Serialise a, Ord a) => SaveFile (Events a) where
74 saveFile' f d = BSL.writeFile f (serialise d)
75
76 instance (Serialise a, Ord a) => ReadFile (Events a) where
77 readFile' filepath = deserialise <$> BSL.readFile filepath
78
79 ------------------------------------------------------------------------
80 detectStopDefault :: Text -> Maybe Bool
81 detectStopDefault = undefined
82
83 detectBool :: [(Bool, Text)] -> Text -> Maybe Bool
84 detectBool events = detectDefault False events
85
86 detectDefault :: Ord a => a -> [(a, Text)] -> Text -> Maybe a
87 detectDefault = detectDefaultWith identity
88
89 detectDefaultWith :: Ord a => (b -> Text) -> a -> [(a, b)] -> b -> Maybe a
90 detectDefaultWith f d events = detectDefaultWithPriors f ps
91 where
92 ps = priorEventsWith f d events
93
94 detectDefaultWithPriors :: Ord b => (a -> Text) -> Events b -> a -> Maybe b
95 detectDefaultWithPriors f priors = detectCat 99 priors . f
96
97 priorEventsWith :: Ord a => (t -> Text) -> a -> [(a, t)] -> Events a
98 priorEventsWith f d e = toEvents d [0..2] 10 es
99 where
100 es = map (\(a,b) -> CatWord a (unpack $ toLower $ f b)) e
101
102
103 ------------------------------------------------------------------------
104 detectLangDefault :: Text -> Maybe Lang
105 detectLangDefault = detectCat 99 eventLang
106 where
107 eventLang :: Events Lang
108 eventLang = toEvents FR [0..2] 10 [ langWord l | l <- allLangs ]
109
110 langWord :: Lang -> CatWord Lang
111 langWord l = CatWord l (textSample l)
112
113 textSample :: Lang -> String
114 textSample EN = EN.textSample
115 textSample FR = FR.textSample
116 textSample DE = DE.textSample
117 textSample ES = ES.textSample
118 textSample ZH = ZH.textSample
119 textSample PL = PL.textSample
120 textSample _ = panic "[G.C.T.L:detectLangDefault] not impl yet"
121 --textSample DE = DE.textSample
122 --textSample SP = SP.textSample
123 --textSample CH = CH.textSample
124 ------------------------------------------------------------------------
125 detectCat :: Ord a => Int -> Events a -> Text -> Maybe a
126 detectCat n es = head . map fst . (detectCat' n es) . unpack
127 where
128 detectCat' :: Ord a => Int -> Events a -> String -> [(a, Double)]
129 detectCat' n' es' s = DL.reverse $ DL.sortOn snd
130 $ toList
131 $ detectWith n' es' (wordsToBook [0..2] n' s)
132
133
134 detectWith :: Ord a => Int -> Events a -> EventBook -> CatProb a
135 detectWith n'' el (EventBook mapFreq _) =
136 DM.unionsWith (+)
137 $ map DM.fromList
138 $ map (\(s,m) -> map (\(l,f) -> (l, (fromIntegral m) * f)) $ toPrior n'' s el)
139 $ filter (\x -> fst x /= " ")
140 $ DM.toList mapFreq
141
142 -- | TODO: monoids (but proba >= 0)
143 toPrior :: Int -> String -> Events a -> [(a, Double)]
144 toPrior n'' s el = prior n'' $ pebLang s el
145 where
146 pebLang :: String -> Events a -> [(a, (Freq,TotalFreq))]
147 pebLang st = map (\(l,eb) -> (l, peb st eb)) . DM.toList
148
149 peb :: String -> EventBook -> (Freq, TotalFreq)
150 peb st (EventBook mapFreq mapN) = (fromIntegral a, fromIntegral b)
151 where
152 a = maybe 0 identity $ DM.lookup st mapFreq
153 b = maybe 1 identity $ DM.lookup (length st) mapN
154
155
156 prior :: Int -> [(a, (Freq, TotalFreq))] -> [(a, Double)]
157 prior i ps = zip ls $ zipWith (\x y -> x^i * y) (map (\(a,_) -> part a (sum $ map fst ps')) ps')
158 (map (\(a,b) -> a / b) ps')
159 where
160 (ls, ps'') = DL.unzip ps
161 ps' = map (both fromIntegral) ps''
162
163 part :: (Eq p, Fractional p) => p -> p -> p
164 part 0 _ = 0
165 part _ 0 = 0
166 part x y = x / y
167
168 {-
169 toProba :: (Eq b, Fractional b, Functor t, Foldable t) =>
170 t (a, b) -> t (a, b)
171 toProba xs = map (\(a,b) -> (a, part b total)) xs
172 where
173 total = sum $ map snd xs
174 -}
175 -- | TODO: monoids
176 toEvents :: Ord a => a -> [Int] -> Int -> [CatWord a] -> Events a
177 toEvents e ns n = foldl' (opEvent (+)) (emptyEvent e ns n) . map (toEvent ns n)
178 where
179 emptyEvent :: Ord a => a -> [Int] -> Int -> Events a
180 emptyEvent e' ns' n'= toEvent ns' n' (CatWord e' "")
181
182 toEvent :: Ord a => [Int] -> Int -> CatWord a -> Events a
183 toEvent ns'' n'' (CatWord l txt) = DM.fromList [(l, wordsToBook ns'' n'' txt)]
184
185 opEvent :: Ord a => (Freq -> Freq -> Freq) -> Events a -> Events a -> Events a
186 opEvent f = DM.unionWith (op f)
187
188 ------------------------------------------------------------------------
189
190 emptyEventBook :: [Int] -> Int -> EventBook
191 emptyEventBook ns n = wordToBook ns n " "
192
193 wordsToBook :: [Int] -> Int -> String -> EventBook
194 wordsToBook ns n txt = foldl' (op (+)) (emptyEventBook ns n) eventsBook
195 where
196 ws = map unpack $ words $ pack txt
197 eventsBook = map (wordToBook ns n) ws
198
199 wordToBook :: [Int] -> Int -> Word -> EventBook
200 wordToBook ns n txt = EventBook ef en
201 where
202 chks = allChunks ns n txt
203 en = DM.fromList $ map (\(n',ns') -> (n', length ns')) $ zip ns chks
204 ef = foldl' DM.union DM.empty $ map (occurrencesWith identity) chks
205
206 op :: (Freq -> Freq -> Freq) -> EventBook -> EventBook -> EventBook
207 op f (EventBook ef1 en1)
208 (EventBook ef2 en2) = EventBook (DM.unionWith f ef1 ef2)
209 (DM.unionWith f en1 en2)
210
211 ------------------------------------------------------------------------
212 ------------------------------------------------------------------------
213 allChunks :: [Int] -> Int -> String -> [[String]]
214 allChunks ns m st = map (\n -> chunks n m st) ns
215
216 -- | Chunks is the same function as splitBy in Context but for Strings,
217 -- not Text (without pack and unpack operations that are not needed).
218 chunks :: Int -> Int -> String -> [String]
219 chunks n m = DL.take m . filter (not . all (== ' '))
220 . chunkAlong (n+1) 1
221 . DL.concat
222 . DL.take 1000
223 . DL.repeat
224 . blanks
225
226 -- | String preparation
227 blanks :: String -> String
228 blanks [] = []
229 blanks xs = [' '] <> xs <> [' ']
230
231
232 {-
233 -- Some previous tests to be removed
234 --import GHC.Base (Functor)
235 --import Numeric.Probability.Distribution ((??))
236 --import qualified Numeric.Probability.Distribution as D
237
238 -- | Blocks increase the size of the word to ease computations
239 -- some border and unexepected effects can happen, need to be tested
240 blockOf :: Int -> String -> String
241 blockOf n = DL.concat . DL.take n . DL.repeat
242
243 -- * Make the distributions
244 makeDist :: [String] -> D.T Double String
245 makeDist = D.uniform . DL.concat . map (DL.concat . allChunks [0,2] 10)
246
247 stopDist :: D.T Double String
248 stopDist = makeDist $ map show ([0..9]::[Int]) <> EN.stopList
249
250 candDist :: D.T Double String
251 candDist = makeDist candList
252
253 ------------------------------------------------------------------------
254 sumProba :: Num a => D.T a String -> [Char] -> a
255 sumProba ds x = sum $ map ((~?) ds) $ DL.concat $ allChunks [0,2] 10 $ map toLower x
256
257 -- | Get probability according a distribution
258 (~?) :: (Num prob, Eq a) => D.T prob a -> a -> prob
259 (~?) ds x = (==x) ?? ds
260
261 ------------------------------------------------------------------------
262 candidate :: [Char] -> Candidate
263 candidate x = Candidate (sumProba stopDist x) (sumProba candDist x)
264
265 ------------------------------------------------------------------------
266 candList :: [String]
267 candList = [ "france", "alexandre", "mael", "constitution"
268 , "etats-unis", "associes", "car", "train", "spam"]
269
270 --}