{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervised Word segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

# Implementation of Unsupervised Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervised Word Segmentation: the case for Mandarin Chinese.
  Pierre Magistry, Benoît Sagot (Alpage, INRIA & Univ. Paris 7),
  Proceedings of the 50th Annual Meeting of the Association for
  Computational Linguistics, pages 383–387.
  [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for the current implementation:
- TODO fix normalization
- TODO extract longer ngrams (see the paper above, the Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test:
  import Gargantext.Text.Terms (extractTermsUnsupervised)
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ catMaybes
                             $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeFamilies      #-}

module Gargantext.Text.Eleve where

import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
import Control.Monad (forM_)
import Data.Ord (Ord)
import qualified Data.List as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy  :: e
  , _info_autonomy :: e
  }

instance Show e => Show (I e) where
  show (I e n) = show (e, n)

makeLenses ''I

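-- | A 'ModEntropy' i o e lifts a function on entropies (e -> e) into a
--   function from the old node annotation i to the new one o.
--   'set_autonomy' keeps the raw entropy and stores the transformed value
--   in the '_info_autonomy' field.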
type ModEntropy i o e = (e -> e) -> i -> o

set_autonomy :: ModEntropy e (I e) e
set_autonomy f e = I e (f e)

data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

parseToken :: Text -> Token
parseToken "<start>" = Terminal Start
parseToken "<stop>"  = Terminal Stop
parseToken t         = NonTerminal t


toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
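-- Example, following the definition above:
--   toToken ["new", "york"]
--     == [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]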

printToken :: Token -> Text
printToken = f
  where
    f (NonTerminal x)  = x
    f (Terminal Start) = "<start>"
    f (Terminal Stop)  = "<stop>"

------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count    :: Int }
  deriving (Show)

makeLenses ''Trie

insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie
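-- Example, following the definitions above: each node counts how many
-- inserted sequences pass through it.
--   insertTries [["a","b"], ["a","c"]]
--     root: count 2, child "a": count 2, leaves "b" and "c": count 1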

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

-----------------------------

-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------

nan :: Floating e => e
nan = 0 / 0

noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs = filter (not . P.isNaN)

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e

subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x | sim src x = dst
                   | otherwise = x

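-- | Annotate each 'Node' with the branching entropy of its children,
--   e = sum_k f(k) with p(k) = count(k) / count(node) and
--     f(k) = - p(k) * logBase 2 (p(k))          for a non-terminal child,
--     f(k) =   p(k) * logBase 2 (count(node))   for a terminal child,
--   i.e. each occurrence of a terminal is treated as a distinct outcome.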
entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
------------------------------------------------------------------------

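-- | Standardise an entropy against the entropies of its whole trie level:
--   subtract the level mean and divide by the level deviation
--   (NaNs are dropped from the reference list first).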
normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . noNaNs)
  where
    -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
    checkDiff = identity
    -- go [] = panic "normalizeLevel: impossible"
    -- trace "normalizeLevel"
    -- go [_] = identity
    go es = \e -> (e - m) / v
      {-
      in if P.isNaN e'
           then trace ("normalizeLevel " <> show (e,m,v,es)) e
           else e'
      -}
      where
        m = mean      es
        v = deviation es

{- Unused

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

-}

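-- | Operations shared by the plain 'Trie' and the forward/backward 'Tries':
--   building from sentences, reading a node's entropy through a getter,
--   moving to a child, following a path of keys, and normalizing entropies.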
class IsTrie trie where
  buildTrie   :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
  nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
  nodeChild   :: Ord k => k   -> trie k e -> trie k e
  findTrie    :: Ord k => [k] -> trie k e -> trie k e
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

-- UNUSED
--nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
--nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t

instance IsTrie Trie where
  buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
                                 nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
    where
      go _ []         _        = panic "normalizeEntropy: empty levels"
      go _ _          (Leaf c) = Leaf c
      -- go _ ([] : _) _ = panic "normalizeEntropy: empty level"
      go f (es : ess) (Node c i children)
        -- | any (sim (i ^. inE)) es
        = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
        -- | otherwise
        -- = panic "NOT an elem"


{-
  This only normalizes a node with respect to its siblings (not all the
  nodes of the same level).

  normalizeEntropy inE modE = go $ modE identity
    where
      go _ (Leaf c) = Leaf c
      go f (Node c i children)
        | Map.null children =
            panic "normalizeEntropy: impossible"
        | otherwise =
            Node c (f i) $ go (modE $ normalizeLevel es) <$> children
        where
          es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-}
------------------------------------------------------------------------

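-- | All the sub-tries grouped by depth, root level first.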
levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
  where
    subForest :: Trie k e -> [Trie k e]
    subForest (Leaf _)            = []
    subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels

------------------------------------------------------------------------

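-- | A pair of tries over the same corpus: '_fwd' is built on the sentences as
--   given, '_bwd' on the reversed sentences, so that both the forward and the
--   backward branching entropies of an ngram are available.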
data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

instance IsTrie Tries where
  buildTrie to n tts = Tries { _fwd = buildTrie to n tts
                             , _bwd = buildTrie to n (map reverse tts)
                             }

  nodeEntropy inE (Tries fwd bwd) =
    mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]

  findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
  --                                                     ^^
  -- TODO: here it is tempting to reverse the keys, but this is not always
  -- what we want. See also nodeAutonomy.

  nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)

------------------------------------------------------------------------
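-- | Greedy segmentation of a token stream, driven by the measure selected by
--   the lens (e.g. 'info_autonomy').  The current prefix is extended with the
--   next token x whenever
--     m (prefix ++ [x]) > m prefix + m [x]
--   (the 'acc' test below, NaN values defaulting to 0 via 'ne'); otherwise the
--   prefix is closed and a new segment starts at x.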
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _  []                   = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0)             = go (nodeChild x0 t0) [x0] xs0
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []                  = [reverse pref]
    go _ pref (Terminal Stop:_)   = [reverse pref]
    go t pref (Terminal Start:xs) = go t pref xs
    go t pref (x:xs) =
        -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
        if acc
          then go xt (x:pref) xs
          else consRev pref $ go xt0 [x] xs
      where
        xt   = nodeChild x t
        xt0  = nodeChild x t0
        et   = ne 0 t
        -- ^ entropy of the current prefix
        ext0 = ne 0 xt0
        -- ^ entropy of [x]
        ext  = ne 0 xt
        -- ^ entropy of the current prefix plus x
        acc  = ext > et + ext0
        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

    ne d t = if P.isNaN e then d else e
      where e = nodeEntropy inE t

{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------
------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve _ _ = []
{-
mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
-}

sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)

chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = L.take n <$> L.tails xs
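-- Example, following the definition above (note the shorter chunks at the end):
--   chunkAlongEleve 3 [1..5] == [[1,2,3],[2,3,4],[3,4,5],[4,5],[5],[]]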

toToken' :: Int -> [[Text]] -> [[Token]]
toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
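-- (Each sentence thus contributes every window of at most n + 2 consecutive
--  tokens of its <start>/<stop> delimited form, except the window that is
--  just [Terminal Stop].)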

testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
testEleve debug n output checks = do
  let
    {-
    pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
          | ps <- L.nub $ [ c
                          | m <- [1..n]
                          , cs <- chunkAlong m 1 <$> inp
                          , c <- cs
                          ]
          ]
    -}
    --res = map (map printToken) . split identity fwd <$> inp
    --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
    res = map (map printToken) . split info_autonomy nt <$> inp
  when debug $ do
    P.putStrLn (show input)
    -- forM_ pss (P.putStrLn . show)
    P.putStrLn ""
    P.putStrLn "Levels:"
    forM_ (entropyLevels identity (_fwd t)) $ \level ->
      P.putStrLn $ " " <> show level
    P.putStrLn ""
    P.putStrLn "Forward:"
    printTrie (_fwd nt)
    P.putStrLn ""
    P.putStrLn "Backward:"
    printTrie (_bwd nt)
    P.putStrLn ""
    P.putStrLn "Splitting:"
    P.putStrLn $ show res
  forM_ checks checker
  pure $ expected == res

  where
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken <$> input
    t        = buildTrie toToken' n input
    -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
    -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    nt       = normalizeEntropy identity set_autonomy t

    check f msg ref my =
      if f ref my
        then P.putStrLn $ " PASS " <> msg <> " " <> show ref
        else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my

    checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
      let ns  = parseToken <$> T.words ngram
          t'  = findTrie ns  nt
          nsb = parseToken <$> (reverse $ T.words ngram)
          tb' = findTrie nsb nt

      P.putStrLn $ " " <> T.unpack ngram <> ":"
      check (==) "count"   count   (_node_count (_fwd t'))
      check sim  "entropy" entropy (mean [ nodeEntropy info_entropy (_fwd t')
                                         , nodeEntropy info_entropy (_bwd tb') ])

      -- (nodeEntropy info_entropy t')
      check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
      check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
      check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))

    printTrie =
      P.putStrLn . Tree.drawTree
                 . fmap show
                 . toTree (NonTerminal "")

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- > TEST: Should not have York New in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
           ,"le-petit chien"
           ,"le-petit rat"
           ,"le gros rat"
           ]

checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]

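-- Expected values, one tuple per ngram, in the order destructured by
-- 'checker' above: (ngram, count, entropy, ev, autonomy, bwd_entropy,
-- fwd_entropy); the ev column is currently not checked.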
checks0 =
  [("<start>", 1, nan, nan, nan, nan, 0.0)
  ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
  ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
  ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
  ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
  ,("<stop>", 0, nan, nan, nan, 0.0, nan)

{-
  ,("<start> New", 1, nan, nan, nan, nan, 0.0)
  ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
  ,("York is", 1, 0.0, nan, nan, nan, 0.0)
  ,("is New", 1, 0.0, nan, nan, nan, 0.0)
  ,("York and", 1, 0.0, nan, nan, nan, 0.0)
  ,("and New", 1, 0.0, nan, nan, nan, 0.0)
  ,("York <stop>", 1, nan, nan, nan, nan, nan)

  ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
  ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
  ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
  ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
  ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
  ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
  ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
  ,("New York <stop>", 1, nan, nan, nan, nan, nan)
--}
  ]


checks2 =
  [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
  ,("be or", 2, 0.5, nan, nan, nan, 1.0)
  ,("or not", 1, 0.0, nan, nan, nan, 0.0)
  ,("not to", 1, 0.0, nan, nan, nan, 0.0)
  ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
  ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
  ,("be and", 1, 0.0, nan, nan, nan, 0.0)
  ]


runTests :: IO ()
runTests =
  forM_
    [("example0", 2, example0, checks0)
    ,("example1", 2, example1, [])
    ,("example2", 3, example2, checks2)
    ,("example3", 2, example3, [])
    ,("example4", 4, example4, [])
    ,("example5", 5, example5, [])
    ]
    (\(name, n, ex, checks) -> do
      P.putStrLn $ name <> " " <> show n
      b <- testEleve False n ex checks
      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
    )