{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervised Word Segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

# Implementation of Unsupervised Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervised Word Segmentation: the case for Mandarin Chinese.
  Pierre Magistry, Benoît Sagot (Alpage, INRIA & Univ. Paris 7).
  Proceedings of the 50th Annual Meeting of the Association for
  Computational Linguistics, pages 383–387.
  [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for the current implementation:
- TODO fix normalization
- TODO extract longer ngrams (see the paper above; the Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  import Gargantext.Text.Terms (extractTermsUnsupervised)
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ catMaybes
                             $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeFamilies      #-}

module Gargantext.Text.Eleve where

-- import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Data.Functor.Reverse
import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
import Control.Monad (forM_)
import Data.Ord (Ord)
import qualified Data.List as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Gargantext.Prelude as GP
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)

type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy     :: e
  , _info_entropy_var :: e
  , _info_autonomy    :: e
  }

instance Show e => Show (I e) where
  show (I e v n) = show (e, v, n)

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

set_autonomy :: ModEntropy e (I e) e
set_autonomy f e = I e e (f e)

set_entropy_var :: ModEntropy e (I e) e
set_entropy_var f e = I e (f e) e

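-- Reading note with a tiny illustration (not in the original source): a
-- @ModEntropy i o e@ upgrades an annotation @i@ to @o@ given a function on the
-- entropy @e@; 'set_autonomy' keeps the raw entropy and stores @f e@ as the
-- autonomy. The numbers below are made up:
--
-- >>> set_autonomy (* 2) (1.5 :: Double)
-- (1.5,1.5,3.0)
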
data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

parseToken :: Text -> Token
parseToken "<start>" = Terminal Start
parseToken "<stop>"  = Terminal Stop
parseToken t         = NonTerminal t

toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]

printToken :: Token -> Text
printToken = f
  where
    f (NonTerminal x)  = x
    f (Terminal Start) = "<start>"
    f (Terminal Stop)  = "<stop>"
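
-- Illustrative sketch (not in the original source): a sentence is wrapped in
-- the two terminal markers, and printToken/parseToken round-trip between the
-- textual and the Token representation.
--
-- >>> toToken ["New", "York"]
-- [Terminal Start,NonTerminal "New",NonTerminal "York",Terminal Stop]
--
-- >>> parseToken (printToken (Terminal Start))
-- Terminal Start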
------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count :: Int }
  deriving (Show)

makeLenses ''Trie

insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c + 1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

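-- Illustrative sketch (not part of the original module; the two-sentence
-- corpus below is made up): every inserted sequence increments the count of
-- each node on its path, so the root count is the number of sequences.
exampleTrie_eleve :: Trie Token ()
exampleTrie_eleve = insertTries (toToken <$> [["New", "York"], ["New", "Jersey"]])

-- >>> _node_count exampleTrie_eleve
-- 2
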
-----------------------------
-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------
nan :: Floating e => e
nan = 0 / 0

noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs = filter (not . P.isNaN)

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e

sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)

subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x | sim src x = dst
                   | otherwise = x

entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
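
-- Reading note (not in the original source): for a node visited c times whose
-- child k was visited c_k times, with p_k = c_k / c, the sum above adds
--   p_k * logBase 2 c        when k is a terminal (per `pred`),
--   - p_k * logBase 2 p_k    otherwise (the usual Shannon term),
-- i.e. the branching entropy of the continuations of the current prefix.
-- For instance, a node of count 4 with two non-terminal children of count 2
-- each gets e = 2 * (-(0.5) * logBase 2 0.5) = 1.0 bit.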
------------------------------------------------------------------------
normalizeLevel :: Entropy e => e -> e -> e -> e -> e
normalizeLevel prev m v e = ((e - prev) - m) / v

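-- Reading note (not in the original source): as used by normalizeEntropy
-- below, `prev` is the parent node's raw entropy and (m, v) are the mean and
-- deviation of the raw entropies of the node's level (see normalizationLevels),
-- so the result is a z-score of the entropy gain (e - prev). With made-up
-- numbers:
--
-- >>> normalizeLevel 0 1 0.5 (2 :: Double)
-- 2.0
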
{- Unused

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

-}

class IsTrie trie where
  buildTrie        :: Entropy e => [[Token]] -> trie Token e
  nodeEntropy      :: Entropy e => Getting e i e -> trie k i -> e
  nodeChild        :: Ord k => k -> trie k e -> trie k e
  findTrie         :: Ord k => [k] -> trie k e -> trie k e
  printTrie        :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

-- UNUSED
--nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
--nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t

instance IsTrie Trie where
  buildTrie ts = entropyTrie isTerminal $ insertTries ts

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  printTrie inE t = do
    P.putStrLn . Tree.drawTree
               . fmap show
               $ toTree (NonTerminal "") t
    P.putStrLn " Levels:"
    forM_ (normalizationLevels inE t) $ \level ->
      P.putStrLn $ " " <> show level

  normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
    where
      go _ _  (Leaf c) = Leaf c
      go _ [] _        = panic "normalizeEntropy' empty levels"
      go f ((m, v, _) : ess) (Node c i children)
        = Node c (f i) $ go (modE $ normalizeLevel (i ^. inE) m v) ess <$> children


{-
This only normalizes a node with respect to its siblings (rather than all the
nodes of the same level).

normalizeEntropy inE modE = go $ modE identity
  where
    go _ (Leaf c) = Leaf c
    go f (Node c i children)
      | Map.null children =
          panic "normalizeEntropy: impossible"
      | otherwise =
          Node c (f i) $ go (modE $ normalizeLevel es) <$> children
      where
        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-}
------------------------------------------------------------------------

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
  where
    subForest :: Trie k e -> [Trie k e]
    subForest (Leaf _)            = []
    subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels

normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
normalizationLevels inE = fmap f . entropyLevels inE
  where
    f es = (mean es, deviation es, length es)

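-- Reading note (not in the original source): `levels` lists the trie's nodes
-- breadth-first, one list per depth; `entropyLevels` keeps only the defined
-- (non-NaN) entropies of each depth, skipping the root level; and
-- `normalizationLevels` reduces each depth to (mean, deviation, size), the
-- per-level statistics consumed by normalizeEntropy via normalizeLevel.
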
------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

makeLenses ''Tries

instance IsTrie Tries where
  buildTrie tts = Tries { _fwd = buildTrie tts
                        , _bwd = buildTrie (reverse <$> tts)
                        }

  nodeEntropy inE (Tries fwd bwd) =
    -- VETODO reverse the query for bwd here
    -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
    mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]

  findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
  -- ^^
  -- TODO: it is tempting to reverse the query for bwd here, but this is not
  -- always what we want. See also nodeAutonomy.
  -- AD: I also tried reversing here and I confirm it gives unexpected results
  -- (whereas the VETODO FIX below is ok), since the recursion applies the
  -- reverse multiple times (I guess).

  nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

  printTrie inE (Tries fwd bwd) = do
    P.putStrLn "Forward:"
    printTrie inE fwd
    P.putStrLn ""
    P.putStrLn "Backward:"
    printTrie inE bwd

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)

------------------------------------------------------------------------
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _  []                   = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0)             = go (nodeChild x0 t0) [x0] xs0
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []                  = [reverse pref]
    go _ pref (Terminal Stop:_)   = [reverse pref]
    go t pref (Terminal Start:xs) = go t pref xs
    go t pref (x:xs) =
      -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
      if acc
        then go xt (x:pref) xs
        else consRev pref $ go xt0 [x] xs
      where
        xt   = nodeChild x t
        xt0  = nodeChild x t0
        et   = ne 0 t
        -- ^ entropy of the current prefix
        ext0 = ne 0 xt0
        -- ^ entropy of [x]
        ext  = ne 0 xt
        -- ^ entropy of the current prefix plus x
        acc  = ext > et + ext0
        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

    ne d t = if P.isNaN e then d else e
      where e = nodeEntropy inE t

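-- Reading note (not in the original source): `acc` extends the current prefix
-- with x only when the autonomy of the longer ngram beats the sum of the
-- autonomies of the prefix and of x alone (the aut(...) comparison above);
-- otherwise the prefix is emitted and a new one starts at x. For example0
-- below, testEleve expects "New-York is New-York and New-York" to be
-- segmented as [["New","York"],["is"],["New","York"],["and"],["New","York"]].
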
{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------
------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve _ _ = []
{-
mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
-}

chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = L.take n <$> L.tails xs
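
-- Illustrative sketch (not in the original source): every suffix of the input
-- truncated to length n, including the shorter trailing suffixes and the empty
-- one.
--
-- >>> chunkAlongEleve 3 [1 .. 5 :: Int]
-- [[1,2,3],[2,3,4],[3,4,5],[4,5],[5],[]]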

toToken' :: Int -> [[Text]] -> [[Token]]
toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input

---------------------------------------------
set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
set_entropy_vars inE modE tries@(Tries fwd _bwd) =
  mapTree (\k -> modE $ entropy_var'' inE tries k) [] fwd

mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
mapTree f k t = go f k t
  where
    go _ _ (Leaf c)            = Leaf c
    go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k' -> go f (k <> [k'])) children)

entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
  where
    fwd = nodeEntropy inE (_fwd $ findTrie ng           tries)
    bwd = nodeEntropy inE (_bwd $ findTrie (reverse ng) tries)

---------------------------------------------
-- | TODO remove the function below once the following bug is fixed
-- | TODO entropy_var' /= entropy_var on "<start> token.."
entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
entropy_var' tries ng = mean $ noNaNs [ nodeEntropy info_entropy (_fwd $ findTrie ng           tries)
                                      , nodeEntropy info_entropy (_bwd $ findTrie (reverse ng) tries)
                                      ]

entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
entropy_var ng trie = mean [ nodeEntropy info_entropy (_fwd $ findTrie ntf trie)
                           , nodeEntropy info_entropy (_bwd $ findTrie ntb trie)
                           ]
  where
    ntf = parseToken <$> ng
    ntb = parseToken <$> reverse ng

---------------------------------------------

testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
testEleve debug n output checks = do
  let
    {-
    pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
          | ps <- L.nub $ [ c
                          | m <- [1..n]
                          , cs <- chunkAlong m 1 <$> inp
                          , c <- cs
                          ]
          ]
    -}
    --res = map (map printToken) . split identity fwd <$> inp
    --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
    res = map (map printToken) . split info_autonomy nt <$> inp
  when debug $ do
    P.putStrLn (show input)
    -- forM_ pss (P.putStrLn . show)
    P.putStrLn ""
    printTrie info_entropy nt
    P.putStrLn ""
    P.putStrLn "Entropy Var:"
    printTrie identity t''
    P.putStrLn ""
    P.putStrLn "Splitting:"
    P.putStrLn $ show res
  forM_ checks checker
  pure $ expected == res

  where
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken <$> input

    t :: Tries Token Double
    t = buildTrie (toToken' n input)
          & bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
          -- TODO NP: this is a hack to set the bwd entropy of Start at NaN.

    nt :: Tries Token (I Double)
    nt = normalizeEntropy identity set_autonomy t

    t'' :: Trie Token Double
    t'' = set_entropy_vars info_autonomy (\e _i -> e) nt

    -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
    -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt

    check f msg ref my =
      if f ref my
        then P.putStrLn $ " PASS " <> msg <> " " <> show ref
        else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my

    checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
      let ns   = parseToken <$> T.words ngram
          nsb  = parseToken <$> (reverse $ T.words ngram)
          t'   = findTrie ns t
          tvar = findTrie ns t''
          nt'  = findTrie ns nt

      P.putStrLn $ " " <> T.unpack ngram <> ":"
      check (==) "count"       count   (_node_count tvar)
      check sim  "entropy_var" entropy (nodeEntropy identity tvar)
      --check sim ("entropy_varOK") entropy (entropy_var (T.words ngram) nt)
      --check sim "entropy" entropy (entropy_var' nt (parseToken <$> T.words ngram))
      {- ^ FIXME: the two functions above should give the same results (error in reverseToken):
           <start> New York:
             PASS count 1
             FAIL entropy ref=NaN my=0.0
      -}

      check sim "autonomy"    autonomy    (nodeEntropy info_autonomy nt')
      check sim "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
      check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- > TEST: should not have "York New" in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
           ,"le-petit chien"
           ,"le-petit rat"
           ,"le gros rat"
           ]

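-- Reading note (not in the original source): each tuple below is consumed by
-- `checker` in testEleve as
--   (ngram, count, entropy (checked as "entropy_var"), _ev (currently unused),
--    autonomy, bwd_entropy, fwd_entropy)
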
checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]

checks0 =
  [("<start>", 1, nan, nan, nan, nan, 0.0)
  ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
  ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
  ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
  ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
  --,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie, there is
  -- no need to count it.

  --{-
  ,("<start> New", 1, nan, nan, nan, nan, 0.0)
  ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
  ,("York is", 1, 0.0, nan, nan, nan, 0.0)
  ,("is New", 1, 0.0, nan, nan, nan, 0.0)
  ,("York and", 1, 0.0, nan, nan, nan, 0.0)
  ,("and New", 1, 0.0, nan, nan, nan, 0.0)
  ,("York <stop>", 1, nan, nan, nan, nan, nan)

  ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
  ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
  ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
  ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
  ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
  ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
  ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
  ,("New York <stop>", 1, nan, nan, nan, nan, nan)
  --}
  ]


checks2 =
  [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
  ,("be or", 2, 0.5, nan, nan, nan, 1.0)
  ,("or not", 1, 0.0, nan, nan, nan, 0.0)
  ,("not to", 1, 0.0, nan, nan, nan, 0.0)
  ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
  ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
  ,("be and", 1, 0.0, nan, nan, nan, 0.0)
  ]

runTests :: IO ()
runTests =
  forM_
    [("example0", 2, example0, checks0)
    ,("example1", 2, example1, [])
    ,("example2", 3, example2, checks2)
    ,("example3", 2, example3, [])
    ,("example4", 4, example4, [])
    ,("example5", 5, example5, [])
    ]
    (\(name, n, ex, checks) -> do
      P.putStrLn $ name <> " " <> show n
      b <- testEleve False n ex checks
      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
    )
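
-- Usage sketch (not in the original source): from GHCi, `runTests` prints each
-- example name with its n, runs the per-ngram checks when a check table is
-- provided (checks0, checks2), and reports whether the splitting matches the
-- expected segmentation encoded by the dashes in the examples.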