2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Data.Functor.Reverse
48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
49 import Control.Monad (forM_)
51 import qualified Data.List as L
53 import Data.Text (Text)
54 import qualified Data.Text as T
56 import Data.Maybe (fromMaybe)
57 import qualified Data.Map as Map
58 import Gargantext.Prelude hiding (cs)
59 import qualified Gargantext.Prelude as GP
60 import qualified Data.Tree as Tree
61 import Data.Tree (Tree)
62 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
69 -- ^ TODO: only used for debugging
71 ------------------------------------------------------------------------
72 -- | Example and tests for development
75 , _info_entropy_var :: e
79 instance Show e => Show (I e) where
80 show (I e v n) = show (e, v, n)
84 type ModEntropy i o e = (e -> e) -> i -> o
86 set_autonomy :: ModEntropy e (I e) e
87 set_autonomy f e = I e e (f e)
89 set_entropy_var :: ModEntropy e (I e) e
90 set_entropy_var f e = I e (f e) e
93 data StartStop = Start | Stop
94 deriving (Ord, Eq, Show)
96 data Token = NonTerminal Text
98 deriving (Ord, Eq, Show)
100 isTerminal :: Token -> Bool
101 isTerminal (Terminal _) = True
102 isTerminal (NonTerminal _) = False
104 parseToken :: Text -> Token
105 parseToken "<start>" = Terminal Start
106 parseToken "<stop>" = Terminal Stop
107 parseToken t = NonTerminal t
110 -- >>> reverseTokens [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]
111 -- [Terminal Start,NonTerminal "york",NonTerminal "new",Terminal Stop]
112 reverseTokens :: [Token] -> [Token]
113 reverseTokens xs = case lastMay xs of
115 Just (Terminal Stop) -> reverseTokens' xs <> [Terminal Stop]
116 _ -> reverseTokens' xs
118 reverseTokens' :: [Token] -> [Token]
119 reverseTokens' [] = []
120 reverseTokens' [Terminal Stop] = []
121 reverseTokens' [x] = [x]
122 reverseTokens' (x:xs) = case x of
123 Terminal Start -> [Terminal Start] <> reverseTokens' xs
124 _ -> reverseTokens' xs <> [x]
127 toToken :: [Text] -> [Token]
128 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
130 printToken :: Token -> Text
133 f (NonTerminal x) = x
134 f (Terminal Start) = "<start>"
135 f (Terminal Stop) = "<stop>"
136 ------------------------------------------------------------------------
139 = Node { _node_count :: Int
141 , _node_children :: Map k (Trie k e)
143 | Leaf { _node_count :: Int }
148 insertTries :: Ord k => [[k]] -> Trie k ()
149 insertTries = L.foldr insertTrie emptyTrie
151 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
152 insertTrie [] n = n { _node_count = _node_count n +1}
153 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
154 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
156 f = Just . insertTrie xs . fromMaybe emptyTrie
158 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
159 -- emptyTrie = Node 0 mempty mempty
160 emptyTrie :: Trie k e
163 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
165 | Map.null children = Leaf c
166 | otherwise = Node c mempty children
168 -----------------------------
169 -- | Trie to Tree since Tree as nice print function
170 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
171 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
172 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
174 ------------------------------------------------------------------------
175 ------------------------------------------------------------------------
176 nan :: Floating e => e
179 noNaNs :: P.RealFloat e => [e] -> [e]
180 noNaNs = filter (not . P.isNaN)
182 updateIfDefined :: P.RealFloat e => e -> e -> e
183 updateIfDefined e0 e | P.isNaN e = e0
186 sim :: Entropy e => e -> e -> Bool
187 sim x y = x == y || (P.isNaN x && P.isNaN y)
189 subst :: Entropy e => (e, e) -> e -> e
190 subst (src, dst) x | sim src x = dst
193 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
194 entropyTrie _ (Leaf c) = Leaf c
195 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
197 e = sum $ map f $ Map.toList children
198 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
199 else - chc * P.logBase 2 chc
201 chc = fromIntegral (_node_count child) / fromIntegral c
202 ------------------------------------------------------------------------
203 normalizeLevel :: Entropy e => [e] -> e -> e
204 normalizeLevel = checkDiff (go . noNaNs)
207 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
209 -- go [] = panic "normalizeLevel: impossible"
210 -- trace "normalizeLevel"
212 go es = \e -> (e - m) / v
215 then trace ("normalizeLevel " <> show (e,m,v,es))
225 nodeChildren :: Trie k e -> Map k (Trie k e)
226 nodeChildren (Node _ _ cs) = cs
227 nodeChildren (Leaf _) = Map.empty
232 class IsTrie trie where
233 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
234 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
235 nodeChild :: Ord k => k -> trie k e -> trie k e
236 findTrie :: Ord k => [k] -> trie k e -> trie k e
237 normalizeEntropy :: Entropy e
238 => Getting e i e -> ModEntropy i o e
239 -> trie k i -> trie k o
242 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
243 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
245 instance IsTrie Trie where
246 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
248 nodeEntropy inE (Node _ e _) = e ^. inE
249 nodeEntropy _ (Leaf _) = nan
251 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
252 nodeChild _ (Leaf _) = emptyTrie
254 findTrie ks t = L.foldl (flip nodeChild) t ks
256 normalizeEntropy inE modE t = trace (show level) $ go (modE identity) level t
258 level = (entropyLevels inE t)
259 go _ [] _ = panic "normalizeEntropy' empty levels"
260 go _ _ (Leaf c) = Leaf c
261 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
262 go f (es : ess) (Node c i children)
263 -- | any (sim (i ^. inE)) es
264 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
266 -- = panic "NOT an elem"
270 This is only normalizing a node with respect to its brothers (unlike all the
271 nodes of the same level).
273 normalizeEntropy inE modE = go $ modE identity
275 go _ (Leaf c) = Leaf c
276 go f (Node c i children)
277 | Map.null children =
278 panic "normalizeEntropy: impossible"
280 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
282 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
284 ------------------------------------------------------------------------
286 levels :: Trie k e -> [[Trie k e]]
287 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
289 subForest :: Trie k e -> [Trie k e]
290 subForest (Leaf _) = []
291 subForest (Node _ _ children) = Map.elems children
293 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
294 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
296 ------------------------------------------------------------------------
298 data Tries k e = Tries
307 instance IsTrie Tries where
308 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
309 , _bwd = buildTrie to n (map reverse $ tts)
312 nodeEntropy inE (Tries fwd bwd) =
313 -- VETODO reverse the query for bwd here
314 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
315 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
317 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
319 -- TODO: here this is tempting to reverse but this is not always what we
320 -- want. See also nodeAutonomy.
321 -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
322 -- since recursivity of the function makes the reverse multiple times (I guess)
324 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
326 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
328 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
329 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
331 ------------------------------------------------------------------------
332 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
334 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
335 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
338 consRev xs xss = reverse xs : xss
340 go _ pref [] = [reverse pref]
341 go _ pref (Terminal Stop:_) = [reverse pref]
342 go t pref (Terminal Start:xs) = go t pref xs
344 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
346 then go xt (x:pref) xs
347 else consRev pref $ go xt0 [x] xs
352 -- ^ entropy of the current prefix
356 -- ^ entropy of the current prefix plus x
357 acc = ext > et + ext0
358 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
360 ne d t = if P.isNaN e then d else e
361 where e = nodeEntropy inE t
364 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
366 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
369 ------------------------------------------------------------------------
370 ------------------------------------------------------------------------
372 mainEleve :: Int -> [[Text]] -> [[[Text]]]
375 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
377 inp = toToken <$> input
378 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
381 chunkAlongEleve :: Int -> [a] -> [[a]]
382 chunkAlongEleve n xs = L.take n <$> L.tails xs
384 toToken' :: Int -> [[Text]] -> [[Token]]
385 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
387 ---------------------------------------------
388 set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
389 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
390 mapTree (\k -> modE $ entropy_var'' inE tries k) [] fwd
392 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
393 mapTree f k t = go f k t
395 go _ _ (Leaf c) = Leaf c
396 go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
398 entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
399 entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
401 fwd = (nodeEntropy inE (_fwd $ findTrie ng tries))
402 bwd = (nodeEntropy inE (_bwd $ findTrie (reverseTokens ng) tries))
404 ---------------------------------------------
405 -- | TODO remove function below after following bug fixed
406 -- | TODO entropy_var' /= entropy_var on "<start> token.."
407 entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
408 entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
409 , (nodeEntropy info_entropy (_bwd $ findTrie (reverseTokens ng) tries))
413 entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
414 entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
415 , (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
419 ntf = parseToken <$> ng
420 ntb = parseToken <$> reverse ng
422 ---------------------------------------------
424 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
425 testEleve debug n output checks = do
428 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
431 , cs <- chunkAlong m 1 <$> inp
436 --res = map (map printToken) . split identity fwd <$> inp
437 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
438 res = map (map printToken) . split info_autonomy nt <$> inp
440 P.putStrLn (show input)
441 -- forM_ pss (P.putStrLn . show)
444 forM_ (entropyLevels identity (_fwd t)) $ \level ->
445 P.putStrLn $ " " <> show level
447 P.putStrLn "Forward:"
450 P.putStrLn "Backward:"
453 P.putStrLn "Normalized:"
456 P.putStrLn "Splitting:"
457 P.putStrLn $ show res
459 pure $ expected == res
462 out = T.words <$> output
463 expected = fmap (T.splitOn "-") <$> out
464 input = (T.splitOn "-" =<<) <$> out
465 inp = toToken <$> input
467 t :: Tries Token Double
468 t = buildTrie toToken' n input
469 & bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
470 -- TODO NP: this is a hack to set the bwd entropy of Start at NaN.
472 t'' :: Trie Token Double
473 t'' = set_entropy_vars identity (\e _i -> e) t
475 nt :: Trie Token (I Double)
476 nt = normalizeEntropy identity set_autonomy t''
478 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
479 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
483 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
484 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
486 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
487 let ns = parseToken <$> T.words ngram
488 nsb = parseToken <$> (reverse $ T.words ngram)
490 tvar = findTrie ns t''
492 P.putStrLn $ " " <> T.unpack ngram <> ":"
493 check (==) "count" count (_node_count tvar)
494 check sim "entropy_var" entropy (nodeEntropy identity tvar)
495 --check sim ("entropy_varOK") entropy (entropy_var (T.words ngram) nt)
496 --check sim "entropy" entropy (entropy_var' nt (parseToken <$> T.words ngram))
497 {- ^ FIXME 2 fun above should have same results (error in reverseToken):
500 FAIL entropy ref=NaN my=0.0
503 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt)
504 check sim "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
505 check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
507 printTrie :: Show e => Trie Token e -> IO ()
509 P.putStrLn . Tree.drawTree
511 . toTree (NonTerminal "")
513 -- | TODO real data is a list of tokenized sentences
514 example0, example1, example2, example3, example4, example5, example6 :: [Text]
515 example0 = ["New-York is New-York and New-York"]
516 example1 = ["to-be or not to-be"]
517 example2 = ["to-be-or not to-be-or NOT to-be and"]
518 example3 = example0 <> example0
519 -- > TEST: Should not have York New in the trie
520 example4 = ["a-b-c-d e a-b-c-d f"]
521 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
522 example6 = ["le-petit chat"
528 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
531 [("<start>", 1, nan, nan, nan, nan, 0.0)
532 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
533 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
534 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
535 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
536 --,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
540 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
541 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
542 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
543 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
544 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
545 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
546 ,("York <stop>", 1, nan, nan, nan, nan, nan)
548 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
549 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
550 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
551 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
552 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
553 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
554 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
555 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
562 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
563 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
564 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
565 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
566 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
567 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
568 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
575 [("example0", 2, example0, checks0)
576 ,("example1", 2, example1, [])
577 ,("example2", 3, example2, checks2)
578 ,("example3", 2, example3, [])
579 ,("example4", 4, example4, [])
580 ,("example5", 5, example5, [])
582 (\(name, n, ex, checks) -> do
583 P.putStrLn $ name <> " " <> show n
584 b <- testEleve False n ex checks
585 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"