2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 -- import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Data.Functor.Reverse
48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
49 import Control.Monad (forM_)
51 import qualified Data.List as L
53 import Data.Text (Text)
54 import qualified Data.Text as T
56 import Data.Maybe (fromMaybe)
57 import qualified Data.Map as Map
58 import Gargantext.Prelude hiding (cs)
59 import qualified Gargantext.Prelude as GP
60 import qualified Data.Tree as Tree
61 import Data.Tree (Tree)
62 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
69 -- ^ TODO: only used for debugging
71 ------------------------------------------------------------------------
72 -- | Example and tests for development
75 , _info_entropy_var :: e
79 instance Show e => Show (I e) where
80 show (I e v n) = show (e, v, n)
84 type ModEntropy i o e = (e -> e) -> i -> o
86 set_autonomy :: ModEntropy e (I e) e
87 set_autonomy f e = I e e (f e)
89 set_entropy_var :: ModEntropy e (I e) e
90 set_entropy_var f e = I e (f e) e
93 data StartStop = Start | Stop
94 deriving (Ord, Eq, Show)
96 data Token = NonTerminal Text
98 deriving (Ord, Eq, Show)
100 isTerminal :: Token -> Bool
101 isTerminal (Terminal _) = True
102 isTerminal (NonTerminal _) = False
104 parseToken :: Text -> Token
105 parseToken "<start>" = Terminal Start
106 parseToken "<stop>" = Terminal Stop
107 parseToken t = NonTerminal t
110 -- >>> reverseTokens [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]
111 -- [Terminal Start,NonTerminal "york",NonTerminal "new",Terminal Stop]
112 reverseTokens :: [Token] -> [Token]
113 reverseTokens xs = case lastMay xs of
115 Just (Terminal Stop) -> reverseTokens' xs <> [Terminal Stop]
116 _ -> reverseTokens' xs
118 reverseTokens' :: [Token] -> [Token]
119 reverseTokens' [] = []
120 reverseTokens' [Terminal Stop] = []
121 reverseTokens' [x] = [x]
122 reverseTokens' (x:xs) = case x of
123 Terminal Start -> [Terminal Start] <> reverseTokens' xs
124 _ -> reverseTokens' xs <> [x]
127 toToken :: [Text] -> [Token]
128 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
130 printToken :: Token -> Text
133 f (NonTerminal x) = x
134 f (Terminal Start) = "<start>"
135 f (Terminal Stop) = "<stop>"
136 ------------------------------------------------------------------------
139 = Node { _node_count :: Int
141 , _node_children :: Map k (Trie k e)
143 | Leaf { _node_count :: Int }
148 insertTries :: Ord k => [[k]] -> Trie k ()
149 insertTries = L.foldr insertTrie emptyTrie
151 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
152 insertTrie [] n = n { _node_count = _node_count n +1}
153 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
154 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
156 f = Just . insertTrie xs . fromMaybe emptyTrie
158 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
159 -- emptyTrie = Node 0 mempty mempty
160 emptyTrie :: Trie k e
163 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
165 | Map.null children = Leaf c
166 | otherwise = Node c mempty children
168 -----------------------------
169 -- | Trie to Tree since Tree as nice print function
170 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
171 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
172 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
174 ------------------------------------------------------------------------
175 ------------------------------------------------------------------------
176 nan :: Floating e => e
179 noNaNs :: P.RealFloat e => [e] -> [e]
180 noNaNs = filter (not . P.isNaN)
182 updateIfDefined :: P.RealFloat e => e -> e -> e
183 updateIfDefined e0 e | P.isNaN e = e0
186 sim :: Entropy e => e -> e -> Bool
187 sim x y = x == y || (P.isNaN x && P.isNaN y)
189 subst :: Entropy e => (e, e) -> e -> e
190 subst (src, dst) x | sim src x = dst
193 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
194 entropyTrie _ (Leaf c) = Leaf c
195 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
197 e = sum $ map f $ Map.toList children
198 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
199 else - chc * P.logBase 2 chc
201 chc = fromIntegral (_node_count child) / fromIntegral c
202 ------------------------------------------------------------------------
203 normalizeLevel :: Entropy e => [e] -> e -> e
204 normalizeLevel = checkDiff (go . noNaNs)
207 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
209 -- go [] = panic "normalizeLevel: impossible"
210 -- trace "normalizeLevel"
212 go es = \e -> (e - m) / v
215 then trace ("normalizeLevel " <> show (e,m,v,es))
225 nodeChildren :: Trie k e -> Map k (Trie k e)
226 nodeChildren (Node _ _ cs) = cs
227 nodeChildren (Leaf _) = Map.empty
232 class IsTrie trie where
233 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
234 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
235 nodeChild :: Ord k => k -> trie k e -> trie k e
236 findTrie :: Ord k => [k] -> trie k e -> trie k e
239 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
240 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
242 instance IsTrie Trie where
243 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
245 nodeEntropy inE (Node _ e _) = e ^. inE
246 nodeEntropy _ (Leaf _) = nan
248 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
249 nodeChild _ (Leaf _) = emptyTrie
251 findTrie ks t = L.foldl (flip nodeChild) t ks
253 normalizeEntropy :: Entropy e
254 => Getting e i e -> ModEntropy i o e
255 -> Trie k i -> Trie k o
256 normalizeEntropy inE modE t = go (modE identity) level t
258 level = (entropyLevels inE t)
259 go _ [] _ = panic "normalizeEntropy' empty levels"
260 go _ _ (Leaf c) = Leaf c
261 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
262 go f (es : ess) (Node c i children)
263 -- | any (sim (i ^. inE)) es
264 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
266 -- = panic "NOT an elem"
270 This is only normalizing a node with respect to its brothers (unlike all the
271 nodes of the same level).
273 normalizeEntropy inE modE = go $ modE identity
275 go _ (Leaf c) = Leaf c
276 go f (Node c i children)
277 | Map.null children =
278 panic "normalizeEntropy: impossible"
280 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
282 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
284 ------------------------------------------------------------------------
286 levels :: Trie k e -> [[Trie k e]]
287 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
289 subForest :: Trie k e -> [Trie k e]
290 subForest (Leaf _) = []
291 subForest (Node _ _ children) = Map.elems children
293 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
294 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
296 ------------------------------------------------------------------------
298 data Tries k e = Tries
307 instance IsTrie Tries where
308 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
309 , _bwd = buildTrie to n (map reverse $ tts)
312 nodeEntropy inE (Tries fwd bwd) =
313 -- VETODO reverse the query for bwd here
314 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
315 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
317 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
319 -- TODO: here this is tempting to reverse but this is not always what we
320 -- want. See also nodeAutonomy.
321 -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
322 -- since recursivity of the function makes the reverse multiple times (I guess)
324 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
326 ------------------------------------------------------------------------
327 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
329 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
330 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
333 consRev xs xss = reverse xs : xss
335 go _ pref [] = [reverse pref]
336 go _ pref (Terminal Stop:_) = [reverse pref]
337 go t pref (Terminal Start:xs) = go t pref xs
339 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
341 then go xt (x:pref) xs
342 else consRev pref $ go xt0 [x] xs
347 -- ^ entropy of the current prefix
351 -- ^ entropy of the current prefix plus x
352 acc = ext > et + ext0
353 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
355 ne d t = if P.isNaN e then d else e
356 where e = nodeEntropy inE t
359 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
361 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
364 ------------------------------------------------------------------------
365 ------------------------------------------------------------------------
367 mainEleve :: Int -> [[Text]] -> [[[Text]]]
370 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
372 inp = toToken <$> input
373 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
376 chunkAlongEleve :: Int -> [a] -> [[a]]
377 chunkAlongEleve n xs = L.take n <$> L.tails xs
379 toToken' :: Int -> [[Text]] -> [[Token]]
380 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
382 ---------------------------------------------
383 set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
384 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
385 mapTree (\k -> modE $ entropy_var'' inE tries k) [] fwd
387 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
388 mapTree f k t = go f k t
390 go _ _ (Leaf c) = Leaf c
391 go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
393 entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
394 entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
396 fwd = (nodeEntropy inE (_fwd $ findTrie ng tries))
397 bwd = (nodeEntropy inE (_bwd $ findTrie (reverseTokens ng) tries))
399 ---------------------------------------------
400 -- | TODO remove function below after following bug fixed
401 -- | TODO entropy_var' /= entropy_var on "<start> token.."
402 entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
403 entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
404 , (nodeEntropy info_entropy (_bwd $ findTrie (reverseTokens ng) tries))
408 entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
409 entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
410 , (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
414 ntf = parseToken <$> ng
415 ntb = parseToken <$> reverse ng
417 ---------------------------------------------
419 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
420 testEleve debug n output checks = do
423 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
426 , cs <- chunkAlong m 1 <$> inp
431 --res = map (map printToken) . split identity fwd <$> inp
432 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
433 res = map (map printToken) . split info_autonomy nt <$> inp
435 P.putStrLn (show input)
436 -- forM_ pss (P.putStrLn . show)
438 P.putStrLn "Forward:"
441 P.putStrLn "Backward:"
445 forM_ (entropyLevels identity t'') $ \level ->
446 P.putStrLn $ " " <> show level
448 P.putStrLn "Normalized:"
451 P.putStrLn "Splitting:"
452 P.putStrLn $ show res
454 pure $ expected == res
457 out = T.words <$> output
458 expected = fmap (T.splitOn "-") <$> out
459 input = (T.splitOn "-" =<<) <$> out
460 inp = toToken <$> input
462 t :: Tries Token Double
463 t = buildTrie toToken' n input
464 & bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
465 -- TODO NP: this is a hack to set the bwd entropy of Start at NaN.
467 t'' :: Trie Token Double
468 t'' = set_entropy_vars identity (\e _i -> e) t
470 nt :: Trie Token (I Double)
471 nt = normalizeEntropy identity set_autonomy t''
473 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
474 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
478 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
479 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
481 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
482 let ns = parseToken <$> T.words ngram
483 nsb = parseToken <$> (reverse $ T.words ngram)
485 tvar = findTrie ns t''
488 P.putStrLn $ " " <> T.unpack ngram <> ":"
489 check (==) "count" count (_node_count tvar)
490 check sim "entropy_var" entropy (nodeEntropy identity tvar)
491 --check sim ("entropy_varOK") entropy (entropy_var (T.words ngram) nt)
492 --check sim "entropy" entropy (entropy_var' nt (parseToken <$> T.words ngram))
493 {- ^ FIXME 2 fun above should have same results (error in reverseToken):
496 FAIL entropy ref=NaN my=0.0
499 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt')
500 check sim "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
501 check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
503 printTrie :: Show e => Trie Token e -> IO ()
505 P.putStrLn . Tree.drawTree
507 . toTree (NonTerminal "")
509 -- | TODO real data is a list of tokenized sentences
510 example0, example1, example2, example3, example4, example5, example6 :: [Text]
511 example0 = ["New-York is New-York and New-York"]
512 example1 = ["to-be or not to-be"]
513 example2 = ["to-be-or not to-be-or NOT to-be and"]
514 example3 = example0 <> example0
515 -- > TEST: Should not have York New in the trie
516 example4 = ["a-b-c-d e a-b-c-d f"]
517 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
518 example6 = ["le-petit chat"
524 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
527 [("<start>", 1, nan, nan, nan, nan, 0.0)
528 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
529 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
530 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
531 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
532 --,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
536 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
537 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
538 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
539 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
540 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
541 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
542 ,("York <stop>", 1, nan, nan, nan, nan, nan)
544 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
545 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
546 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
547 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
548 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
549 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
550 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
551 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
558 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
559 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
560 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
561 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
562 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
563 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
564 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
571 [("example0", 2, example0, checks0)
572 ,("example1", 2, example1, [])
573 ,("example2", 3, example2, checks2)
574 ,("example3", 2, example3, [])
575 ,("example4", 4, example4, [])
576 ,("example5", 5, example5, [])
578 (\(name, n, ex, checks) -> do
579 P.putStrLn $ name <> " " <> show n
580 b <- testEleve False n ex checks
581 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"