2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 -- import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Data.Functor.Reverse
48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
49 import Control.Monad (forM_)
51 import qualified Data.List as L
53 import Data.Text (Text)
54 import qualified Data.Text as T
56 import Data.Maybe (fromMaybe)
57 import qualified Data.Map as Map
58 import Gargantext.Prelude hiding (cs)
59 import qualified Gargantext.Prelude as GP
60 import qualified Data.Tree as Tree
61 import Data.Tree (Tree)
62 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
69 -- ^ TODO: only used for debugging
71 ------------------------------------------------------------------------
72 -- | Example and tests for development
75 , _info_entropy_var :: e
79 instance Show e => Show (I e) where
80 show (I e v n) = show (e, v, n)
84 type ModEntropy i o e = (e -> e) -> i -> o
86 set_autonomy :: ModEntropy e (I e) e
87 set_autonomy f e = I e e (f e)
89 set_entropy_var :: ModEntropy e (I e) e
90 set_entropy_var f e = I e (f e) e
93 data StartStop = Start | Stop
94 deriving (Ord, Eq, Show)
96 data Token = NonTerminal Text
98 deriving (Ord, Eq, Show)
100 isTerminal :: Token -> Bool
101 isTerminal (Terminal _) = True
102 isTerminal (NonTerminal _) = False
104 parseToken :: Text -> Token
105 parseToken "<start>" = Terminal Start
106 parseToken "<stop>" = Terminal Stop
107 parseToken t = NonTerminal t
110 -- >>> reverseTokens [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]
111 -- [Terminal Start,NonTerminal "york",NonTerminal "new",Terminal Stop]
112 reverseTokens :: [Token] -> [Token]
113 reverseTokens xs = case lastMay xs of
115 Just (Terminal Stop) -> reverseTokens' xs <> [Terminal Stop]
116 _ -> reverseTokens' xs
118 reverseTokens' :: [Token] -> [Token]
119 reverseTokens' [] = []
120 reverseTokens' [Terminal Stop] = []
121 reverseTokens' [x] = [x]
122 reverseTokens' (x:xs) = case x of
123 Terminal Start -> [Terminal Start] <> reverseTokens' xs
124 _ -> reverseTokens' xs <> [x]
127 toToken :: [Text] -> [Token]
128 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
130 printToken :: Token -> Text
133 f (NonTerminal x) = x
134 f (Terminal Start) = "<start>"
135 f (Terminal Stop) = "<stop>"
136 ------------------------------------------------------------------------
139 = Node { _node_count :: Int
141 , _node_children :: Map k (Trie k e)
143 | Leaf { _node_count :: Int }
148 insertTries :: Ord k => [[k]] -> Trie k ()
149 insertTries = L.foldr insertTrie emptyTrie
151 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
152 insertTrie [] n = n { _node_count = _node_count n +1}
153 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
154 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
156 f = Just . insertTrie xs . fromMaybe emptyTrie
158 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
159 -- emptyTrie = Node 0 mempty mempty
160 emptyTrie :: Trie k e
163 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
165 | Map.null children = Leaf c
166 | otherwise = Node c mempty children
168 -----------------------------
169 -- | Trie to Tree since Tree as nice print function
170 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
171 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
172 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
174 ------------------------------------------------------------------------
175 ------------------------------------------------------------------------
176 nan :: Floating e => e
179 noNaNs :: P.RealFloat e => [e] -> [e]
180 noNaNs = filter (not . P.isNaN)
182 updateIfDefined :: P.RealFloat e => e -> e -> e
183 updateIfDefined e0 e | P.isNaN e = e0
186 sim :: Entropy e => e -> e -> Bool
187 sim x y = x == y || (P.isNaN x && P.isNaN y)
189 subst :: Entropy e => (e, e) -> e -> e
190 subst (src, dst) x | sim src x = dst
193 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
194 entropyTrie _ (Leaf c) = Leaf c
195 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
197 e = sum $ map f $ Map.toList children
198 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
199 else - chc * P.logBase 2 chc
201 chc = fromIntegral (_node_count child) / fromIntegral c
202 ------------------------------------------------------------------------
203 normalizeLevel :: Entropy e => e -> e -> e -> e -> e
204 normalizeLevel prev m v e = ((e - prev) - m) / v
208 nodeChildren :: Trie k e -> Map k (Trie k e)
209 nodeChildren (Node _ _ cs) = cs
210 nodeChildren (Leaf _) = Map.empty
215 class IsTrie trie where
216 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
217 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
218 nodeChild :: Ord k => k -> trie k e -> trie k e
219 findTrie :: Ord k => [k] -> trie k e -> trie k e
220 normalizeEntropy :: Entropy e
221 => Getting e i e -> ModEntropy i o e
222 -> trie k i -> trie k o
225 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
226 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
228 instance IsTrie Trie where
229 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
231 nodeEntropy inE (Node _ e _) = e ^. inE
232 nodeEntropy _ (Leaf _) = nan
234 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
235 nodeChild _ (Leaf _) = emptyTrie
237 findTrie ks t = L.foldl (flip nodeChild) t ks
239 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
241 go _ [] _ = panic "normalizeEntropy' empty levels"
242 go _ _ (Leaf c) = Leaf c
243 go f ((m, v, _) : ess) (Node c i children)
244 = Node c (f i) $ go (modE $ normalizeLevel (i ^. inE) m v) ess <$> children
248 This is only normalizing a node with respect to its brothers (unlike all the
249 nodes of the same level).
251 normalizeEntropy inE modE = go $ modE identity
253 go _ (Leaf c) = Leaf c
254 go f (Node c i children)
255 | Map.null children =
256 panic "normalizeEntropy: impossible"
258 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
260 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
262 ------------------------------------------------------------------------
264 levels :: Trie k e -> [[Trie k e]]
265 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
267 subForest :: Trie k e -> [Trie k e]
268 subForest (Leaf _) = []
269 subForest (Node _ _ children) = Map.elems children
271 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
272 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
274 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
275 normalizationLevels inE = fmap f . entropyLevels inE
277 f es = (mean es, deviation es, length es)
279 ------------------------------------------------------------------------
281 data Tries k e = Tries
290 instance IsTrie Tries where
291 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
292 , _bwd = buildTrie to n (map reverse $ tts)
295 nodeEntropy inE (Tries fwd bwd) =
296 -- VETODO reverse the query for bwd here
297 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
298 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
300 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
302 -- TODO: here this is tempting to reverse but this is not always what we
303 -- want. See also nodeAutonomy.
304 -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
305 -- since recursivity of the function makes the reverse multiple times (I guess)
307 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
309 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
311 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
312 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
314 ------------------------------------------------------------------------
315 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
317 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
318 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
321 consRev xs xss = reverse xs : xss
323 go _ pref [] = [reverse pref]
324 go _ pref (Terminal Stop:_) = [reverse pref]
325 go t pref (Terminal Start:xs) = go t pref xs
327 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
329 then go xt (x:pref) xs
330 else consRev pref $ go xt0 [x] xs
335 -- ^ entropy of the current prefix
339 -- ^ entropy of the current prefix plus x
340 acc = ext > et + ext0
341 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
343 ne d t = if P.isNaN e then d else e
344 where e = nodeEntropy inE t
347 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
349 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
352 ------------------------------------------------------------------------
353 ------------------------------------------------------------------------
355 mainEleve :: Int -> [[Text]] -> [[[Text]]]
358 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
360 inp = toToken <$> input
361 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
364 chunkAlongEleve :: Int -> [a] -> [[a]]
365 chunkAlongEleve n xs = L.take n <$> L.tails xs
367 toToken' :: Int -> [[Text]] -> [[Token]]
368 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
370 ---------------------------------------------
371 set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
372 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
373 mapTree (\k -> modE $ entropy_var'' inE tries k) [] fwd
375 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
376 mapTree f k t = go f k t
378 go _ _ (Leaf c) = Leaf c
379 go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
381 entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
382 entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
384 fwd = (nodeEntropy inE (_fwd $ findTrie ng tries))
385 bwd = (nodeEntropy inE (_bwd $ findTrie (reverseTokens ng) tries))
387 ---------------------------------------------
388 -- | TODO remove function below after following bug fixed
389 -- | TODO entropy_var' /= entropy_var on "<start> token.."
390 entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
391 entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
392 , (nodeEntropy info_entropy (_bwd $ findTrie (reverseTokens ng) tries))
396 entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
397 entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
398 , (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
402 ntf = parseToken <$> ng
403 ntb = parseToken <$> reverse ng
405 ---------------------------------------------
407 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
408 testEleve debug n output checks = do
411 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
414 , cs <- chunkAlong m 1 <$> inp
419 --res = map (map printToken) . split identity fwd <$> inp
420 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
421 res = map (map printToken) . split info_autonomy nt <$> inp
423 P.putStrLn (show input)
424 -- forM_ pss (P.putStrLn . show)
426 P.putStrLn "Forward:"
429 P.putStrLn "Backward:"
433 forM_ (normalizationLevels identity t'') $ \level ->
434 P.putStrLn $ " " <> show level
436 P.putStrLn "Entropy Var:"
439 P.putStrLn "Splitting:"
440 P.putStrLn $ show res
442 pure $ expected == res
445 out = T.words <$> output
446 expected = fmap (T.splitOn "-") <$> out
447 input = (T.splitOn "-" =<<) <$> out
448 inp = toToken <$> input
450 t :: Tries Token Double
451 t = buildTrie toToken' n input
452 & bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
453 -- TODO NP: this is a hack to set the bwd entropy of Start at NaN.
455 nt :: Tries Token (I Double)
456 nt = normalizeEntropy identity set_autonomy t
458 t'' :: Trie Token Double
459 t'' = set_entropy_vars info_autonomy (\e _i -> e) nt
461 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
462 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
466 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
467 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
469 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
470 let ns = parseToken <$> T.words ngram
471 nsb = parseToken <$> (reverse $ T.words ngram)
473 tvar = findTrie ns t''
476 P.putStrLn $ " " <> T.unpack ngram <> ":"
477 check (==) "count" count (_node_count tvar)
478 check sim "entropy_var" entropy (nodeEntropy identity tvar)
479 --check sim ("entropy_varOK") entropy (entropy_var (T.words ngram) nt)
480 --check sim "entropy" entropy (entropy_var' nt (parseToken <$> T.words ngram))
481 {- ^ FIXME 2 fun above should have same results (error in reverseToken):
484 FAIL entropy ref=NaN my=0.0
487 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt')
488 check sim "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
489 check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
491 printTrie :: Show e => Trie Token e -> IO ()
493 P.putStrLn . Tree.drawTree
495 . toTree (NonTerminal "")
497 -- | TODO real data is a list of tokenized sentences
498 example0, example1, example2, example3, example4, example5, example6 :: [Text]
499 example0 = ["New-York is New-York and New-York"]
500 example1 = ["to-be or not to-be"]
501 example2 = ["to-be-or not to-be-or NOT to-be and"]
502 example3 = example0 <> example0
503 -- > TEST: Should not have York New in the trie
504 example4 = ["a-b-c-d e a-b-c-d f"]
505 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
506 example6 = ["le-petit chat"
512 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
515 [("<start>", 1, nan, nan, nan, nan, 0.0)
516 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
517 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
518 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
519 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
520 --,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
524 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
525 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
526 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
527 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
528 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
529 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
530 ,("York <stop>", 1, nan, nan, nan, nan, nan)
532 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
533 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
534 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
535 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
536 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
537 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
538 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
539 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
546 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
547 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
548 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
549 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
550 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
551 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
552 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
559 [("example0", 2, example0, checks0)
560 ,("example1", 2, example1, [])
561 ,("example2", 3, example2, checks2)
562 ,("example3", 2, example3, [])
563 ,("example4", 4, example4, [])
564 ,("example5", 5, example5, [])
566 (\(name, n, ex, checks) -> do
567 P.putStrLn $ name <> " " <> show n
568 b <- testEleve False n ex checks
569 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"