2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Data.Functor.Reverse
48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
49 import Control.Monad (forM_)
51 import qualified Data.List as L
53 import Data.Text (Text)
54 import qualified Data.Text as T
56 import Data.Maybe (fromMaybe)
57 import qualified Data.Map as Map
58 import Gargantext.Prelude hiding (cs)
59 import qualified Gargantext.Prelude as GP
60 import qualified Data.Tree as Tree
61 import Data.Tree (Tree)
62 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
69 -- ^ TODO: only used for debugging
71 ------------------------------------------------------------------------
72 -- | Example and tests for development
75 , _info_entropy_var :: e
79 instance Show e => Show (I e) where
80 show (I e v n) = show (e, v, n)
84 type ModEntropy i o e = (e -> e) -> i -> o
86 set_autonomy :: ModEntropy e (I e) e
87 set_autonomy f e = I e e (f e)
89 set_entropy_var :: ModEntropy e (I e) e
90 set_entropy_var f e = I e (f e) e
93 data StartStop = Start | Stop
94 deriving (Ord, Eq, Show)
96 data Token = NonTerminal Text
98 deriving (Ord, Eq, Show)
101 rootTrie = NonTerminal ""
103 isTerminal :: Token -> Bool
104 isTerminal (Terminal _) = True
105 isTerminal (NonTerminal _) = False
107 parseToken :: Text -> Token
108 parseToken "<start>" = Terminal Start
109 parseToken "<stop>" = Terminal Stop
110 parseToken t = NonTerminal t
113 -- >>> reverseTokens [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]
114 -- [Terminal Start,NonTerminal "york",NonTerminal "new",Terminal Stop]
115 reverseTokens :: [Token] -> [Token]
116 reverseTokens xs = case lastMay xs of
118 Just (Terminal Stop) -> reverseTokens' xs <> [Terminal Stop]
119 _ -> reverseTokens' xs
121 reverseTokens' :: [Token] -> [Token]
122 reverseTokens' [] = []
123 reverseTokens' [Terminal Stop] = []
124 reverseTokens' [x] = [x]
125 reverseTokens' (x:xs) = case x of
126 Terminal Start -> [Terminal Start] <> reverseTokens' xs
127 _ -> reverseTokens' xs <> [x]
130 toToken :: [Text] -> [Token]
131 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
133 printToken :: Token -> Text
136 f (NonTerminal x) = x
137 f (Terminal Start) = "<start>"
138 f (Terminal Stop) = "<stop>"
139 ------------------------------------------------------------------------
142 = Node { _node_count :: Int
144 , _node_children :: Map k (Trie k e)
146 | Leaf { _node_count :: Int }
151 insertTries :: Ord k => [[k]] -> Trie k ()
152 insertTries = L.foldr insertTrie emptyTrie
154 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
155 insertTrie [] n = n { _node_count = _node_count n +1}
156 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
157 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
159 f = Just . insertTrie xs . fromMaybe emptyTrie
161 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
162 -- emptyTrie = Node 0 mempty mempty
163 emptyTrie :: Trie k e
166 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
168 | Map.null children = Leaf c
169 | otherwise = Node c mempty children
171 -----------------------------
172 -- | Trie to Tree since Tree as nice print function
173 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
174 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
175 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
177 ------------------------------------------------------------------------
178 ------------------------------------------------------------------------
179 nan :: Floating e => e
182 noNaNs :: P.RealFloat e => [e] -> [e]
183 noNaNs = filter (not . P.isNaN)
185 updateIfDefined :: P.RealFloat e => e -> e -> e
186 updateIfDefined e0 e | P.isNaN e = e0
189 sim :: Entropy e => e -> e -> Bool
190 sim x y = x == y || (P.isNaN x && P.isNaN y)
192 subst :: Entropy e => (e, e) -> e -> e
193 subst (src, dst) x | sim src x = dst
196 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
197 entropyTrie _ (Leaf c) = Leaf c
198 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
200 e = sum $ map f $ Map.toList children
201 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
202 else - chc * P.logBase 2 chc
204 chc = fromIntegral (_node_count child) / fromIntegral c
205 ------------------------------------------------------------------------
206 normalizeLevel :: Entropy e => [e] -> e -> e
207 normalizeLevel = checkDiff (go . noNaNs)
210 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
212 -- go [] = panic "normalizeLevel: impossible"
213 -- trace "normalizeLevel"
215 go es = \e -> (e - m) / v
218 then trace ("normalizeLevel " <> show (e,m,v,es))
228 nodeChildren :: Trie k e -> Map k (Trie k e)
229 nodeChildren (Node _ _ cs) = cs
230 nodeChildren (Leaf _) = Map.empty
235 class IsTrie trie where
236 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
237 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
238 nodeChild :: Ord k => k -> trie k e -> trie k e
239 findTrie :: Ord k => [k] -> trie k e -> trie k e
240 normalizeEntropy :: Entropy e
241 => Getting e i e -> ModEntropy i o e
242 -> trie k i -> trie k o
245 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
246 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
248 instance IsTrie Trie where
249 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
251 nodeEntropy inE (Node _ e _) = e ^. inE
252 nodeEntropy _ (Leaf _) = nan
254 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
255 nodeChild _ (Leaf _) = emptyTrie
257 findTrie ks t = L.foldl (flip nodeChild) t ks
259 normalizeEntropy inE modE t = trace (show level) $ go (modE identity) level t
261 level = (entropyLevels inE t)
262 go _ [] _ = panic "normalizeEntropy' empty levels"
263 go _ _ (Leaf c) = Leaf c
264 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
265 go f (es : ess) (Node c i children)
266 -- | any (sim (i ^. inE)) es
267 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
269 -- = panic "NOT an elem"
273 This is only normalizing a node with respect to its brothers (unlike all the
274 nodes of the same level).
276 normalizeEntropy inE modE = go $ modE identity
278 go _ (Leaf c) = Leaf c
279 go f (Node c i children)
280 | Map.null children =
281 panic "normalizeEntropy: impossible"
283 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
285 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
287 ------------------------------------------------------------------------
289 levels :: Trie k e -> [[Trie k e]]
290 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
292 subForest :: Trie k e -> [Trie k e]
293 subForest (Leaf _) = []
294 subForest (Node _ _ children) = Map.elems children
296 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
297 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
299 ------------------------------------------------------------------------
301 data Tries k e = Tries
310 instance IsTrie Tries where
311 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
312 , _bwd = buildTrie to n (map reverse $ tts)
315 nodeEntropy inE (Tries fwd bwd) =
316 -- VETODO reverse the query for bwd here
317 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
318 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
320 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
322 -- TODO: here this is tempting to reverse but this is not always what we
323 -- want. See also nodeAutonomy.
324 -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
325 -- since recursivity of the function makes the reverse multiple times (I guess)
327 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
329 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
331 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
332 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
334 ------------------------------------------------------------------------
335 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
337 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
338 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
341 consRev xs xss = reverse xs : xss
343 go _ pref [] = [reverse pref]
344 go _ pref (Terminal Stop:_) = [reverse pref]
345 go t pref (Terminal Start:xs) = go t pref xs
347 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
349 then go xt (x:pref) xs
350 else consRev pref $ go xt0 [x] xs
355 -- ^ entropy of the current prefix
359 -- ^ entropy of the current prefix plus x
360 acc = ext > et + ext0
361 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
363 ne d t = if P.isNaN e then d else e
364 where e = nodeEntropy inE t
367 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
369 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
372 ------------------------------------------------------------------------
373 ------------------------------------------------------------------------
375 mainEleve :: Int -> [[Text]] -> [[[Text]]]
378 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
380 inp = toToken <$> input
381 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
384 chunkAlongEleve :: Int -> [a] -> [[a]]
385 chunkAlongEleve n xs = L.take n <$> L.tails xs
387 toToken' :: Int -> [[Text]] -> [[Token]]
388 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
390 ---------------------------------------------
391 set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
392 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
393 mapTree (\k -> modE $ entropy_var'' inE tries k) [rootTrie] fwd
395 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
396 mapTree f k t = go f k t
398 go _ _ (Leaf c) = Leaf c
399 go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (filter (/= rootTrie) $ k <> [k'])) children)
401 entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
402 entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
404 fwd = (nodeEntropy inE (_fwd $ findTrie ng tries))
405 bwd = (nodeEntropy inE (_bwd $ findTrie (reverseTokens ng) tries))
407 ---------------------------------------------
408 -- | TODO remove function below after following bug fixed
409 -- | TODO entropy_var' /= entropy_var on "<start> token.."
410 entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
411 entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
412 , (nodeEntropy info_entropy (_bwd $ findTrie (reverseTokens ng) tries))
416 entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
417 entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
418 , (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
422 ntf = parseToken <$> ng
423 ntb = parseToken <$> reverse ng
425 ---------------------------------------------
427 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
428 testEleve debug n output checks = do
431 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
434 , cs <- chunkAlong m 1 <$> inp
439 --res = map (map printToken) . split identity fwd <$> inp
440 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
441 res = map (map printToken) . split info_autonomy nt <$> inp
443 P.putStrLn (show input)
444 -- forM_ pss (P.putStrLn . show)
447 forM_ (entropyLevels identity (_fwd t)) $ \level ->
448 P.putStrLn $ " " <> show level
450 P.putStrLn "Forward:"
453 P.putStrLn "Backward:"
456 P.putStrLn "Normalized:"
459 P.putStrLn "Splitting:"
460 P.putStrLn $ show res
462 pure $ expected == res
465 out = T.words <$> output
466 expected = fmap (T.splitOn "-") <$> out
467 input = (T.splitOn "-" =<<) <$> out
468 inp = toToken <$> input
470 t :: Tries Token Double
471 t = buildTrie toToken' n input
472 & bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
473 -- TODO NP: this is a hack to set the bwd entropy of Start at NaN.
475 t'' :: Trie Token Double
476 t'' = set_entropy_vars identity (\e _i -> e) t
478 nt :: Trie Token (I Double)
479 nt = normalizeEntropy identity set_autonomy t''
481 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
482 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
486 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
487 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
489 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
490 let ns = parseToken <$> T.words ngram
491 nsb = parseToken <$> (reverse $ T.words ngram)
493 tvar = findTrie ns t''
495 P.putStrLn $ " " <> T.unpack ngram <> ":"
496 check (==) "count" count (_node_count tvar)
497 check sim "entropy_var" entropy (nodeEntropy identity tvar)
498 --check sim ("entropy_varOK") entropy (entropy_var (T.words ngram) nt)
499 --check sim "entropy" entropy (entropy_var' nt (parseToken <$> T.words ngram))
500 {- ^ FIXME 2 fun above should have same results (error in reverseToken):
503 FAIL entropy ref=NaN my=0.0
506 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt)
507 check sim "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
508 check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
510 printTrie :: Show e => Trie Token e -> IO ()
512 P.putStrLn . Tree.drawTree
516 -- | TODO real data is a list of tokenized sentences
517 example0, example1, example2, example3, example4, example5, example6 :: [Text]
518 example0 = ["New-York is New-York and New-York"]
519 example1 = ["to-be or not to-be"]
520 example2 = ["to-be-or not to-be-or NOT to-be and"]
521 example3 = example0 <> example0
522 -- > TEST: Should not have York New in the trie
523 example4 = ["a-b-c-d e a-b-c-d f"]
524 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
525 example6 = ["le-petit chat"
531 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
534 [("<start>", 1, nan, nan, nan, nan, 0.0)
535 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
536 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
537 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
538 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
539 --,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
543 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
544 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
545 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
546 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
547 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
548 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
549 ,("York <stop>", 1, nan, nan, nan, nan, nan)
551 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
552 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
553 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
554 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
555 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
556 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
557 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
558 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
565 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
566 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
567 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
568 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
569 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
570 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
571 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
578 [("example0", 2, example0, checks0)
579 ,("example1", 2, example1, [])
580 ,("example2", 3, example2, checks2)
581 ,("example3", 2, example3, [])
582 ,("example4", 4, example4, [])
583 ,("example5", 5, example5, [])
585 (\(name, n, ex, checks) -> do
586 P.putStrLn $ name <> " " <> show n
587 b <- testEleve False n ex checks
588 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"