2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 -- import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Data.Functor.Reverse
48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
49 import Control.Monad (forM_)
51 import qualified Data.List as L
53 import Data.Text (Text)
54 import qualified Data.Text as T
56 import Data.Maybe (fromMaybe)
57 import qualified Data.Map as Map
58 import Gargantext.Prelude hiding (cs)
59 import qualified Gargantext.Prelude as GP
60 import qualified Data.Tree as Tree
61 import Data.Tree (Tree)
62 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
69 -- ^ TODO: only used for debugging
71 ------------------------------------------------------------------------
72 -- | Example and tests for development
75 , _info_entropy_var :: e
79 instance Show e => Show (I e) where
80 show (I e v n) = show (e, v, n)
84 type ModEntropy i o e = (e -> e) -> i -> o
86 set_autonomy :: ModEntropy e (I e) e
87 set_autonomy f e = I e e (f e)
89 set_entropy_var :: ModEntropy e (I e) e
90 set_entropy_var f e = I e (f e) e
93 data StartStop = Start | Stop
94 deriving (Ord, Eq, Show)
96 data Token = NonTerminal Text
98 deriving (Ord, Eq, Show)
100 isTerminal :: Token -> Bool
101 isTerminal (Terminal _) = True
102 isTerminal (NonTerminal _) = False
104 parseToken :: Text -> Token
105 parseToken "<start>" = Terminal Start
106 parseToken "<stop>" = Terminal Stop
107 parseToken t = NonTerminal t
110 -- >>> reverseTokens [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]
111 -- [Terminal Start,NonTerminal "york",NonTerminal "new",Terminal Stop]
112 reverseTokens :: [Token] -> [Token]
113 reverseTokens xs = case lastMay xs of
115 Just (Terminal Stop) -> reverseTokens' xs <> [Terminal Stop]
116 _ -> reverseTokens' xs
118 reverseTokens' :: [Token] -> [Token]
119 reverseTokens' [] = []
120 reverseTokens' [Terminal Stop] = []
121 reverseTokens' [x] = [x]
122 reverseTokens' (x:xs) = case x of
123 Terminal Start -> [Terminal Start] <> reverseTokens' xs
124 _ -> reverseTokens' xs <> [x]
127 toToken :: [Text] -> [Token]
128 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
130 printToken :: Token -> Text
133 f (NonTerminal x) = x
134 f (Terminal Start) = "<start>"
135 f (Terminal Stop) = "<stop>"
136 ------------------------------------------------------------------------
139 = Node { _node_count :: Int
141 , _node_children :: Map k (Trie k e)
143 | Leaf { _node_count :: Int }
148 insertTries :: Ord k => [[k]] -> Trie k ()
149 insertTries = L.foldr insertTrie emptyTrie
151 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
152 insertTrie [] n = n { _node_count = _node_count n +1}
153 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
154 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
156 f = Just . insertTrie xs . fromMaybe emptyTrie
158 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
159 -- emptyTrie = Node 0 mempty mempty
160 emptyTrie :: Trie k e
163 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
165 | Map.null children = Leaf c
166 | otherwise = Node c mempty children
168 -----------------------------
169 -- | Trie to Tree since Tree as nice print function
170 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
171 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
172 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
174 ------------------------------------------------------------------------
175 ------------------------------------------------------------------------
176 nan :: Floating e => e
179 noNaNs :: P.RealFloat e => [e] -> [e]
180 noNaNs = filter (not . P.isNaN)
182 updateIfDefined :: P.RealFloat e => e -> e -> e
183 updateIfDefined e0 e | P.isNaN e = e0
186 sim :: Entropy e => e -> e -> Bool
187 sim x y = x == y || (P.isNaN x && P.isNaN y)
189 subst :: Entropy e => (e, e) -> e -> e
190 subst (src, dst) x | sim src x = dst
193 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
194 entropyTrie _ (Leaf c) = Leaf c
195 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
197 e = sum $ map f $ Map.toList children
198 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
199 else - chc * P.logBase 2 chc
201 chc = fromIntegral (_node_count child) / fromIntegral c
202 ------------------------------------------------------------------------
203 normalizeLevel :: Entropy e => e -> [e] -> e -> e
204 normalizeLevel prev = go . noNaNs
207 go es = \e -> ((e - prev) - m) / v
214 nodeChildren :: Trie k e -> Map k (Trie k e)
215 nodeChildren (Node _ _ cs) = cs
216 nodeChildren (Leaf _) = Map.empty
221 class IsTrie trie where
222 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
223 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
224 nodeChild :: Ord k => k -> trie k e -> trie k e
225 findTrie :: Ord k => [k] -> trie k e -> trie k e
228 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
229 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
231 instance IsTrie Trie where
232 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
234 nodeEntropy inE (Node _ e _) = e ^. inE
235 nodeEntropy _ (Leaf _) = nan
237 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
238 nodeChild _ (Leaf _) = emptyTrie
240 findTrie ks t = L.foldl (flip nodeChild) t ks
242 normalizeEntropy :: Entropy e
243 => Getting e i e -> ModEntropy i o e
244 -> Trie k i -> Trie k o
245 normalizeEntropy inE modE t = go (modE identity) level t
247 level = (entropyLevels inE t)
248 go _ [] _ = panic "normalizeEntropy' empty levels"
249 go _ _ (Leaf c) = Leaf c
250 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
251 go f (es : ess) (Node c i children)
252 -- | any (sim (i ^. inE)) es
253 = Node c (f i) $ go (modE $ normalizeLevel (i ^. inE) es) ess <$> children
255 -- = panic "NOT an elem"
259 This is only normalizing a node with respect to its brothers (unlike all the
260 nodes of the same level).
262 normalizeEntropy inE modE = go $ modE identity
264 go _ (Leaf c) = Leaf c
265 go f (Node c i children)
266 | Map.null children =
267 panic "normalizeEntropy: impossible"
269 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
271 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
273 ------------------------------------------------------------------------
275 levels :: Trie k e -> [[Trie k e]]
276 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
278 subForest :: Trie k e -> [Trie k e]
279 subForest (Leaf _) = []
280 subForest (Node _ _ children) = Map.elems children
282 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
283 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
285 ------------------------------------------------------------------------
287 data Tries k e = Tries
296 instance IsTrie Tries where
297 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
298 , _bwd = buildTrie to n (map reverse $ tts)
301 nodeEntropy inE (Tries fwd bwd) =
302 -- VETODO reverse the query for bwd here
303 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
304 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
306 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
308 -- TODO: here this is tempting to reverse but this is not always what we
309 -- want. See also nodeAutonomy.
310 -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
311 -- since recursivity of the function makes the reverse multiple times (I guess)
313 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
315 ------------------------------------------------------------------------
316 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
318 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
319 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
322 consRev xs xss = reverse xs : xss
324 go _ pref [] = [reverse pref]
325 go _ pref (Terminal Stop:_) = [reverse pref]
326 go t pref (Terminal Start:xs) = go t pref xs
328 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
330 then go xt (x:pref) xs
331 else consRev pref $ go xt0 [x] xs
336 -- ^ entropy of the current prefix
340 -- ^ entropy of the current prefix plus x
341 acc = ext > et + ext0
342 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
344 ne d t = if P.isNaN e then d else e
345 where e = nodeEntropy inE t
348 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
350 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
353 ------------------------------------------------------------------------
354 ------------------------------------------------------------------------
356 mainEleve :: Int -> [[Text]] -> [[[Text]]]
359 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
361 inp = toToken <$> input
362 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
365 chunkAlongEleve :: Int -> [a] -> [[a]]
366 chunkAlongEleve n xs = L.take n <$> L.tails xs
368 toToken' :: Int -> [[Text]] -> [[Token]]
369 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
371 ---------------------------------------------
372 set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
373 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
374 mapTree (\k -> modE $ entropy_var'' inE tries k) [] fwd
376 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
377 mapTree f k t = go f k t
379 go _ _ (Leaf c) = Leaf c
380 go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
382 entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
383 entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
385 fwd = (nodeEntropy inE (_fwd $ findTrie ng tries))
386 bwd = (nodeEntropy inE (_bwd $ findTrie (reverseTokens ng) tries))
388 ---------------------------------------------
389 -- | TODO remove function below after following bug fixed
390 -- | TODO entropy_var' /= entropy_var on "<start> token.."
391 entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
392 entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
393 , (nodeEntropy info_entropy (_bwd $ findTrie (reverseTokens ng) tries))
397 entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
398 entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
399 , (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
403 ntf = parseToken <$> ng
404 ntb = parseToken <$> reverse ng
406 ---------------------------------------------
408 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
409 testEleve debug n output checks = do
412 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
415 , cs <- chunkAlong m 1 <$> inp
420 --res = map (map printToken) . split identity fwd <$> inp
421 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
422 res = map (map printToken) . split info_autonomy nt <$> inp
424 P.putStrLn (show input)
425 -- forM_ pss (P.putStrLn . show)
427 P.putStrLn "Forward:"
430 P.putStrLn "Backward:"
434 forM_ (entropyLevels identity t'') $ \level ->
435 P.putStrLn $ " " <> show level
437 P.putStrLn "Normalized:"
440 P.putStrLn "Splitting:"
441 P.putStrLn $ show res
443 pure $ expected == res
446 out = T.words <$> output
447 expected = fmap (T.splitOn "-") <$> out
448 input = (T.splitOn "-" =<<) <$> out
449 inp = toToken <$> input
451 t :: Tries Token Double
452 t = buildTrie toToken' n input
453 & bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
454 -- TODO NP: this is a hack to set the bwd entropy of Start at NaN.
456 t'' :: Trie Token Double
457 t'' = set_entropy_vars identity (\e _i -> e) t
459 nt :: Trie Token (I Double)
460 nt = normalizeEntropy identity set_autonomy t''
462 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
463 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
467 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
468 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
470 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
471 let ns = parseToken <$> T.words ngram
472 nsb = parseToken <$> (reverse $ T.words ngram)
474 tvar = findTrie ns t''
477 P.putStrLn $ " " <> T.unpack ngram <> ":"
478 check (==) "count" count (_node_count tvar)
479 check sim "entropy_var" entropy (nodeEntropy identity tvar)
480 --check sim ("entropy_varOK") entropy (entropy_var (T.words ngram) nt)
481 --check sim "entropy" entropy (entropy_var' nt (parseToken <$> T.words ngram))
482 {- ^ FIXME 2 fun above should have same results (error in reverseToken):
485 FAIL entropy ref=NaN my=0.0
488 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt')
489 check sim "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
490 check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
492 printTrie :: Show e => Trie Token e -> IO ()
494 P.putStrLn . Tree.drawTree
496 . toTree (NonTerminal "")
498 -- | TODO real data is a list of tokenized sentences
499 example0, example1, example2, example3, example4, example5, example6 :: [Text]
500 example0 = ["New-York is New-York and New-York"]
501 example1 = ["to-be or not to-be"]
502 example2 = ["to-be-or not to-be-or NOT to-be and"]
503 example3 = example0 <> example0
504 -- > TEST: Should not have York New in the trie
505 example4 = ["a-b-c-d e a-b-c-d f"]
506 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
507 example6 = ["le-petit chat"
513 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
516 [("<start>", 1, nan, nan, nan, nan, 0.0)
517 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
518 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
519 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
520 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
521 --,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
525 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
526 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
527 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
528 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
529 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
530 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
531 ,("York <stop>", 1, nan, nan, nan, nan, nan)
533 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
534 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
535 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
536 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
537 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
538 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
539 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
540 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
547 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
548 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
549 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
550 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
551 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
552 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
553 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
560 [("example0", 2, example0, checks0)
561 ,("example1", 2, example1, [])
562 ,("example2", 3, example2, checks2)
563 ,("example3", 2, example3, [])
564 ,("example4", 4, example4, [])
565 ,("example5", 5, example5, [])
567 (\(name, n, ex, checks) -> do
568 P.putStrLn $ name <> " " <> show n
569 b <- testEleve False n ex checks
570 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"