2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 -- import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Data.Functor.Reverse
48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed, at, (.~), to, set)
49 import Control.Monad (forM_)
51 import qualified Data.List as L
53 import Data.Text (Text)
54 import qualified Data.Text as T
56 import Data.Maybe (fromMaybe)
57 import qualified Data.Map as Map
58 import Gargantext.Prelude hiding (cs)
59 import qualified Gargantext.Prelude as GP
60 import qualified Data.Tree as Tree
61 import Data.Tree (Tree)
62 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
69 -- ^ TODO: only used for debugging
71 ------------------------------------------------------------------------
72 -- | Example and tests for development
75 , _info_entropy_var :: e
79 instance Show e => Show (I e) where
80 show (I e v n) = show (e, v, n)
84 type ModEntropy i o e = (e -> e) -> i -> o
86 set_autonomy :: ModEntropy e (I e) e
87 set_autonomy f e = I e e (f e)
89 set_entropy_var :: ModEntropy e (I e) e
90 set_entropy_var f e = I e (f e) e
93 data StartStop = Start | Stop
94 deriving (Ord, Eq, Show)
96 data Token = NonTerminal Text
98 deriving (Ord, Eq, Show)
100 isTerminal :: Token -> Bool
101 isTerminal (Terminal _) = True
102 isTerminal (NonTerminal _) = False
104 parseToken :: Text -> Token
105 parseToken "<start>" = Terminal Start
106 parseToken "<stop>" = Terminal Stop
107 parseToken t = NonTerminal t
109 toToken :: [Text] -> [Token]
110 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
112 printToken :: Token -> Text
115 f (NonTerminal x) = x
116 f (Terminal Start) = "<start>"
117 f (Terminal Stop) = "<stop>"
118 ------------------------------------------------------------------------
121 = Node { _node_count :: Int
123 , _node_children :: Map k (Trie k e)
125 | Leaf { _node_count :: Int }
130 insertTries :: Ord k => [[k]] -> Trie k ()
131 insertTries = L.foldr insertTrie emptyTrie
133 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
134 insertTrie [] n = n { _node_count = _node_count n +1}
135 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
136 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
138 f = Just . insertTrie xs . fromMaybe emptyTrie
140 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
141 -- emptyTrie = Node 0 mempty mempty
142 emptyTrie :: Trie k e
145 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
147 | Map.null children = Leaf c
148 | otherwise = Node c mempty children
150 -----------------------------
151 -- | Trie to Tree since Tree as nice print function
152 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
153 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
154 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
156 ------------------------------------------------------------------------
157 ------------------------------------------------------------------------
158 nan :: Floating e => e
161 noNaNs :: P.RealFloat e => [e] -> [e]
162 noNaNs = filter (not . P.isNaN)
164 updateIfDefined :: P.RealFloat e => e -> e -> e
165 updateIfDefined e0 e | P.isNaN e = e0
168 sim :: Entropy e => e -> e -> Bool
169 sim x y = x == y || (P.isNaN x && P.isNaN y)
171 subst :: Entropy e => (e, e) -> e -> e
172 subst (src, dst) x | sim src x = dst
175 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
176 entropyTrie _ (Leaf c) = Leaf c
177 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
179 e = sum $ map f $ Map.toList children
180 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
181 else - chc * P.logBase 2 chc
183 chc = fromIntegral (_node_count child) / fromIntegral c
184 ------------------------------------------------------------------------
185 normalizeLevel :: Entropy e => e -> e -> e -> e -> e
186 normalizeLevel prev m v e = ((e - prev) - m) / v
190 nodeChildren :: Trie k e -> Map k (Trie k e)
191 nodeChildren (Node _ _ cs) = cs
192 nodeChildren (Leaf _) = Map.empty
197 class IsTrie trie where
198 buildTrie :: Entropy e => [[Token]] -> trie Token e
199 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
200 nodeChild :: Ord k => k -> trie k e -> trie k e
201 findTrie :: Ord k => [k] -> trie k e -> trie k e
202 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
203 normalizeEntropy :: Entropy e
204 => Getting e i e -> ModEntropy i o e
205 -> trie k i -> trie k o
208 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
209 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
211 instance IsTrie Trie where
212 buildTrie ts = entropyTrie isTerminal $ insertTries ts
214 nodeEntropy inE (Node _ e _) = e ^. inE
215 nodeEntropy _ (Leaf _) = nan
217 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
218 nodeChild _ (Leaf _) = emptyTrie
220 findTrie ks t = L.foldl (flip nodeChild) t ks
223 P.putStrLn . Tree.drawTree
225 $ toTree (NonTerminal "") t
226 P.putStrLn " Levels:"
227 forM_ (normalizationLevels inE t) $ \level ->
228 P.putStrLn $ " " <> show level
230 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
232 go _ _ (Leaf c) = Leaf c
233 go _ [] _ = panic "normalizeEntropy' empty levels"
234 go f ((m, v, _) : ess) (Node c i children)
235 = Node c (f i) $ go (modE $ normalizeLevel (i ^. inE) m v) ess <$> children
239 This is only normalizing a node with respect to its brothers (unlike all the
240 nodes of the same level).
242 normalizeEntropy inE modE = go $ modE identity
244 go _ (Leaf c) = Leaf c
245 go f (Node c i children)
246 | Map.null children =
247 panic "normalizeEntropy: impossible"
249 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
251 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
253 ------------------------------------------------------------------------
255 levels :: Trie k e -> [[Trie k e]]
256 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
258 subForest :: Trie k e -> [Trie k e]
259 subForest (Leaf _) = []
260 subForest (Node _ _ children) = Map.elems children
262 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
263 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
265 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
266 normalizationLevels inE = fmap f . entropyLevels inE
268 f es = (mean es, deviation es, length es)
270 ------------------------------------------------------------------------
272 data Tries k e = Tries
281 instance IsTrie Tries where
282 buildTrie tts = Tries { _fwd = buildTrie tts
283 , _bwd = buildTrie (reverse <$> tts)
286 nodeEntropy inE (Tries fwd bwd) =
287 -- VETODO reverse the query for bwd here
288 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
289 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
291 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
293 -- TODO: here this is tempting to reverse but this is not always what we
294 -- want. See also nodeAutonomy.
295 -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
296 -- since recursivity of the function makes the reverse multiple times (I guess)
298 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
300 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
302 printTrie inE (Tries fwd bwd) = do
303 P.putStrLn "Forward:"
306 P.putStrLn "Backward:"
309 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
310 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
312 ------------------------------------------------------------------------
313 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
315 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
316 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
319 consRev xs xss = reverse xs : xss
321 go _ pref [] = [reverse pref]
322 go _ pref (Terminal Stop:_) = [reverse pref]
323 go t pref (Terminal Start:xs) = go t pref xs
325 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
327 then go xt (x:pref) xs
328 else consRev pref $ go xt0 [x] xs
333 -- ^ entropy of the current prefix
337 -- ^ entropy of the current prefix plus x
338 acc = ext > et + ext0
339 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
341 ne d t = if P.isNaN e then d else e
342 where e = nodeEntropy inE t
345 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
347 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
350 ------------------------------------------------------------------------
351 ------------------------------------------------------------------------
353 mainEleve :: Int -> [[Text]] -> [[[Text]]]
356 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
358 inp = toToken <$> input
359 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
362 chunkAlongEleve :: Int -> [a] -> [[a]]
363 chunkAlongEleve n xs = L.take n <$> L.tails xs
365 toToken' :: Int -> [[Text]] -> [[Token]]
366 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
368 ---------------------------------------------
369 set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
370 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
371 mapTree (\k -> modE $ entropy_var'' inE tries k) [] fwd
373 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
374 mapTree f k t = go f k t
376 go _ _ (Leaf c) = Leaf c
377 go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
379 entropy_var'' :: Entropy e => Getting e i e -> Tries Token i -> [Token] -> e
380 entropy_var'' inE tries ng = mean $ noNaNs [fwd, bwd]
382 fwd = (nodeEntropy inE (_fwd $ findTrie ng tries))
383 bwd = (nodeEntropy inE (_bwd $ findTrie (reverse ng) tries))
385 ---------------------------------------------
386 -- | TODO remove function below after following bug fixed
387 -- | TODO entropy_var' /= entropy_var on "<start> token.."
388 entropy_var' :: Entropy e => Tries Token (I e) -> [Token] -> e
389 entropy_var' tries ng = (mean $ noNaNs [ (nodeEntropy info_entropy (_fwd $ findTrie ng tries))
390 , (nodeEntropy info_entropy (_bwd $ findTrie (reverse ng) tries))
394 entropy_var :: Entropy e => [Text] -> Tries Token (I e) -> e
395 entropy_var ng trie = (mean [ (nodeEntropy info_entropy (_fwd $ findTrie ntf trie))
396 , (nodeEntropy info_entropy (_bwd $ findTrie ntb trie))
400 ntf = parseToken <$> ng
401 ntb = parseToken <$> reverse ng
403 ---------------------------------------------
405 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
406 testEleve debug n output checks = do
409 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
412 , cs <- chunkAlong m 1 <$> inp
417 --res = map (map printToken) . split identity fwd <$> inp
418 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
419 res = map (map printToken) . split info_autonomy nt <$> inp
421 P.putStrLn (show input)
422 -- forM_ pss (P.putStrLn . show)
424 printTrie info_entropy nt
426 P.putStrLn "Entropy Var:"
427 printTrie identity t''
429 P.putStrLn "Splitting:"
430 P.putStrLn $ show res
432 pure $ expected == res
435 out = T.words <$> output
436 expected = fmap (T.splitOn "-") <$> out
437 input = (T.splitOn "-" =<<) <$> out
438 inp = toToken <$> input
440 t :: Tries Token Double
441 t = buildTrie (toToken' n input)
442 & bwd . node_children . at (Terminal Start) . _Just . node_entropy .~ nan
443 -- TODO NP: this is a hack to set the bwd entropy of Start at NaN.
445 nt :: Tries Token (I Double)
446 nt = normalizeEntropy identity set_autonomy t
448 t'' :: Trie Token Double
449 t'' = set_entropy_vars info_autonomy (\e _i -> e) nt
451 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
452 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
456 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
457 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
459 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
460 let ns = parseToken <$> T.words ngram
461 nsb = parseToken <$> (reverse $ T.words ngram)
463 tvar = findTrie ns t''
466 P.putStrLn $ " " <> T.unpack ngram <> ":"
467 check (==) "count" count (_node_count tvar)
468 check sim "entropy_var" entropy (nodeEntropy identity tvar)
469 --check sim ("entropy_varOK") entropy (entropy_var (T.words ngram) nt)
470 --check sim "entropy" entropy (entropy_var' nt (parseToken <$> T.words ngram))
471 {- ^ FIXME 2 fun above should have same results (error in reverseToken):
474 FAIL entropy ref=NaN my=0.0
477 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt')
478 check sim "fwd_entropy" fwd_entropy (nodeEntropy identity (_fwd t'))
479 check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
481 -- | TODO real data is a list of tokenized sentences
482 example0, example1, example2, example3, example4, example5, example6 :: [Text]
483 example0 = ["New-York is New-York and New-York"]
484 example1 = ["to-be or not to-be"]
485 example2 = ["to-be-or not to-be-or NOT to-be and"]
486 example3 = example0 <> example0
487 -- > TEST: Should not have York New in the trie
488 example4 = ["a-b-c-d e a-b-c-d f"]
489 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
490 example6 = ["le-petit chat"
496 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
499 [("<start>", 1, nan, nan, nan, nan, 0.0)
500 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
501 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
502 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
503 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
504 --,("<stop>", 0, nan, nan, nan, 0.0, nan) Since it is not in the trie it no,
508 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
509 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
510 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
511 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
512 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
513 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
514 ,("York <stop>", 1, nan, nan, nan, nan, nan)
516 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
517 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
518 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
519 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
520 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
521 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
522 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
523 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
530 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
531 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
532 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
533 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
534 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
535 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
536 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
543 [("example0", 2, example0, checks0)
544 ,("example1", 2, example1, [])
545 ,("example2", 3, example2, checks2)
546 ,("example3", 2, example3, [])
547 ,("example4", 4, example4, [])
548 ,("example5", 5, example5, [])
550 (\(name, n, ex, checks) -> do
551 P.putStrLn $ name <> " " <> show n
552 b <- testEleve False n ex checks
553 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"