2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 -- import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Control.Lens hiding (levels, children)
48 import Control.Monad (forM_)
50 import qualified Data.List as L
52 import Data.Text (Text)
53 import qualified Data.Text as T
55 import Data.Maybe (fromMaybe)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
62 nan :: Floating e => e
65 noNaNs :: P.RealFloat e => [e] -> [e]
66 noNaNs = filter (not . P.isNaN)
68 updateIfDefined :: P.RealFloat e => e -> e -> e
69 updateIfDefined e0 e | P.isNaN e = e0
72 sim :: Entropy e => e -> e -> Bool
73 sim x y = x == y || (P.isNaN x && P.isNaN y)
75 subst :: Entropy e => (e, e) -> e -> e
76 subst (src, dst) x | sim src x = dst
78 ------------------------------------------------------------------------
85 -- ^ TODO: only used for debugging
87 ------------------------------------------------------------------------
88 -- | Example and tests for development
91 , _info_entropy_var :: e
95 instance Show e => Show (I e) where
96 show (I e ev a) = show (e, ev, a)
100 type ModEntropy i o e = (e -> e) -> i -> o
102 set_autonomy :: Entropy e => ModEntropy e (I e) e
103 set_autonomy f e = I e nan (f e)
105 set_entropy_var :: Entropy e => Setter e (I e) e e
106 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
108 data StartStop = Start | Stop
109 deriving (Ord, Eq, Show)
111 data Token = NonTerminal Text
113 deriving (Ord, Eq, Show)
115 isTerminal :: Token -> Bool
116 isTerminal (Terminal _) = True
117 isTerminal (NonTerminal _) = False
119 parseToken :: Text -> Token
120 parseToken "<start>" = Terminal Start
121 parseToken "<stop>" = Terminal Stop
122 parseToken t = NonTerminal t
124 toToken :: [Text] -> [Token]
125 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
127 printToken :: Token -> Text
130 f (NonTerminal x) = x
131 f (Terminal Start) = "<start>"
132 f (Terminal Stop) = "<stop>"
133 ------------------------------------------------------------------------
136 = Node { _node_count :: Int
138 , _node_children :: Map k (Trie k e)
140 | Leaf { _node_count :: Int }
145 insertTries :: Ord k => [[k]] -> Trie k ()
146 insertTries = L.foldr insertTrie emptyTrie
148 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
149 insertTrie [] n = n { _node_count = _node_count n +1}
150 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
151 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
153 f = Just . insertTrie xs . fromMaybe emptyTrie
155 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
156 -- emptyTrie = Node 0 mempty mempty
157 emptyTrie :: Trie k e
160 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
162 | Map.null children = Leaf c
163 | otherwise = Node c mempty children
165 -----------------------------
166 -- | Trie to Tree since Tree as nice print function
167 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
168 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
169 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
171 ------------------------------------------------------------------------
172 ------------------------------------------------------------------------
173 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
174 entropyTrie _ (Leaf c) = Leaf c
175 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
177 children' = Map.toList children
178 sum_count = sum $ _node_count . snd <$> children'
179 e | sum_count == 0 = nan
180 | otherwise = sum $ f <$> children'
181 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
182 else - chc * P.logBase 2 chc
184 chc = fromIntegral (_node_count child) / fromIntegral c
185 ------------------------------------------------------------------------
186 normalizeLevel :: Entropy e => e -> e -> e -> e
187 normalizeLevel m v e = (e - m) / v
191 nodeChildren :: Trie k e -> Map k (Trie k e)
192 nodeChildren (Node _ _ cs) = cs
193 nodeChildren (Leaf _) = Map.empty
198 class IsTrie trie where
199 buildTrie :: Entropy e => [[Token]] -> trie Token e
200 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
201 nodeChild :: Ord k => k -> trie k e -> trie k e
202 findTrie :: Ord k => [k] -> trie k e -> trie k e
203 findTrieR :: Ord k => [k] -> trie k e -> trie k e
204 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
205 evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
206 normalizeEntropy :: Entropy e
207 => Getting e i e -> ModEntropy i o e
208 -> trie k i -> trie k o
211 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
212 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
214 instance IsTrie Trie where
215 buildTrie ts = entropyTrie isTerminal $ insertTries ts
217 nodeEntropy inE (Node _ e _) = e ^. inE
218 nodeEntropy _ (Leaf _) = nan
220 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
221 nodeChild _ (Leaf _) = emptyTrie
223 findTrie ks t = L.foldl (flip nodeChild) t ks
227 P.putStrLn . Tree.drawTree
229 $ toTree (NonTerminal "") t
230 P.putStrLn " Levels:"
231 forM_ (normalizationLevels inE t) $ \level ->
232 P.putStrLn $ " " <> show level
234 evTrie inE setEV = go nan
236 go _ (Leaf c) = Leaf c
237 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
243 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
245 go _ _ (Leaf c) = Leaf c
246 go _ [] _ = panic "normalizeEntropy' empty levels"
247 go f ((m, v, _) : ess) (Node c i children)
248 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
249 ------------------------------------------------------------------------
251 levels :: Trie k e -> [[Trie k e]]
252 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
254 subForest :: Trie k e -> [Trie k e]
255 subForest (Leaf _) = []
256 subForest (Node _ _ children) = Map.elems children
258 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
259 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
261 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
262 normalizationLevels inE = fmap f . entropyLevels inE
264 f es = (mean es, deviation es, length es)
266 ------------------------------------------------------------------------
268 data Tries k e = Tries
275 nodeEntropySafe :: Entropy e => Getting e i e -> Tries k i -> e
276 nodeEntropySafe inE (Tries f b) =
277 mean $ noNaNs [nodeEntropy inE f, nodeEntropy inE b]
279 nodeEntropyBwdOpt :: Entropy e => Getting e i e -> Tries k i -> e
280 nodeEntropyBwdOpt inE (Tries f b) =
281 mean $ nodeEntropy inE f : noNaNs [nodeEntropy inE b]
283 instance IsTrie Tries where
284 buildTrie tts = Tries { _fwd = buildTrie tts
285 , _bwd = buildTrie (reverse <$> tts)
288 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
290 findTrie ks = onTries (findTrie ks)
291 findTrieR ks (Tries f b) = Tries (findTrieR ks f) (findTrieR (reverse ks) b)
293 nodeChild = onTries . nodeChild
295 evTrie inE setEV = onTries $ evTrie inE setEV
297 normalizeEntropy inE = onTries . normalizeEntropy inE
299 printTrie inE (Tries f b) = do
300 P.putStrLn "Forward:"
303 P.putStrLn "Backward:"
306 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
307 onTries h (Tries f b) = Tries (h f) (h b)
309 ------------------------------------------------------------------------
310 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
312 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
313 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
316 consRev xs xss = reverse xs : xss
318 go _ pref [] = [reverse pref]
319 go _ pref (Terminal Stop:_) = [reverse pref]
320 go t pref (Terminal Start:xs) = go t pref xs
322 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
324 then go xt (x:pref) xs
325 else consRev pref $ go xt0 [x] xs
330 -- ^ entropy of the current prefix
334 -- ^ entropy of the current prefix plus x
335 acc = ext > et + ext0
336 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
338 ne d t = if P.isNaN e then d else e
339 where e = nodeEntropy inE t
342 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
344 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
347 ------------------------------------------------------------------------
348 ------------------------------------------------------------------------
350 mainEleve :: Int -> [[Text]] -> [[[Text]]]
353 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
355 inp = toToken <$> input
356 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
359 chunkAlongEleve :: Int -> [a] -> [[a]]
360 chunkAlongEleve n xs = L.take n <$> L.tails xs
362 toToken' :: Int -> [[Text]] -> [[Token]]
363 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
365 toTokenR' :: Int -> [[Text]] -> [[Token]]
366 toTokenR' n input = L.concat $ (filter (/= [Terminal Start]) . chunkAlongEleve (n + 2) . reverse) <$> toToken <$> input
368 ---------------------------------------------
370 set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
371 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
372 mapTree (\k -> modE $ nodeEntropy inE (findTrieR k tries)) [] fwd
374 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
375 mapTree f k t = go f k t
377 go _ _ (Leaf c) = Leaf c
378 go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
381 ---------------------------------------------
383 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
385 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
386 testEleve debug n output checks = do
389 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
392 , cs <- chunkAlong m 1 <$> inp
397 --res = map (map printToken) . split identity fwd <$> inp
398 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
399 res = map (map printToken) . split info_autonomy nt <$> inp
401 P.putStrLn (show input)
402 -- forM_ pss (P.putStrLn . show)
404 printTrie info_entropy nt
406 -- P.putStrLn "Entropy Var:"
407 -- printTrie identity t''
409 P.putStrLn "Splitting:"
410 P.putStrLn $ show res
412 pure $ expected == res
415 out = T.words <$> output
416 expected = fmap (T.splitOn "-") <$> out
417 input = (T.splitOn "-" =<<) <$> out
418 inp = toToken <$> input
420 t :: Tries Token Double
421 t = -- buildTrie (toToken' n input)
422 Tries { _fwd = buildTrie (toToken' n input)
423 , _bwd = buildTrie (toTokenR' n input)
426 evt :: Tries Token (I Double)
427 evt = evTrie identity set_entropy_var t
429 nt :: Tries Token (I Double)
430 nt = normalizeEntropy info_entropy_var (\fe i -> i & info_autonomy .~ fe (i ^. info_entropy_var)) evt
432 -- t'' :: Trie Token Double
433 -- t'' = set_entropy_vars info_autonomy (\e _i -> e) nt
435 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
436 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
440 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
441 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
443 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
444 let ns = parseToken <$> T.words ngram
446 -- tvar = findTrie ns t''
447 -- my_entropy_var = nodeEntropy identity tvar
450 P.putStrLn $ " " <> T.unpack ngram <> ":"
451 check (==) "count" count (_node_count (_fwd t'))
452 check sim "entropy" entropy (nodeEntropyBwdOpt info_entropy nt')
453 check sim "ev" ev (nodeEntropy info_entropy_var nt')
454 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt')
455 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
456 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
457 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
458 check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
459 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
460 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
462 -- | TODO real data is a list of tokenized sentences
463 example0, example1, example2, example3, example4, example5, example6 :: [Text]
464 example0 = ["New-York is New-York and New-York"]
465 example1 = ["to-be or not to-be"]
466 example2 = ["to-be-or not to-be-or NOT to-be and"]
467 example3 = example0 <> example0
468 -- > TEST: Should not have York New in the trie
469 example4 = ["a-b-c-d e a-b-c-d f"]
470 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
471 example6 = ["le-petit chat"
477 checks0, checks2 :: Checks Double
480 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
481 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
482 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
483 , ("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
484 , ("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
485 --, ("<stop>", 0.0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
486 -- Since it is not in the trie it no, need to count it.
487 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
488 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, nan, nan)
489 , ("York is", 1, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474, nan, nan, nan)
490 , ("is New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
491 , ("York and", 1, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474, nan, nan, nan)
492 , ("and New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
493 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, nan, nan, nan)
494 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
495 , ("New York is", 1, 0.0, nan, nan, 0.0, -1.584962500721156, nan, nan, nan, nan)
496 , ("York is New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
497 , ("is New York", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
498 , ("New York and", 1, 0.0, nan, nan, 0.0, -1.584962500721156, nan, nan, nan, nan)
499 , ("York and New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
500 , ("and New York", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
501 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, nan, nan, nan)
508 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
509 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
510 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
511 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
512 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
513 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
514 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
521 [("example0", 2, example0, checks0)
522 ,("example1", 2, example1, [])
523 ,("example2", 3, example2, checks2)
524 ,("example3", 2, example3, [])
525 ,("example4", 4, example4, [])
526 ,("example5", 5, example5, [])
528 (\(name, n, ex, checks) -> do
529 P.putStrLn $ name <> " " <> show n
530 b <- testEleve False n ex checks
531 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"