2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26 - AD: Real ngrams extraction test
27 from Gargantext.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ Gargantext.map _hyperdataDocument_abstract docs
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE NoImplicitPrelude #-}
36 {-# LANGUAGE OverloadedStrings #-}
37 {-# LANGUAGE RankNTypes #-}
38 {-# LANGUAGE TemplateHaskell #-}
39 {-# LANGUAGE TypeFamilies #-}
41 module Gargantext.Text.Eleve where
43 -- import Debug.Trace (trace)
44 -- import Debug.SimpleReflect
46 import Control.Lens hiding (levels, children)
47 import Control.Monad (forM_)
49 import qualified Data.List as L
51 import Data.Text (Text)
52 import qualified Data.Text as T
54 import Data.Maybe (fromMaybe)
55 import qualified Data.Map as Map
56 import Gargantext.Prelude hiding (cs)
57 import qualified Data.Tree as Tree
58 import Data.Tree (Tree)
59 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
61 nan :: Floating e => e
64 noNaNs :: P.RealFloat e => [e] -> [e]
65 noNaNs = filter (not . P.isNaN)
67 updateIfDefined :: P.RealFloat e => e -> e -> e
68 updateIfDefined e0 e | P.isNaN e = e0
71 sim :: Entropy e => e -> e -> Bool
72 sim x y = x == y || (P.isNaN x && P.isNaN y)
74 subst :: Entropy e => (e, e) -> e -> e
75 subst (src, dst) x | sim src x = dst
77 ------------------------------------------------------------------------
84 -- ^ TODO: only used for debugging
86 ------------------------------------------------------------------------
87 -- | Example and tests for development
90 , _info_entropy_var :: e
94 instance Show e => Show (I e) where
95 show (I e ev a) = show (e, ev, a)
99 type ModEntropy i o e = (e -> e) -> i -> o
101 set_autonomy :: Entropy e => ModEntropy e (I e) e
102 set_autonomy f e = I e nan (f e)
104 set_entropy_var :: Entropy e => Setter e (I e) e e
105 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
107 data StartStop = Start | Stop
108 deriving (Ord, Eq, Show)
110 data Token = NonTerminal Text
112 deriving (Ord, Eq, Show)
114 isTerminal :: Token -> Bool
115 isTerminal (Terminal _) = True
116 isTerminal (NonTerminal _) = False
118 parseToken :: Text -> Token
119 parseToken "<start>" = Terminal Start
120 parseToken "<stop>" = Terminal Stop
121 parseToken t = NonTerminal t
123 toToken :: [Text] -> [Token]
124 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
126 printToken :: Token -> Text
129 f (NonTerminal x) = x
130 f (Terminal Start) = "<start>"
131 f (Terminal Stop) = "<stop>"
132 ------------------------------------------------------------------------
135 = Node { _node_count :: Int
137 , _node_children :: Map k (Trie k e)
139 | Leaf { _node_count :: Int }
144 insertTries :: Ord k => [[k]] -> Trie k ()
145 insertTries = L.foldr insertTrie emptyTrie
147 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
148 insertTrie [] n = n { _node_count = _node_count n +1}
149 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
150 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
152 f = Just . insertTrie xs . fromMaybe emptyTrie
154 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
155 -- emptyTrie = Node 0 mempty mempty
156 emptyTrie :: Trie k e
159 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
161 | Map.null children = Leaf c
162 | otherwise = Node c mempty children
164 -----------------------------
165 -- | Trie to Tree since Tree as nice print function
166 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
167 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
168 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
170 ------------------------------------------------------------------------
171 ------------------------------------------------------------------------
172 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
173 entropyTrie _ (Leaf c) = Leaf c
174 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
176 children' = Map.toList children
177 sum_count = sum $ _node_count . snd <$> children'
178 e | sum_count == 0 = nan
179 | otherwise = sum $ f <$> children'
180 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
181 else - chc * P.logBase 2 chc
183 chc = fromIntegral (_node_count child) / fromIntegral c
184 ------------------------------------------------------------------------
185 normalizeLevel :: Entropy e => e -> e -> e -> e
186 normalizeLevel m v e = (e - m) / v
190 nodeChildren :: Trie k e -> Map k (Trie k e)
191 nodeChildren (Node _ _ cs) = cs
192 nodeChildren (Leaf _) = Map.empty
197 class IsTrie trie where
198 buildTrie :: Entropy e => [[Token]] -> trie Token e
199 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
200 nodeChild :: Ord k => k -> trie k e -> trie k e
201 findTrie :: Ord k => [k] -> trie k e -> trie k e
202 findTrieR :: Ord k => [k] -> trie k e -> trie k e
203 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
204 evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
205 normalizeEntropy :: Entropy e
206 => Getting e i e -> ModEntropy i o e
207 -> trie k i -> trie k o
210 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
211 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
213 instance IsTrie Trie where
214 buildTrie ts = entropyTrie isTerminal $ insertTries ts
216 nodeEntropy inE (Node _ e _) = e ^. inE
217 nodeEntropy _ (Leaf _) = nan
219 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
220 nodeChild _ (Leaf _) = emptyTrie
222 findTrie ks t = L.foldl (flip nodeChild) t ks
226 P.putStrLn . Tree.drawTree
228 $ toTree (NonTerminal "") t
229 P.putStrLn " Levels:"
230 forM_ (normalizationLevels inE t) $ \level ->
231 P.putStrLn $ " " <> show level
233 evTrie inE setEV = go nan
235 go _ (Leaf c) = Leaf c
236 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
242 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
244 go _ _ (Leaf c) = Leaf c
245 go _ [] _ = panic "normalizeEntropy' empty levels"
246 go f ((m, v, _) : ess) (Node c i children)
247 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
248 ------------------------------------------------------------------------
250 levels :: Trie k e -> [[Trie k e]]
251 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
253 subForest :: Trie k e -> [Trie k e]
254 subForest (Leaf _) = []
255 subForest (Node _ _ children) = Map.elems children
257 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
258 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
260 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
261 normalizationLevels inE = fmap f . entropyLevels inE
263 f es = (mean es, deviation es, length es)
265 ------------------------------------------------------------------------
267 data Tries k e = Tries
274 nodeEntropySafe :: Entropy e => Getting e i e -> Tries k i -> e
275 nodeEntropySafe inE (Tries f b) =
276 mean $ noNaNs [nodeEntropy inE f, nodeEntropy inE b]
278 nodeEntropyBwdOpt :: Entropy e => Getting e i e -> Tries k i -> e
279 nodeEntropyBwdOpt inE (Tries f b) =
280 mean $ nodeEntropy inE f : noNaNs [nodeEntropy inE b]
282 instance IsTrie Tries where
283 buildTrie tts = Tries { _fwd = buildTrie tts
284 , _bwd = buildTrie (reverse <$> tts)
287 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
289 findTrie ks = onTries (findTrie ks)
290 findTrieR ks (Tries f b) = Tries (findTrieR ks f) (findTrieR (reverse ks) b)
292 nodeChild = onTries . nodeChild
294 evTrie inE setEV = onTries $ evTrie inE setEV
296 normalizeEntropy inE = onTries . normalizeEntropy inE
298 printTrie inE (Tries f b) = do
299 P.putStrLn "Forward:"
302 P.putStrLn "Backward:"
305 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
306 onTries h (Tries f b) = Tries (h f) (h b)
308 ------------------------------------------------------------------------
309 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
311 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
312 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
315 consRev xs xss = reverse xs : xss
317 go _ pref [] = [reverse pref]
318 go _ pref (Terminal Stop:_) = [reverse pref]
319 go t pref (Terminal Start:xs) = go t pref xs
321 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
323 then go xt (x:pref) xs
324 else consRev pref $ go xt0 [x] xs
329 -- ^ entropy of the current prefix
333 -- ^ entropy of the current prefix plus x
334 acc = ext > et + ext0
335 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
337 ne d t = if P.isNaN e then d else e
338 where e = nodeEntropy inE t
341 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
343 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
346 ------------------------------------------------------------------------
347 ------------------------------------------------------------------------
349 mainEleve :: Int -> [[Text]] -> [[[Text]]]
352 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
354 inp = toToken <$> input
355 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
358 chunkAlongEleve :: Int -> [a] -> [[a]]
359 chunkAlongEleve n xs = L.take n <$> L.tails xs
361 data Order = Backward | Forward
363 toToken' :: Order -> Int -> [[Text]] -> [[Token]]
364 toToken' o n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2) . (order o) ) <$> toToken <$> input
366 order Forward = identity
367 order Backward = reverse
370 ---------------------------------------------
372 set_entropy_vars :: Entropy e => Getting e i e -> (e -> i -> o) -> Tries Token i -> Trie Token o
373 set_entropy_vars inE modE tries@(Tries fwd _bwd) =
374 mapTree (\k -> modE $ nodeEntropy inE (findTrieR k tries)) [] fwd
376 mapTree :: ([Token] -> t -> e) -> [Token] -> Trie Token t -> Trie Token e
377 mapTree f k t = go f k t
379 go _ _ (Leaf c) = Leaf c
380 go f k (Node c i children) = Node c (f k i) (Map.mapWithKey (\k'-> go f (k <> [k'])) children)
383 ---------------------------------------------
385 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
387 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
388 testEleve debug n output checks = do
391 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
394 , cs <- chunkAlong m 1 <$> inp
399 --res = map (map printToken) . split identity fwd <$> inp
400 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
401 res = map (map printToken) . split info_autonomy nt <$> inp
403 P.putStrLn (show input)
404 -- forM_ pss (P.putStrLn . show)
406 printTrie info_entropy nt
408 -- P.putStrLn "Entropy Var:"
409 -- printTrie identity t''
411 P.putStrLn "Splitting:"
412 P.putStrLn $ show res
414 pure $ expected == res
417 out = T.words <$> output
418 expected = fmap (T.splitOn "-") <$> out
419 input = (T.splitOn "-" =<<) <$> out
420 inp = toToken <$> input
422 t :: Tries Token Double
423 t = -- buildTrie (toToken' n input)
424 Tries { _fwd = buildTrie (toToken' Forward n input)
425 , _bwd = buildTrie (toToken' Backward n input)
428 evt :: Tries Token (I Double)
429 evt = evTrie identity set_entropy_var t
431 nt :: Tries Token (I Double)
432 nt = normalizeEntropy info_entropy_var (\fe i -> i & info_autonomy .~ fe (i ^. info_entropy_var)) evt
434 -- t'' :: Trie Token Double
435 -- t'' = set_entropy_vars info_autonomy (\e _i -> e) nt
437 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
438 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
442 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
443 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
445 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
446 let ns = parseToken <$> T.words ngram
448 -- tvar = findTrie ns t''
449 -- my_entropy_var = nodeEntropy identity tvar
452 P.putStrLn $ " " <> T.unpack ngram <> ":"
453 check (==) "count" count (_node_count (_fwd t'))
455 check sim "entropy" entropy (nodeEntropyBwdOpt info_entropy nt' )
456 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
457 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
459 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
460 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
461 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
463 check sim "bwd_entropy" bwd_entropy (nodeEntropy identity (_bwd t'))
464 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
465 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
467 -- | TODO real data is a list of tokenized sentences
468 example0, example1, example2, example3, example4, example5, example6 :: [Text]
469 example0 = ["New-York is New-York and New-York"]
470 example1 = ["to-be or not to-be"]
471 example2 = ["to-be-or not to-be-or NOT to-be and"]
472 example3 = example0 <> example0
473 -- > TEST: Should not have York New in the trie
474 example4 = ["a-b-c-d e a-b-c-d f"]
475 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
476 example6 = ["le-petit chat"
482 checks0, checks2 :: Checks Double
485 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
486 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
487 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
488 , ("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
489 , ("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
490 --, ("<stop>", 0.0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
491 -- Since it is not in the trie it no, need to count it.
492 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
493 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, nan, nan)
494 , ("York is", 1, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474, nan, nan, nan)
495 , ("is New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
496 , ("York and", 1, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474, nan, nan, nan)
497 , ("and New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
498 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, nan, nan, nan)
499 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
500 , ("New York is", 1, 0.0, nan, nan, 0.0, -1.584962500721156, nan, nan, nan, nan)
501 , ("York is New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
502 , ("is New York", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
503 , ("New York and", 1, 0.0, nan, nan, 0.0, -1.584962500721156, nan, nan, nan, nan)
504 , ("York and New", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
505 , ("and New York", 1, 0.0, nan, nan, 0.0, nan, nan, nan, nan, nan)
506 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, nan, nan, nan)
513 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
514 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
515 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
516 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
517 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
518 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
519 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
526 [("example0", 2, example0, checks0)
527 ,("example1", 2, example1, [])
528 ,("example2", 3, example2, checks2)
529 ,("example3", 2, example3, [])
530 ,("example4", 4, example4, [])
531 ,("example5", 5, example5, [])
533 (\(name, n, ex, checks) -> do
534 P.putStrLn $ name <> " " <> show n
535 b <- testEleve False n ex checks
536 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"