2 Module : Gargantext.Core.Text.Terms.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26 - AD: Real ngrams extraction test
27 from Gargantext.Core.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ Gargantext.map _hyperdataDocument_abstract docs
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE TemplateHaskell #-}
36 {-# LANGUAGE TypeFamilies #-}
38 module Gargantext.Core.Text.Terms.Eleve where
40 -- import Debug.Trace (trace)
41 -- import Debug.SimpleReflect
43 import Control.Lens hiding (levels, children)
44 import Control.Monad (forM_)
45 import qualified Data.List as L
47 import Data.Text (Text)
48 import qualified Data.Text as T
50 import Data.Maybe (fromMaybe)
51 import qualified Data.Map as Map
52 import Gargantext.Prelude hiding (cs)
53 import qualified Data.Tree as Tree
54 import Data.Tree (Tree)
55 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
57 nan :: Floating e => e
60 noNaNs :: P.RealFloat e => [e] -> [e]
61 noNaNs = filter (not . P.isNaN)
63 updateIfDefined :: P.RealFloat e => e -> e -> e
64 updateIfDefined e0 e | P.isNaN e = e0
67 sim :: Entropy e => e -> e -> Bool
68 sim x y = x == y || (P.isNaN x && P.isNaN y)
70 subst :: Entropy e => (e, e) -> e -> e
71 subst (src, dst) x | sim src x = dst
73 ------------------------------------------------------------------------
75 -- | TODO: Show Instance only used for debugging
82 ------------------------------------------------------------------------
83 -- | Example and tests for development
86 , _info_entropy_var :: e
90 instance Show e => Show (I e) where
91 show (I e ev a) = show (e, ev, a)
95 type ModEntropy i o e = (e -> e) -> i -> o
97 set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
98 set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)
100 set_entropy_var :: Entropy e => Setter e (I e) e e
101 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
103 data StartStop = Start | Stop
104 deriving (Ord, Eq, Show)
106 data Token = NonTerminal Text
108 deriving (Ord, Eq, Show)
110 isTerminal :: Token -> Bool
111 isTerminal (Terminal _) = True
112 isTerminal (NonTerminal _) = False
114 nonTerminals :: [Token] -> [Text]
115 nonTerminals ts = [nt | NonTerminal nt <- ts]
117 parseToken :: Text -> Token
118 parseToken "<start>" = Terminal Start
119 parseToken "<stop>" = Terminal Stop
120 parseToken t = NonTerminal t
122 toToken :: [Text] -> [Token]
123 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
125 printToken :: Token -> Text
128 f (NonTerminal x) = x
129 f (Terminal Start) = "<start>"
130 f (Terminal Stop) = "<stop>"
131 ------------------------------------------------------------------------
134 = Node { _node_count :: Int
136 , _node_children :: Map k (Trie k e)
138 | Leaf { _node_count :: Int }
143 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
144 insertTrie [] n = n { _node_count = _node_count n +1}
145 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
146 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
148 f = Just . insertTrie xs . fromMaybe emptyTrie
150 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
151 -- emptyTrie = Node 0 mempty mempty
152 emptyTrie :: Trie k e
155 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
157 | Map.null children = Leaf c
158 | otherwise = Node c mempty children
160 -----------------------------
161 -- | Trie to Tree since Tree as nice print function
162 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
163 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
164 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
166 ------------------------------------------------------------------------
167 ------------------------------------------------------------------------
168 normalizeLevel :: Entropy e => e -> e -> e -> e
169 normalizeLevel m v e = (e - m) / v
173 nodeChildren :: Trie k e -> Map k (Trie k e)
174 nodeChildren (Node _ _ cs) = cs
175 nodeChildren (Leaf _) = Map.empty
179 chunkAlongEleve :: Int -> [a] -> [[a]]
180 chunkAlongEleve n xs = L.take n <$> L.tails xs
182 data Direction = Backward | Forward
184 buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
185 buildTrie d n sentences
186 = L.foldr insertTrie emptyTrie
188 $ ( filter (/= [Terminal (term d)])
189 . chunkAlongEleve (n + 1)
194 order Forward = identity
195 order Backward = reverse
197 term Backward = Start
199 class IsTrie trie where
200 entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e
201 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
202 nodeChild :: Ord k => k -> trie k e -> trie k e
203 findTrie :: Ord k => [k] -> trie k e -> trie k e
204 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
205 evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
206 normalizeEntropy :: Entropy e
207 => Getting e i e -> ModEntropy i o e
208 -> trie k i -> trie k o
210 instance IsTrie Trie where
212 entropyTrie _ (Leaf c) = Leaf c
213 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
215 children' = Map.toList children
216 sum_count = sum $ _node_count . snd <$> children'
217 e | sum_count == 0 = nan
218 | otherwise = sum $ f <$> children'
219 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
220 else - chc * P.logBase 2 chc
222 chc = fromIntegral (_node_count child) / fromIntegral c
224 nodeEntropy inE (Node _ e _) = e ^. inE
225 nodeEntropy _ (Leaf _) = nan
227 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
228 nodeChild _ (Leaf _) = emptyTrie
230 findTrie ks t = L.foldl (flip nodeChild) t ks
233 P.putStrLn . Tree.drawTree
235 $ toTree (NonTerminal "") t
236 P.putStrLn " Levels:"
237 forM_ (normalizationLevels inE t) $ \level ->
238 P.putStrLn $ " " <> show level
240 evTrie inE setEV = go nan
242 go _ (Leaf c) = Leaf c
243 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
249 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
251 go _ _ (Leaf c) = Leaf c
252 go _ [] _ = panic "normalizeEntropy' empty levels"
253 go f ((m, v, _) : ess) (Node c i children)
254 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
255 ------------------------------------------------------------------------
257 levels :: Trie k e -> [[Trie k e]]
258 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
260 subForest :: Trie k e -> [Trie k e]
261 subForest (Leaf _) = []
262 subForest (Node _ _ children) = Map.elems children
264 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
265 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
267 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
268 normalizationLevels inE = fmap f . entropyLevels inE
270 f es = (mean es, deviation es, length es)
272 ------------------------------------------------------------------------
274 data Tries k e = Tries
281 buildTries :: Int -> [[Token]] -> Tries Token ()
282 buildTries n sentences = Tries
283 { _fwd = buildTrie Forward n sentences
284 , _bwd = buildTrie Backward n sentences
287 instance IsTrie Tries where
289 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
291 findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
293 nodeChild = onTries . nodeChild
295 entropyTrie = onTries . entropyTrie
297 evTrie inE setEV = onTries $ evTrie inE setEV
299 normalizeEntropy inE = onTries . normalizeEntropy inE
301 printTrie inE (Tries f b) = do
302 P.putStrLn "Forward:"
305 P.putStrLn "Backward:"
308 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
309 onTries h (Tries f b) = Tries (h f) (h b)
311 ------------------------------------------------------------------------
312 mayCons :: [a] -> [[a]] -> [[a]]
314 mayCons xs xss = xs : xss
317 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
319 split inE t (Terminal Start:xs) = split inE t xs
320 split inE t (x0:xs0) = go [x0] xs0
323 go pref (Terminal Stop:_) = [pref]
324 go _ (Terminal Start:_) = panic "split impossible"
326 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
329 else mayCons pref $ go [x] xs
333 pxt = findTrie prefx t
336 -- ^ entropy of the current prefix
340 -- ^ entropy of the current prefix plus x
341 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
343 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
348 split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
350 split _ _ _ [t] = pure <$> nonTerminals [t]
351 split n inE t ts = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
353 pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
354 (L.tail . L.inits . take n $ ts)
358 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
360 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
363 ------------------------------------------------------------------------
365 mainEleve :: Int -> [[Text]] -> [[[Text]]]
366 mainEleve n x = mainEleve' n x x
368 mainEleve' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
369 mainEleve' n x y = mainEleveWith x' n y
371 x' = buildTries n (fmap toToken x)
372 -- (fmap toToken i) is computed twice, since mainEleveWith is computing it too
374 -- | This function should take the longest possible chain of:
375 -- mainEleve'' n x y = maxChainSizeOf [ mainEleve' n x y
376 -- , mainEleve' n x x
377 -- , mainEleve' n y y
379 mainEleve'' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
380 mainEleve'' = undefined
382 mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
383 mainEleveWith m n i = fmap (split n info_autonomy t) (fmap toToken i)
385 t :: Tries Token (I Double)
386 t = normalizeEntropy info_entropy_var set_autonomy
387 $ evTrie identity set_entropy_var
388 $ entropyTrie isTerminal m
390 ------------------------------------------------------------------------
392 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
394 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
395 testEleve debug n output checks = do
397 res = split (1 + n) info_autonomy nt <$> input
399 P.putStrLn . show $ (printToken <$>) <$> input
401 printTrie info_entropy nt
403 P.putStrLn "Splitting:"
404 P.putStrLn $ show res
406 pure $ expected == res
409 out = T.words <$> output
410 expected = fmap (T.splitOn "-") <$> out
411 input = toToken . (T.splitOn "-" =<<) <$> out
413 nt :: Tries Token (I Double)
414 nt = normalizeEntropy info_entropy_var set_autonomy
415 . evTrie identity set_entropy_var
416 . entropyTrie isTerminal
421 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
422 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
424 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
425 let ns = parseToken <$> T.words ngram
428 P.putStrLn $ " " <> T.unpack ngram <> ":"
429 check (==) "count" count (_node_count (_fwd nt'))
431 check sim "entropy" entropy (nodeEntropy info_entropy nt' )
432 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
433 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
435 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
436 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
437 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
439 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
440 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
441 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
443 -- | TODO real data is a list of tokenized sentences
444 example0, example1, example2, example3, example4, example5, example6, example7, example8, example9 :: [Text]
445 example0 = ["New-York is New-York and New-York"]
446 example1 = ["to-be or not to-be"]
447 example2 = ["to-be-or not to-be-or NOT to-be and"]
448 example3 = example0 <> example0
449 -- > TEST: Should not have York New in the trie
450 example4 = ["a-b-c-d e a-b-c-d f"]
451 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
452 example6 = ["le-petit chat"
457 example7 = ["a-b d", "a-c e", "a-c", "a-b", "a-b", "a-c", "a-c", "a-b"]
458 -- example8 = ["z f", "z", "z", "z"] <> example7
459 example8 = ["z", "z", "z", "z"] <> example7 <> example7 <> example7
460 example9 = (T.replace "z" "a") <$> example8
461 --example8 = ["a-b d", "a-c e", "a f", "a-c g", "a-b h", "a i", "a j", "a-b k", "a-c l", "a-c m", "a n", "a-b o"]
463 checks0, checks2, checks7, checks8, checks9 :: Checks Double
466 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
467 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
468 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
469 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
470 , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
471 , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
472 , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
473 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
474 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
475 , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
476 , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
477 , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
478 , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
479 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
480 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
481 , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
482 , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
483 , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
484 , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
485 , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
486 , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
487 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
492 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
493 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
494 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
495 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
496 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
497 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
498 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
503 [ ("a b", 4, 2, 1.5, 1.0106455960380136, 2, 1, 0.7302967433402215, 2, 2, 1.2909944487358056)
504 , ("a c", 4, 2, 1.5, 1.0106455960380136, 2, 1, 0.7302967433402215, 2, 2, 1.2909944487358056)
505 , ("a", 8, 2, -0.7139421727208477, 0.9315597394596105, 1, -1.7139421727208477, 0.1695158759052029, 3, 0.2860578272791523, 1.693603603014018)
509 [ ("a b", 4, 2, 1.5, 1.2384061243840367, 2, 1, 0.9190418024406298, 2, 2, 1.5577704463274435)
510 , ("a c", 4, 2, 1.5, 1.2384061243840367, 2, 1, 0.9190418024406298, 2, 2, 1.5577704463274435)
511 , ("a", 8, 2, -1.1151193576322829, 0.8012882295122719, 1, -2.115119357632283, 1.1025957503820932e-2, 3, -0.11511935763228287, 1.5915505015207227)
512 , ("z", 4, 2, -1.1151193576322829, 0.9576679529201777, 2, -1.1151193576322829, 1.0906240295212841, 2, -1.1151193576322829, 0.8247118763190712)
516 [ ("a b", 4, 2, 0.8741854163060885, 0.9234576822288185, 2, -0.25162916738782304, 0.2891449181301934, 2, 2, 1.5577704463274435)
517 , ("a c", 4, 2, 0.8741854163060885, 0.9234576822288185, 2, -0.25162916738782304, 0.2891449181301934, 2, 2, 1.5577704463274435)
518 , ("a", 12, 2.91829583405449, 3.763498724462999e-2, 1.518835832034022, 2.251629167387823, -0.6290316794220367, 1.2162041043595873, 3.5849625007211565, 0.7043016539112967, 1.8214675597084569)
521 runTestsEleve :: Bool -> IO ()
522 runTestsEleve doChecks =
524 [("example0", 3, example0, checks0)
525 ,("example0", 2, example0, [])
526 ,("example1", 2, example1, [])
527 ,("example2", 3, example2, checks2)
528 ,("example3", 2, example3, [])
529 ,("example4", 4, example4, [])
530 ,("example5", 5, example5, [])
531 ,("example6", 2, example6, [])
532 ,("example7", 2, example7, checks7)
533 ,("example8", 2, example8, checks8)
534 ,("example9", 2, example9, checks9)
536 (\(name, n, ex, checks) -> do
537 P.putStrLn $ name <> " " <> show n
538 b <- testEleve False n ex (if doChecks then checks else [])
539 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"