2 Module : Gargantext.Core.Text.Terms.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26 - AD: Real ngrams extraction test
27 from Gargantext.Core.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ Gargantext.map _hyperdataDocument_abstract docs
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE StandaloneDeriving #-}
36 {-# LANGUAGE TemplateHaskell #-}
37 {-# LANGUAGE TypeFamilies #-}
39 module Gargantext.Core.Text.Terms.Eleve where
41 -- import Debug.Trace (trace)
42 -- import Debug.SimpleReflect
44 import Control.Lens hiding (levels, children)
45 import Control.Monad (forM_)
46 import qualified Data.List as L
48 import Data.Text (Text)
49 import qualified Data.Text as T
50 import Data.Map.Strict (Map)
51 import Data.Maybe (fromMaybe)
52 import qualified Data.Map.Strict as Map
53 import Gargantext.Prelude hiding (cs)
54 import qualified Data.Tree as Tree
55 import Data.Tree (Tree)
56 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
58 nan :: Floating e => e
61 noNaNs :: P.RealFloat e => [e] -> [e]
62 noNaNs = filter (not . P.isNaN)
64 updateIfDefined :: P.RealFloat e => e -> e -> e
65 updateIfDefined e0 e | P.isNaN e = e0
68 sim :: Entropy e => e -> e -> Bool
69 sim x y = x == y || (P.isNaN x && P.isNaN y)
71 subst :: Entropy e => (e, e) -> e -> e
72 subst (src, dst) x | sim src x = dst
74 ------------------------------------------------------------------------
76 -- | TODO: Show Instance only used for debugging
83 ------------------------------------------------------------------------
84 -- | Example and tests for development
87 , _info_entropy_var :: e
91 instance Show e => Show (I e) where
92 show (I e ev a) = show (e, ev, a)
96 type ModEntropy i o e = (e -> e) -> i -> o
98 set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
99 set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)
101 set_entropy_var :: Entropy e => Setter e (I e) e e
102 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
104 data StartStop = Start | Stop
105 deriving (Ord, Eq, Show)
107 data Token = NonTerminal Text
109 deriving (Ord, Eq, Show)
111 isTerminal :: Token -> Bool
112 isTerminal (Terminal _) = True
113 isTerminal (NonTerminal _) = False
115 nonTerminals :: [Token] -> [Text]
116 nonTerminals ts = [nt | NonTerminal nt <- ts]
118 parseToken :: Text -> Token
119 parseToken "<start>" = Terminal Start
120 parseToken "<stop>" = Terminal Stop
121 parseToken t = NonTerminal t
123 toToken :: [Text] -> [Token]
124 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
126 printToken :: Token -> Text
129 f (NonTerminal x) = x
130 f (Terminal Start) = "<start>"
131 f (Terminal Stop) = "<stop>"
132 ------------------------------------------------------------------------
135 = Node { _node_count :: Int
137 , _node_children :: Map k (Trie k e)
139 | Leaf { _node_count :: Int }
144 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
145 insertTrie [] n = n { _node_count = _node_count n +1}
146 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
147 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
149 f = Just . insertTrie xs . fromMaybe emptyTrie
151 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
152 -- emptyTrie = Node 0 mempty mempty
153 emptyTrie :: Trie k e
156 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
158 | Map.null children = Leaf c
159 | otherwise = Node c mempty children
161 -----------------------------
162 -- | Trie to Tree since Tree as nice print function
163 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
164 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
165 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
167 ------------------------------------------------------------------------
168 ------------------------------------------------------------------------
169 normalizeLevel :: Entropy e => e -> e -> e -> e
170 normalizeLevel m v e = (e - m) / v
174 nodeChildren :: Trie k e -> Map k (Trie k e)
175 nodeChildren (Node _ _ cs) = cs
176 nodeChildren (Leaf _) = Map.empty
180 chunkAlongEleve :: Int -> [a] -> [[a]]
181 chunkAlongEleve n xs = L.take n <$> L.tails xs
183 data Direction = Backward | Forward
185 buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
186 buildTrie d n sentences
187 = L.foldr insertTrie emptyTrie
189 $ ( filter (/= [Terminal (term d)])
190 . chunkAlongEleve (n + 1)
195 order Forward = identity
196 order Backward = reverse
198 term Backward = Start
200 class IsTrie trie where
201 entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e
202 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
203 nodeChild :: Ord k => k -> trie k e -> trie k e
204 findTrie :: Ord k => [k] -> trie k e -> trie k e
205 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
206 evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
207 normalizeEntropy :: Entropy e
208 => Getting e i e -> ModEntropy i o e
209 -> trie k i -> trie k o
211 instance IsTrie Trie where
213 entropyTrie _ (Leaf c) = Leaf c
214 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
216 children' = Map.toList children
217 sum_count = sum $ _node_count . snd <$> children'
218 e | sum_count == 0 = nan
219 | otherwise = sum $ f <$> children'
220 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
221 else - chc * P.logBase 2 chc
223 chc = fromIntegral (_node_count child) / fromIntegral c
225 nodeEntropy inE (Node _ e _) = e ^. inE
226 nodeEntropy _ (Leaf _) = nan
228 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
229 nodeChild _ (Leaf _) = emptyTrie
231 findTrie ks t = L.foldl (flip nodeChild) t ks
234 P.putStrLn . Tree.drawTree
236 $ toTree (NonTerminal "") t
237 P.putStrLn " Levels:"
238 forM_ (normalizationLevels inE t) $ \level ->
239 P.putStrLn $ " " <> show level
241 evTrie inE setEV = go nan
243 go _ (Leaf c) = Leaf c
244 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
250 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
252 go _ _ (Leaf c) = Leaf c
253 go _ [] _ = panic "normalizeEntropy' empty levels"
254 go f ((m, v, _) : ess) (Node c i children)
255 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
256 ------------------------------------------------------------------------
258 levels :: Trie k e -> [[Trie k e]]
259 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
261 subForest :: Trie k e -> [Trie k e]
262 subForest (Leaf _) = []
263 subForest (Node _ _ children) = Map.elems children
265 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
266 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
268 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
269 normalizationLevels inE = fmap f . entropyLevels inE
271 f es = (mean es, deviation es, length es)
273 ------------------------------------------------------------------------
275 data Tries k e = Tries
282 deriving instance (Show k, Show e) => Show (Tries k e)
284 buildTries :: Int -> [[Token]] -> Tries Token ()
285 buildTries n sentences = Tries
286 { _fwd = buildTrie Forward n sentences
287 , _bwd = buildTrie Backward n sentences
290 instance IsTrie Tries where
292 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
294 findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
296 nodeChild = onTries . nodeChild
298 entropyTrie = onTries . entropyTrie
300 evTrie inE setEV = onTries $ evTrie inE setEV
302 normalizeEntropy inE = onTries . normalizeEntropy inE
304 printTrie inE (Tries f b) = do
305 P.putStrLn "Forward:"
308 P.putStrLn "Backward:"
311 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
312 onTries h (Tries f b) = Tries (h f) (h b)
314 ------------------------------------------------------------------------
315 mayCons :: [a] -> [[a]] -> [[a]]
317 mayCons xs xss = xs : xss
320 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
322 split inE t (Terminal Start:xs) = split inE t xs
323 split inE t (x0:xs0) = go [x0] xs0
326 go pref (Terminal Stop:_) = [pref]
327 go _ (Terminal Start:_) = panic "split impossible"
329 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
332 else mayCons pref $ go [x] xs
336 pxt = findTrie prefx t
339 -- ^ entropy of the current prefix
343 -- ^ entropy of the current prefix plus x
344 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
346 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
351 split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
353 split _ _ _ [t] = pure <$> nonTerminals [t]
354 split n inE t ts = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
356 pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
357 (L.tail . L.inits . take n $ ts)
361 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
363 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
366 ------------------------------------------------------------------------
368 mainEleve :: Int -> [[Text]] -> [[[Text]]]
369 mainEleve n x = mainEleve' n x x
371 mainEleve' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
372 mainEleve' n x y = mainEleveWith x' n y
374 x' = buildTries n (fmap toToken x)
375 -- (fmap toToken i) is computed twice, since mainEleveWith is computing it too
377 -- | This function should take the longest possible chain of:
378 -- mainEleve'' n x y = maxChainSizeOf [ mainEleve' n x y
379 -- , mainEleve' n x x
380 -- , mainEleve' n y y
382 mainEleve'' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
383 mainEleve'' = undefined
385 mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
386 mainEleveWith m n i = fmap (split n info_autonomy t) (fmap toToken i)
388 t :: Tries Token (I Double)
389 t = normalizeEntropy info_entropy_var set_autonomy
390 $ evTrie identity set_entropy_var
391 $ entropyTrie isTerminal m
393 ------------------------------------------------------------------------
395 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
397 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
398 testEleve debug n output checks = do
400 res = split (1 + n) info_autonomy nt <$> input
402 P.putStrLn . show $ (printToken <$>) <$> input
404 printTrie info_entropy nt
406 P.putStrLn "Splitting:"
407 P.putStrLn $ show res
409 pure $ expected == res
412 out = T.words <$> output
413 expected = fmap (T.splitOn "-") <$> out
414 input = toToken . (T.splitOn "-" =<<) <$> out
416 nt :: Tries Token (I Double)
417 nt = normalizeEntropy info_entropy_var set_autonomy
418 . evTrie identity set_entropy_var
419 . entropyTrie isTerminal
424 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
425 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
427 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
428 let ns = parseToken <$> T.words ngram
431 P.putStrLn $ " " <> T.unpack ngram <> ":"
432 check (==) "count" count (_node_count (_fwd nt'))
434 check sim "entropy" entropy (nodeEntropy info_entropy nt' )
435 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
436 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
438 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
439 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
440 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
442 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
443 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
444 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
446 -- | TODO real data is a list of tokenized sentences
447 example0, example1, example2, example3, example4, example5, example6, example7, example8, example9 :: [Text]
448 example0 = ["New-York is New-York and New-York"]
449 example1 = ["to-be or not to-be"]
450 example2 = ["to-be-or not to-be-or NOT to-be and"]
451 example3 = example0 <> example0
452 -- > TEST: Should not have York New in the trie
453 example4 = ["a-b-c-d e a-b-c-d f"]
454 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
455 example6 = ["le-petit chat"
460 example7 = ["a-b d", "a-c e", "a-c", "a-b", "a-b", "a-c", "a-c", "a-b"]
461 -- example8 = ["z f", "z", "z", "z"] <> example7
462 example8 = ["z", "z", "z", "z"] <> example7 <> example7 <> example7
463 example9 = (T.replace "z" "a") <$> example8
464 --example8 = ["a-b d", "a-c e", "a f", "a-c g", "a-b h", "a i", "a j", "a-b k", "a-c l", "a-c m", "a n", "a-b o"]
466 checks0, checks2, checks7, checks8, checks9 :: Checks Double
469 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
470 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
471 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
472 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
473 , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
474 , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
475 , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
476 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
477 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
478 , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
479 , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
480 , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
481 , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
482 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
483 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
484 , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
485 , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
486 , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
487 , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
488 , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
489 , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
490 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
495 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
496 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
497 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
498 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
499 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
500 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
501 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
506 [ ("a b", 4, 2, 1.5, 1.0106455960380136, 2, 1, 0.7302967433402215, 2, 2, 1.2909944487358056)
507 , ("a c", 4, 2, 1.5, 1.0106455960380136, 2, 1, 0.7302967433402215, 2, 2, 1.2909944487358056)
508 , ("a", 8, 2, -0.7139421727208477, 0.9315597394596105, 1, -1.7139421727208477, 0.1695158759052029, 3, 0.2860578272791523, 1.693603603014018)
512 [ ("a b", 4, 2, 1.5, 1.2384061243840367, 2, 1, 0.9190418024406298, 2, 2, 1.5577704463274435)
513 , ("a c", 4, 2, 1.5, 1.2384061243840367, 2, 1, 0.9190418024406298, 2, 2, 1.5577704463274435)
514 , ("a", 8, 2, -1.1151193576322829, 0.8012882295122719, 1, -2.115119357632283, 1.1025957503820932e-2, 3, -0.11511935763228287, 1.5915505015207227)
515 , ("z", 4, 2, -1.1151193576322829, 0.9576679529201777, 2, -1.1151193576322829, 1.0906240295212841, 2, -1.1151193576322829, 0.8247118763190712)
519 [ ("a b", 4, 2, 0.8741854163060885, 0.9234576822288185, 2, -0.25162916738782304, 0.2891449181301934, 2, 2, 1.5577704463274435)
520 , ("a c", 4, 2, 0.8741854163060885, 0.9234576822288185, 2, -0.25162916738782304, 0.2891449181301934, 2, 2, 1.5577704463274435)
521 , ("a", 12, 2.91829583405449, 3.763498724462999e-2, 1.518835832034022, 2.251629167387823, -0.6290316794220367, 1.2162041043595873, 3.5849625007211565, 0.7043016539112967, 1.8214675597084569)
524 runTestsEleve :: Bool -> IO ()
525 runTestsEleve doChecks =
527 [("example0", 3, example0, checks0)
528 ,("example0", 2, example0, [])
529 ,("example1", 2, example1, [])
530 ,("example2", 3, example2, checks2)
531 ,("example3", 2, example3, [])
532 ,("example4", 4, example4, [])
533 ,("example5", 5, example5, [])
534 ,("example6", 2, example6, [])
535 ,("example7", 2, example7, checks7)
536 ,("example8", 2, example8, checks8)
537 ,("example9", 2, example9, checks9)
539 (\(name, n, ex, checks) -> do
540 P.putStrLn $ name <> " " <> show n
541 b <- testEleve False n ex (if doChecks then checks else [])
542 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"