2 Module : Gargantext.Text.Terms.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation: the case for Mandarin Chinese, Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26 - AD: Real ngrams extraction test
27 from Gargantext.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ Gargantext.map _hyperdataDocument_abstract docs
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE NoImplicitPrelude #-}
36 {-# LANGUAGE OverloadedStrings #-}
37 {-# LANGUAGE RankNTypes #-}
38 {-# LANGUAGE TemplateHaskell #-}
39 {-# LANGUAGE TypeFamilies #-}
41 module Gargantext.Text.Terms.Eleve where
43 -- import Debug.Trace (trace)
44 -- import Debug.SimpleReflect
46 import Control.Lens hiding (levels, children)
47 import Control.Monad (forM_)
49 import qualified Data.List as L
51 import Data.Text (Text)
52 import qualified Data.Text as T
54 import Data.Maybe (fromMaybe)
55 import qualified Data.Map as Map
56 import Gargantext.Prelude hiding (cs)
57 import qualified Data.Tree as Tree
58 import Data.Tree (Tree)
59 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
61 nan :: Floating e => e
-- | Drop every NaN from a list, keeping the other values in their
-- original order.
noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs es = [e | e <- es, not (P.isNaN e)]
67 updateIfDefined :: P.RealFloat e => e -> e -> e
68 updateIfDefined e0 e | P.isNaN e = e0
-- | Similarity on entropy values: holds when both are equal, and also
-- when both are NaN.
sim :: Entropy e => e -> e -> Bool
sim x y
  | x == y = True
  | otherwise = P.isNaN x && P.isNaN y
74 subst :: Entropy e => (e, e) -> e -> e
75 subst (src, dst) x | sim src x = dst
77 ------------------------------------------------------------------------
84 -- ^ TODO: only used for debugging
86 ------------------------------------------------------------------------
87 -- | Example and tests for development
90 , _info_entropy_var :: e
-- | An 'I' is shown as the plain triple of its three fields, in
-- constructor order.
instance Show e => Show (I e) where
  show (I x y z) = show (x, y, z)
-- | A function that, given a transformation of entropy values, maps an
-- @input@ annotation to an @output@ annotation.
type ModEntropy input output entropy =
  (entropy -> entropy) -> input -> output
-- | Store in 'info_autonomy' the result of applying @fe@ to the current
-- 'info_entropy_var' of the annotation.
set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
set_autonomy fe i = i & info_autonomy .~ autonomy
  where
    autonomy = fe (i ^. info_entropy_var)
-- | Setter turning a bare entropy value @e@ into an 'I': @e@ becomes the
-- first field, the value produced by the modifier the second, and the
-- third is left as 'nan'.
set_entropy_var :: Entropy e => Setter e (I e) e e
set_entropy_var f e = fmap wrap (f e)
  where
    wrap ev = I e ev nan
-- | Sentence boundary markers; 'Start' sorts before 'Stop'.
data StartStop
  = Start
  | Stop
  deriving (Ord, Eq, Show)
110 data Token = NonTerminal Text
112 deriving (Ord, Eq, Show)
-- | 'True' for a 'Terminal' token, 'False' for a 'NonTerminal' one.
isTerminal :: Token -> Bool
isTerminal tok = case tok of
  Terminal _ -> True
  NonTerminal _ -> False
-- | Keep only the text carried by 'NonTerminal' tokens, in order; every
-- other token is dropped.
nonTerminals :: [Token] -> [Text]
nonTerminals = L.concatMap payload
  where
    payload (NonTerminal nt) = [nt]
    payload _ = []
-- | Read one token: the literals @\<start\>@ and @\<stop\>@ become the
-- matching 'Terminal', anything else is kept as a 'NonTerminal'.
parseToken :: Text -> Token
parseToken t = case t of
  "<start>" -> Terminal Start
  "<stop>" -> Terminal Stop
  _ -> NonTerminal t
-- | Turn the words of a sentence into tokens, enclosed between a
-- 'Start' and a 'Stop' terminal.
toToken :: [Text] -> [Token]
toToken ws = [Terminal Start] <> map NonTerminal ws <> [Terminal Stop]
129 printToken :: Token -> Text
132 f (NonTerminal x) = x
133 f (Terminal Start) = "<start>"
134 f (Terminal Stop) = "<stop>"
135 ------------------------------------------------------------------------
138 = Node { _node_count :: Int
140 , _node_children :: Map k (Trie k e)
142 | Leaf { _node_count :: Int }
147 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
148 insertTrie [] n = n { _node_count = _node_count n +1}
149 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
150 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
152 f = Just . insertTrie xs . fromMaybe emptyTrie
154 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
155 -- emptyTrie = Node 0 mempty mempty
156 emptyTrie :: Trie k e
159 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
161 | Map.null children = Leaf c
162 | otherwise = Node c mempty children
164 -----------------------------
-- | Convert a 'Trie' into a 'Tree', since 'Tree' has a nice print function.
-- Each tree node carries the key leading to it, its count and, for inner
-- nodes only, its annotation.
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree key trie = case trie of
  Leaf count -> Tree.Node (key, count, Nothing) []
  Node count annot kids ->
    Tree.Node (key, count, Just annot) [toTree k t | (k, t) <- Map.toList kids]
170 ------------------------------------------------------------------------
171 ------------------------------------------------------------------------
-- | Centre @e@ on @m@ and scale it by @v@: @(e - m) / v@.
normalizeLevel :: Entropy e => e -> e -> e -> e
normalizeLevel m v e = centred / v
  where
    centred = e - m
-- | Children of a trie node; a 'Leaf' has none.
nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren trie = case trie of
  Node _ _ kids -> kids
  Leaf _ -> Map.empty
-- | For every suffix of @xs@ (including the empty one), its first @n@
-- elements.
chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = [L.take n suffix | suffix <- L.tails xs]
-- | Reading direction selected when building a trie.
data Direction
  = Backward
  | Forward
188 buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
189 buildTrie d n sentences
190 = L.foldr insertTrie emptyTrie
192 $ ( filter (/= [Terminal (term d)])
193 . chunkAlongEleve (n + 1)
198 order Forward = identity
199 order Backward = reverse
201 term Backward = Start
-- | Operations shared by the single-direction 'Trie' and the
-- two-direction 'Tries'.
class IsTrie trie where
  -- | Annotate a count-only trie with an entropy value on its nodes; the
  -- predicate selects the keys that are treated specially.
  entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e

  -- | Entropy of the top node, read from its annotation through the
  -- given getter.
  nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e

  -- | Sub-trie reached by following a single key.
  nodeChild :: Ord k => k -> trie k e -> trie k e

  -- | Sub-trie reached by following a whole path of keys.
  findTrie :: Ord k => [k] -> trie k e -> trie k e

  -- | Print the trie on standard output.
  printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()

  -- | Rewrite every annotation through the setter, using the entropy read
  -- through the getter.
  -- NOTE(review): the value written is decided by each instance; confirm
  -- the exact formula there.
  evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o

  -- | Rewrite every annotation with the given 'ModEntropy', using the
  -- entropy read through the getter as the basis for normalization.
  normalizeEntropy
    :: Entropy e
    => Getting e i e -> ModEntropy i o e
    -> trie k i -> trie k o
214 instance IsTrie Trie where
216 entropyTrie _ (Leaf c) = Leaf c
217 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
219 children' = Map.toList children
220 sum_count = sum $ _node_count . snd <$> children'
221 e | sum_count == 0 = nan
222 | otherwise = sum $ f <$> children'
223 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
224 else - chc * P.logBase 2 chc
226 chc = fromIntegral (_node_count child) / fromIntegral c
228 nodeEntropy inE (Node _ e _) = e ^. inE
229 nodeEntropy _ (Leaf _) = nan
231 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
232 nodeChild _ (Leaf _) = emptyTrie
234 findTrie ks t = L.foldl (flip nodeChild) t ks
237 P.putStrLn . Tree.drawTree
239 $ toTree (NonTerminal "") t
240 P.putStrLn " Levels:"
241 forM_ (normalizationLevels inE t) $ \level ->
242 P.putStrLn $ " " <> show level
244 evTrie inE setEV = go nan
246 go _ (Leaf c) = Leaf c
247 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
253 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
255 go _ _ (Leaf c) = Leaf c
256 go _ [] _ = panic "normalizeEntropy' empty levels"
257 go f ((m, v, _) : ess) (Node c i children)
258 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
259 ------------------------------------------------------------------------
-- | Nodes of a trie grouped by depth: the first group holds the root
-- alone, each following group the children of the previous one. The
-- result stops before the first empty group.
levels :: Trie k e -> [[Trie k e]]
levels root = L.takeWhile (not . L.null) (L.iterate nextLevel [root])
  where
    nextLevel = L.concatMap subForest
-- | Direct children of a node, as a list; empty for a 'Leaf'.
subForest :: Trie k e -> [Trie k e]
subForest trie = case trie of
  Leaf _ -> []
  Node _ _ kids -> Map.elems kids
-- | For every depth below the root, the entropies (read through @inE@)
-- of the nodes at that depth, with NaNs removed.
entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE trie =
  [ noNaNs [nodeEntropy inE node | node <- level]
  | level <- L.tail (levels trie)
  ]
271 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
272 normalizationLevels inE = fmap f . entropyLevels inE
274 f es = (mean es, deviation es, length es)
276 ------------------------------------------------------------------------
278 data Tries k e = Tries
285 buildTries :: Int -> [[Token]] -> Tries Token ()
286 buildTries n sentences = Tries
287 { _fwd = buildTrie Forward n sentences
288 , _bwd = buildTrie Backward n sentences
291 instance IsTrie Tries where
293 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
295 findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
297 nodeChild = onTries . nodeChild
299 entropyTrie = onTries . entropyTrie
301 evTrie inE setEV = onTries $ evTrie inE setEV
303 normalizeEntropy inE = onTries . normalizeEntropy inE
305 printTrie inE (Tries f b) = do
306 P.putStrLn "Forward:"
309 P.putStrLn "Backward:"
-- | Apply the same 'Trie' transformation to both components of a 'Tries'.
onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries h tries = case tries of
  Tries forward backward -> Tries (h forward) (h backward)
315 ------------------------------------------------------------------------
316 mayCons :: [a] -> [[a]] -> [[a]]
318 mayCons xs xss = xs : xss
321 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
323 split inE t (Terminal Start:xs) = split inE t xs
324 split inE t (x0:xs0) = go [x0] xs0
327 go pref (Terminal Stop:_) = [pref]
328 go _ (Terminal Start:_) = panic "split impossible"
330 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
333 else mayCons pref $ go [x] xs
337 pxt = findTrie prefx t
340 -- ^ entropy of the current prefix
344 -- ^ entropy of the current prefix plus x
345 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
347 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
352 split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
354 split _ _ _ [t] = pure <$> nonTerminals [t]
355 split n inE t ts = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
357 pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
358 (L.tail . L.inits . take n $ ts)
362 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
364 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
367 ------------------------------------------------------------------------
-- | Run 'mainEleve'' with the same sentences passed as both of its
-- corpus arguments.
mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n corpus = mainEleve' n corpus corpus
372 mainEleve' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
373 mainEleve' n x y = mainEleveWith x' n y
375 x' = buildTries n (fmap toToken x)
376 -- (fmap toToken i) is computed twice, since mainEleveWith is computing it too
378 -- | This function should take the longest possible chain of:
379 -- mainEleve'' n x y = maxChainSizeOf [ mainEleve' n x y
380 -- , mainEleve' n x x
381 -- , mainEleve' n y y
-- | Not implemented yet: meant to pick the longest chain among several
-- 'mainEleve'' segmentations. Using it aborts with an explicit message
-- instead of an anonymous @undefined@.
mainEleve'' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
mainEleve'' = panic "Gargantext.Text.Terms.Eleve.mainEleve'': not implemented"
386 mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
387 mainEleveWith m n i = fmap (split n info_autonomy t) (fmap toToken i)
389 t :: Tries Token (I Double)
390 t = normalizeEntropy info_entropy_var set_autonomy
391 $ evTrie identity set_entropy_var
392 $ entropyTrie isTerminal m
394 ------------------------------------------------------------------------
-- | Expected values for a list of n-grams. Each entry holds, in order:
-- the n-gram (space-separated tokens), its count, then the entropy,
-- entropy variation and autonomy of the combined tries, of the forward
-- trie and of the backward trie.
type Checks e =
  [ ( Text
    , Int
    , e, e, e
    , e, e, e
    , e, e, e
    )
  ]
398 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
399 testEleve debug n output checks = do
401 res = split (1 + n) info_autonomy nt <$> input
403 P.putStrLn . show $ (printToken <$>) <$> input
405 printTrie info_entropy nt
407 P.putStrLn "Splitting:"
408 P.putStrLn $ show res
410 pure $ expected == res
413 out = T.words <$> output
414 expected = fmap (T.splitOn "-") <$> out
415 input = toToken . (T.splitOn "-" =<<) <$> out
417 nt :: Tries Token (I Double)
418 nt = normalizeEntropy info_entropy_var set_autonomy
419 . evTrie identity set_entropy_var
420 . entropyTrie isTerminal
425 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
426 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
428 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
429 let ns = parseToken <$> T.words ngram
432 P.putStrLn $ " " <> T.unpack ngram <> ":"
433 check (==) "count" count (_node_count (_fwd nt'))
435 check sim "entropy" entropy (nodeEntropy info_entropy nt' )
436 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
437 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
439 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
440 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
441 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
443 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
444 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
445 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
447 -- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4
  , example5, example6, example7, example8, example9 :: [Text]
-- Words are separated by spaces; a dash joins the tokens of one
-- expected term.
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
-- The sentence of 'example0', twice.
example3 = L.concat [example0, example0]
-- > TEST: Should not have York New in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
456 example6 = ["le-petit chat"
example7 = ["a-b d", "a-c e", "a-c", "a-b", "a-b", "a-c", "a-c", "a-b"]
-- example8 = ["z f", "z", "z", "z"] <> example7
-- Four "z" sentences followed by three copies of 'example7'.
example8 = L.replicate 4 "z" <> L.concat (L.replicate 3 example7)
-- 'example8' with every "z" rewritten to "a".
example9 = map (T.replace "z" "a") example8
--example8 = ["a-b d", "a-c e", "a f", "a-c g", "a-b h", "a i", "a j", "a-b k", "a-c l", "a-c m", "a n", "a-b o"]
467 checks0, checks2, checks7, checks8, checks9 :: Checks Double
470 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
471 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
472 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
473 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
474 , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
475 , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
476 , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
477 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
478 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
479 , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
480 , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
481 , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
482 , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
483 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
484 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
485 , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
486 , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
487 , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
488 , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
489 , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
490 , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
491 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
496 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
497 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
498 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
499 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
500 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
501 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
502 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
507 [ ("a b", 4, 2, 1.5, 1.0106455960380136, 2, 1, 0.7302967433402215, 2, 2, 1.2909944487358056)
508 , ("a c", 4, 2, 1.5, 1.0106455960380136, 2, 1, 0.7302967433402215, 2, 2, 1.2909944487358056)
509 , ("a", 8, 2, -0.7139421727208477, 0.9315597394596105, 1, -1.7139421727208477, 0.1695158759052029, 3, 0.2860578272791523, 1.693603603014018)
513 [ ("a b", 4, 2, 1.5, 1.2384061243840367, 2, 1, 0.9190418024406298, 2, 2, 1.5577704463274435)
514 , ("a c", 4, 2, 1.5, 1.2384061243840367, 2, 1, 0.9190418024406298, 2, 2, 1.5577704463274435)
515 , ("a", 8, 2, -1.1151193576322829, 0.8012882295122719, 1, -2.115119357632283, 1.1025957503820932e-2, 3, -0.11511935763228287, 1.5915505015207227)
516 , ("z", 4, 2, -1.1151193576322829, 0.9576679529201777, 2, -1.1151193576322829, 1.0906240295212841, 2, -1.1151193576322829, 0.8247118763190712)
520 [ ("a b", 4, 2, 0.8741854163060885, 0.9234576822288185, 2, -0.25162916738782304, 0.2891449181301934, 2, 2, 1.5577704463274435)
521 , ("a c", 4, 2, 0.8741854163060885, 0.9234576822288185, 2, -0.25162916738782304, 0.2891449181301934, 2, 2, 1.5577704463274435)
522 , ("a", 12, 2.91829583405449, 3.763498724462999e-2, 1.518835832034022, 2.251629167387823, -0.6290316794220367, 1.2162041043595873, 3.5849625007211565, 0.7043016539112967, 1.8214675597084569)
525 runTestsEleve :: Bool -> IO ()
526 runTestsEleve doChecks =
528 [("example0", 3, example0, checks0)
529 ,("example0", 2, example0, [])
530 ,("example1", 2, example1, [])
531 ,("example2", 3, example2, checks2)
532 ,("example3", 2, example3, [])
533 ,("example4", 4, example4, [])
534 ,("example5", 5, example5, [])
535 ,("example6", 2, example6, [])
536 ,("example7", 2, example7, checks7)
537 ,("example8", 2, example8, checks8)
538 ,("example9", 2, example9, checks9)
540 (\(name, n, ex, checks) -> do
541 P.putStrLn $ name <> " " <> show n
542 b <- testEleve False n ex (if doChecks then checks else [])
543 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"