2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26 - AD: Real ngrams extraction test
27 from Gargantext.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ Gargantext.map _hyperdataDocument_abstract docs
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE NoImplicitPrelude #-}
36 {-# LANGUAGE OverloadedStrings #-}
37 {-# LANGUAGE RankNTypes #-}
38 {-# LANGUAGE TemplateHaskell #-}
39 {-# LANGUAGE TypeFamilies #-}
41 module Gargantext.Text.Eleve where
43 -- import Debug.Trace (trace)
44 -- import Debug.SimpleReflect
46 import Control.Lens hiding (levels, children)
47 import Control.Monad (forM_)
49 import qualified Data.List as L
51 import Data.Text (Text)
52 import qualified Data.Text as T
54 import Data.Maybe (fromMaybe)
55 import qualified Data.Map as Map
56 import Gargantext.Prelude hiding (cs)
57 import qualified Data.Tree as Tree
58 import Data.Tree (Tree)
59 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
61 nan :: Floating e => e
64 noNaNs :: P.RealFloat e => [e] -> [e]
65 noNaNs = filter (not . P.isNaN)
67 updateIfDefined :: P.RealFloat e => e -> e -> e
68 updateIfDefined e0 e | P.isNaN e = e0
71 sim :: Entropy e => e -> e -> Bool
72 sim x y = x == y || (P.isNaN x && P.isNaN y)
74 subst :: Entropy e => (e, e) -> e -> e
75 subst (src, dst) x | sim src x = dst
77 ------------------------------------------------------------------------
84 -- ^ TODO: only used for debugging
86 ------------------------------------------------------------------------
87 -- | Example and tests for development
90 , _info_entropy_var :: e
94 instance Show e => Show (I e) where
95 show (I e ev a) = show (e, ev, a)
99 type ModEntropy i o e = (e -> e) -> i -> o
101 set_autonomy :: Entropy e => ModEntropy e (I e) e
102 set_autonomy f e = I e nan (f e)
104 set_entropy_var :: Entropy e => Setter e (I e) e e
105 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
107 data StartStop = Start | Stop
108 deriving (Ord, Eq, Show)
110 data Token = NonTerminal Text
112 deriving (Ord, Eq, Show)
114 isTerminal :: Token -> Bool
115 isTerminal (Terminal _) = True
116 isTerminal (NonTerminal _) = False
118 parseToken :: Text -> Token
119 parseToken "<start>" = Terminal Start
120 parseToken "<stop>" = Terminal Stop
121 parseToken t = NonTerminal t
123 toToken :: [Text] -> [Token]
124 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
126 printToken :: Token -> Text
129 f (NonTerminal x) = x
130 f (Terminal Start) = "<start>"
131 f (Terminal Stop) = "<stop>"
132 ------------------------------------------------------------------------
135 = Node { _node_count :: Int
137 , _node_children :: Map k (Trie k e)
139 | Leaf { _node_count :: Int }
144 insertTries :: Ord k => [[k]] -> Trie k ()
145 insertTries = L.foldr insertTrie emptyTrie
147 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
148 insertTrie [] n = n { _node_count = _node_count n +1}
149 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
150 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
152 f = Just . insertTrie xs . fromMaybe emptyTrie
154 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
155 -- emptyTrie = Node 0 mempty mempty
156 emptyTrie :: Trie k e
159 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
161 | Map.null children = Leaf c
162 | otherwise = Node c mempty children
164 -----------------------------
165 -- | Trie to Tree since Tree as nice print function
166 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
167 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
168 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
170 ------------------------------------------------------------------------
171 ------------------------------------------------------------------------
172 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
173 entropyTrie _ (Leaf c) = Leaf c
174 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
176 children' = Map.toList children
177 sum_count = sum $ _node_count . snd <$> children'
178 e | sum_count == 0 = nan
179 | otherwise = sum $ f <$> children'
180 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
181 else - chc * P.logBase 2 chc
183 chc = fromIntegral (_node_count child) / fromIntegral c
184 ------------------------------------------------------------------------
185 normalizeLevel :: Entropy e => e -> e -> e -> e
186 normalizeLevel m v e = (e - m) / v
190 nodeChildren :: Trie k e -> Map k (Trie k e)
191 nodeChildren (Node _ _ cs) = cs
192 nodeChildren (Leaf _) = Map.empty
197 class IsTrie trie where
198 buildTrie :: Entropy e => [[Token]] -> trie Token e
199 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
200 nodeChild :: Ord k => k -> trie k e -> trie k e
201 findTrie :: Ord k => [k] -> trie k e -> trie k e
202 findTrieR :: Ord k => [k] -> trie k e -> trie k e
203 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
204 evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
205 normalizeEntropy :: Entropy e
206 => Getting e i e -> ModEntropy i o e
207 -> trie k i -> trie k o
210 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
211 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
213 instance IsTrie Trie where
214 buildTrie ts = entropyTrie isTerminal $ insertTries ts
216 nodeEntropy inE (Node _ e _) = e ^. inE
217 nodeEntropy _ (Leaf _) = nan
219 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
220 nodeChild _ (Leaf _) = emptyTrie
222 findTrie ks t = L.foldl (flip nodeChild) t ks
226 P.putStrLn . Tree.drawTree
228 $ toTree (NonTerminal "") t
229 P.putStrLn " Levels:"
230 forM_ (normalizationLevels inE t) $ \level ->
231 P.putStrLn $ " " <> show level
233 evTrie inE setEV = go nan
235 go _ (Leaf c) = Leaf c
236 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
242 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
244 go _ _ (Leaf c) = Leaf c
245 go _ [] _ = panic "normalizeEntropy' empty levels"
246 go f ((m, v, _) : ess) (Node c i children)
247 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
248 ------------------------------------------------------------------------
250 levels :: Trie k e -> [[Trie k e]]
251 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
253 subForest :: Trie k e -> [Trie k e]
254 subForest (Leaf _) = []
255 subForest (Node _ _ children) = Map.elems children
257 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
258 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
260 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
261 normalizationLevels inE = fmap f . entropyLevels inE
263 f es = (mean es, deviation es, length es)
265 ------------------------------------------------------------------------
267 data Tries k e = Tries
274 nodeEntropySafe :: Entropy e => Getting e i e -> Tries k i -> e
275 nodeEntropySafe inE (Tries f b) =
276 mean $ noNaNs [nodeEntropy inE f, nodeEntropy inE b]
278 nodeEntropyBwdOpt :: Entropy e => Getting e i e -> Tries k i -> e
279 nodeEntropyBwdOpt inE (Tries f b) =
280 mean $ nodeEntropy inE f : noNaNs [nodeEntropy inE b]
282 instance IsTrie Tries where
283 buildTrie tts = Tries { _fwd = buildTrie tts
284 , _bwd = buildTrie (reverse <$> tts)
287 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
289 findTrie ks = onTries (findTrie ks)
290 findTrieR ks (Tries f b) = Tries (findTrieR ks f) (findTrieR (reverse ks) b)
292 nodeChild = onTries . nodeChild
294 evTrie inE setEV = onTries $ evTrie inE setEV
296 normalizeEntropy inE = onTries . normalizeEntropy inE
298 printTrie inE (Tries f b) = do
299 P.putStrLn "Forward:"
302 P.putStrLn "Backward:"
305 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
306 onTries h (Tries f b) = Tries (h f) (h b)
308 ------------------------------------------------------------------------
309 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
311 split inE t (Terminal Start:xs) = split inE t xs
312 split inE t (x0:xs0) = go [x0] xs0
315 mayCons xs xss = xs : xss
318 go pref (Terminal Stop:_) = [pref]
319 go _ (Terminal Start:_) = panic "split impossible"
321 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
324 else mayCons pref $ go [x] xs
327 pt = findTrieR pref t
328 pxt = findTrieR prefx t
331 -- ^ entropy of the current prefix
335 -- ^ entropy of the current prefix plus x
336 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > ept + ext)
338 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
343 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
345 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
348 ------------------------------------------------------------------------
349 ------------------------------------------------------------------------
351 mainEleve :: Int -> [[Text]] -> [[[Text]]]
354 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
356 inp = toToken <$> input
357 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
360 chunkAlongEleve :: Int -> [a] -> [[a]]
361 chunkAlongEleve n xs = L.take n <$> L.tails xs
363 data Order = Backward | Forward
365 toToken' :: Order -> Int -> [[Text]] -> [[Token]]
366 toToken' o n input = L.concat
367 $ ( filter (/= [Terminal (term o)])
368 . chunkAlongEleve (n + 1)
374 order Forward = identity
375 order Backward = reverse
377 term Backward = Start
380 ---------------------------------------------
382 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
384 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
385 testEleve debug n output checks = do
388 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
391 , cs <- chunkAlong m 1 <$> inp
396 res = map (map printToken) . split info_autonomy nt <$> inp
398 P.putStrLn (show input)
399 -- forM_ pss (P.putStrLn . show)
401 printTrie info_entropy nt
403 -- P.putStrLn "Entropy Var:"
404 -- printTrie identity t''
406 P.putStrLn "Splitting:"
407 P.putStrLn $ show res
409 pure $ expected == res
412 out = T.words <$> output
413 expected = fmap (T.splitOn "-") <$> out
414 input = (T.splitOn "-" =<<) <$> out
415 inp = toToken <$> input
417 t :: Tries Token Double
418 t = -- buildTrie (toToken' n input)
419 Tries { _fwd = buildTrie (toToken' Forward n input)
420 , _bwd = buildTrie (toToken' Backward n input)
423 evt :: Tries Token (I Double)
424 evt = evTrie identity set_entropy_var t
426 nt :: Tries Token (I Double)
427 nt = normalizeEntropy info_entropy_var (\fe i -> i & info_autonomy .~ fe (i ^. info_entropy_var)) evt
429 -- t'' :: Trie Token Double
430 -- t'' = set_entropy_vars info_autonomy (\e _i -> e) nt
432 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
433 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
437 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
438 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
440 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
441 let ns = parseToken <$> T.words ngram
442 nt' = findTrieR ns nt
444 P.putStrLn $ " " <> T.unpack ngram <> ":"
445 check (==) "count" count (_node_count (_fwd nt'))
447 check sim "entropy" entropy (nodeEntropy info_entropy nt' )
448 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
449 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
451 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
452 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
453 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
455 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
456 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
457 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
459 -- | TODO real data is a list of tokenized sentences
460 example0, example1, example2, example3, example4, example5, example6 :: [Text]
461 example0 = ["New-York is New-York and New-York"]
462 example1 = ["to-be or not to-be"]
463 example2 = ["to-be-or not to-be-or NOT to-be and"]
464 example3 = example0 <> example0
465 -- > TEST: Should not have York New in the trie
466 example4 = ["a-b-c-d e a-b-c-d f"]
467 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
468 example6 = ["le-petit chat"
474 checks0, checks2 :: Checks Double
477 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
478 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
479 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
480 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
481 , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
482 , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
483 , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
484 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
485 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
486 , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
487 , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
488 , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
489 , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
490 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
491 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
492 , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
493 , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
494 , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
495 , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
496 , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
497 , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
498 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
503 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
504 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
505 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
506 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
507 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
508 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
509 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
516 [("example0", 2, example0, checks0)
517 ,("example1", 2, example1, [])
518 ,("example2", 3, example2, checks2)
519 ,("example3", 2, example3, [])
520 ,("example4", 4, example4, [])
521 ,("example5", 5, example5, [])
522 ,("example6", 2, example6, [])
524 (\(name, n, ex, checks) -> do
525 P.putStrLn $ name <> " " <> show n
526 b <- testEleve False n ex checks
527 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"