2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed)
48 import Control.Monad (forM_)
50 import qualified Data.List as L
52 import Data.Text (Text)
53 import qualified Data.Text as T
55 import Data.Maybe (fromMaybe)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
67 -- ^ TODO: only used for debugging
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
81 type ModEntropy i o e = (e -> e) -> i -> o
83 set_autonomy :: ModEntropy e (I e) e
84 set_autonomy f e = I e (f e)
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
89 data Token = NonTerminal Text
91 deriving (Ord, Eq, Show)
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
97 parseToken :: Text -> Token
98 parseToken "<start>" = Terminal Start
99 parseToken "<stop>" = Terminal Stop
100 parseToken t = NonTerminal t
103 toToken :: [Text] -> [Token]
104 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
106 printToken :: Token -> Text
109 f (NonTerminal x) = x
110 f (Terminal Start) = "<start>"
111 f (Terminal Stop) = "<stop>"
113 ------------------------------------------------------------------------
116 = Node { _node_count :: Int
118 , _node_children :: Map k (Trie k e)
120 | Leaf { _node_count :: Int }
125 insertTries :: Ord k => [[k]] -> Trie k ()
126 insertTries = L.foldr insertTrie emptyTrie
128 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
129 insertTrie [] n = n { _node_count = _node_count n +1}
130 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
131 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
133 f = Just . insertTrie xs . fromMaybe emptyTrie
135 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
136 -- emptyTrie = Node 0 mempty mempty
137 emptyTrie :: Trie k e
140 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
142 | Map.null children = Leaf c
143 | otherwise = Node c mempty children
145 -----------------------------
147 -- | Trie to Tree since Tree as nice print function
148 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
149 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
150 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
152 ------------------------------------------------------------------------
153 ------------------------------------------------------------------------
155 nan :: Floating e => e
158 noNaNs :: P.RealFloat e => [e] -> [e]
159 noNaNs = filter (not . P.isNaN)
161 updateIfDefined :: P.RealFloat e => e -> e -> e
162 updateIfDefined e0 e | P.isNaN e = e0
165 subst :: Entropy e => (e, e) -> e -> e
166 subst (src, dst) x | sim src x = dst
169 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
170 entropyTrie _ (Leaf c) = Leaf c
171 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
173 e = sum $ map f $ Map.toList children
174 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
175 else - chc * P.logBase 2 chc
177 chc = fromIntegral (_node_count child) / fromIntegral c
178 ------------------------------------------------------------------------
180 normalizeLevel :: Entropy e => [e] -> e -> e
181 normalizeLevel = checkDiff (go . noNaNs)
184 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
186 -- go [] = panic "normalizeLevel: impossible"
187 -- trace "normalizeLevel"
189 go es = \e -> (e - m) / v
192 then trace ("normalizeLevel " <> show (e,m,v,es))
202 nodeChildren :: Trie k e -> Map k (Trie k e)
203 nodeChildren (Node _ _ cs) = cs
204 nodeChildren (Leaf _) = Map.empty
209 class IsTrie trie where
210 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
211 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
212 nodeChild :: Ord k => k -> trie k e -> trie k e
213 findTrie :: Ord k => [k] -> trie k e -> trie k e
214 normalizeEntropy :: Entropy e
215 => Getting e i e -> ModEntropy i o e
216 -> trie k i -> trie k o
219 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
220 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
222 instance IsTrie Trie where
223 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
225 nodeEntropy inE (Node _ e _) = e ^. inE
226 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
229 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
230 nodeChild _ (Leaf _) = emptyTrie
232 findTrie ks t = L.foldl (flip nodeChild) t ks
234 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
236 go _ [] _ = panic "normalizeEntropy' empty levels"
237 go _ _ (Leaf c) = Leaf c
238 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
239 go f (es : ess) (Node c i children)
240 -- | any (sim (i ^. inE)) es
241 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
243 -- = panic "NOT an elem"
247 This is only normalizing a node with respect to its brothers (unlike all the
248 nodes of the same level).
250 normalizeEntropy inE modE = go $ modE identity
252 go _ (Leaf c) = Leaf c
253 go f (Node c i children)
254 | Map.null children =
255 panic "normalizeEntropy: impossible"
257 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
259 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
261 ------------------------------------------------------------------------
263 levels :: Trie k e -> [[Trie k e]]
264 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
266 subForest :: Trie k e -> [Trie k e]
267 subForest (Leaf _) = []
268 subForest (Node _ _ children) = Map.elems children
270 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
271 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
273 ------------------------------------------------------------------------
275 data Tries k e = Tries
280 instance IsTrie Tries where
281 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
282 , _bwd = buildTrie to n (map reverse $ tts)
285 nodeEntropy inE (Tries fwd bwd) =
286 -- VETODO reverse the query for bwd here
287 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
288 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
290 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
292 -- TODO: here this is tempting to reverse but this is not always what we
293 -- want. See also nodeAutonomy.
295 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
297 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
299 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
300 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
302 ------------------------------------------------------------------------
303 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
305 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
306 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
309 consRev xs xss = reverse xs : xss
311 go _ pref [] = [reverse pref]
312 go _ pref (Terminal Stop:_) = [reverse pref]
313 go t pref (Terminal Start:xs) = go t pref xs
315 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
317 then go xt (x:pref) xs
318 else consRev pref $ go xt0 [x] xs
323 -- ^ entropy of the current prefix
327 -- ^ entropy of the current prefix plus x
328 acc = ext > et + ext0
329 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
331 ne d t = if P.isNaN e then d else e
332 where e = nodeEntropy inE t
335 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
337 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
340 ------------------------------------------------------------------------
341 ------------------------------------------------------------------------
343 mainEleve :: Int -> [[Text]] -> [[[Text]]]
346 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
348 inp = toToken <$> input
349 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
352 sim :: Entropy e => e -> e -> Bool
353 sim x y = x == y || (P.isNaN x && P.isNaN y)
355 chunkAlongEleve :: Int -> [a] -> [[a]]
356 chunkAlongEleve n xs = L.take n <$> L.tails xs
358 toToken' :: Int -> [[Text]] -> [[Token]]
359 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
361 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
362 testEleve debug n output checks = do
365 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
368 , cs <- chunkAlong m 1 <$> inp
373 --res = map (map printToken) . split identity fwd <$> inp
374 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
375 res = map (map printToken) . split info_autonomy nt <$> inp
377 P.putStrLn (show input)
378 -- forM_ pss (P.putStrLn . show)
381 forM_ (entropyLevels identity (_fwd t)) $ \level ->
382 P.putStrLn $ " " <> show level
384 P.putStrLn "Forward:"
387 P.putStrLn "Backward:"
390 P.putStrLn "Splitting:"
391 P.putStrLn $ show res
393 pure $ expected == res
396 out = T.words <$> output
397 expected = fmap (T.splitOn "-") <$> out
398 input = (T.splitOn "-" =<<) <$> out
399 inp = toToken <$> input
400 t = buildTrie toToken' n input
401 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
402 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
403 nt = normalizeEntropy identity set_autonomy t
407 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
408 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
410 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
411 let ns = parseToken <$> T.words ngram
413 nsb = parseToken <$> (reverse $ T.words ngram)
414 tb' = findTrie nsb nt
415 -- TODO put this Variation Entropy at VETODO mark above
416 ev = (mean [(nodeEntropy info_entropy (_fwd t')), (nodeEntropy info_entropy (_bwd tb'))])
418 P.putStrLn $ " " <> T.unpack ngram <> ":"
419 check (==) "count" count (_node_count (_fwd t'))
420 check sim "entropy" entropy ev
421 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
422 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
423 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
426 P.putStrLn . Tree.drawTree
428 . toTree (NonTerminal "")
430 -- | TODO real data is a list of tokenized sentences
431 example0, example1, example2, example3, example4, example5, example6 :: [Text]
432 example0 = ["New-York is New-York and New-York"]
433 example1 = ["to-be or not to-be"]
434 example2 = ["to-be-or not to-be-or NOT to-be and"]
435 example3 = example0 <> example0
436 -- > TEST: Should not have York New in the trie
437 example4 = ["a-b-c-d e a-b-c-d f"]
438 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
439 example6 = ["le-petit chat"
445 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
448 [("<start>", 1, nan, nan, nan, nan, 0.0)
449 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
450 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
451 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
452 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
453 ,("<stop>", 0, nan, nan, nan, 0.0, nan)
456 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
457 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
458 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
459 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
460 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
461 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
462 ,("York <stop>", 1, nan, nan, nan, nan, nan)
464 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
465 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
466 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
467 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
468 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
469 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
470 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
471 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
478 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
479 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
480 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
481 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
482 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
483 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
484 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
491 [("example0", 2, example0, checks0)
492 ,("example1", 2, example1, [])
493 ,("example2", 3, example2, checks2)
494 ,("example3", 2, example3, [])
495 ,("example4", 4, example4, [])
496 ,("example5", 5, example5, [])
498 (\(name, n, ex, checks) -> do
499 P.putStrLn $ name <> " " <> show n
500 b <- testEleve False n ex checks
501 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"