2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Data.Functor.Reverse
48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed)
49 import Control.Monad (forM_)
51 import qualified Data.List as L
53 import Data.Text (Text)
54 import qualified Data.Text as T
56 import Data.Maybe (fromMaybe)
57 import qualified Data.Map as Map
58 import Gargantext.Prelude hiding (cs)
59 import qualified Data.Tree as Tree
60 import Data.Tree (Tree)
61 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
68 -- ^ TODO: only used for debugging
70 ------------------------------------------------------------------------
71 -- | Example and tests for development
77 instance Show e => Show (I e) where
78 show (I e n) = show (e, n)
82 type ModEntropy i o e = (e -> e) -> i -> o
84 set_autonomy :: ModEntropy e (I e) e
85 set_autonomy f e = I e (f e)
87 data StartStop = Start | Stop
88 deriving (Ord, Eq, Show)
90 data Token = NonTerminal Text
92 deriving (Ord, Eq, Show)
94 isTerminal :: Token -> Bool
95 isTerminal (Terminal _) = True
96 isTerminal (NonTerminal _) = False
98 parseToken :: Text -> Token
99 parseToken "<start>" = Terminal Start
100 parseToken "<stop>" = Terminal Stop
101 parseToken t = NonTerminal t
104 toToken :: [Text] -> [Token]
105 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
107 printToken :: Token -> Text
110 f (NonTerminal x) = x
111 f (Terminal Start) = "<start>"
112 f (Terminal Stop) = "<stop>"
114 ------------------------------------------------------------------------
117 = Node { _node_count :: Int
119 , _node_children :: Map k (Trie k e)
121 | Leaf { _node_count :: Int }
126 insertTries :: Ord k => [[k]] -> Trie k ()
127 insertTries = L.foldr insertTrie emptyTrie
129 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
130 insertTrie [] n = n { _node_count = _node_count n +1}
131 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
132 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
134 f = Just . insertTrie xs . fromMaybe emptyTrie
136 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
137 -- emptyTrie = Node 0 mempty mempty
138 emptyTrie :: Trie k e
141 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
143 | Map.null children = Leaf c
144 | otherwise = Node c mempty children
146 -----------------------------
148 -- | Trie to Tree since Tree as nice print function
149 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
150 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
151 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
153 ------------------------------------------------------------------------
154 ------------------------------------------------------------------------
156 nan :: Floating e => e
159 noNaNs :: P.RealFloat e => [e] -> [e]
160 noNaNs = filter (not . P.isNaN)
162 updateIfDefined :: P.RealFloat e => e -> e -> e
163 updateIfDefined e0 e | P.isNaN e = e0
166 subst :: Entropy e => (e, e) -> e -> e
167 subst (src, dst) x | sim src x = dst
170 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
171 entropyTrie _ (Leaf c) = Leaf c
172 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
174 e = sum $ map f $ Map.toList children
175 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
176 else - chc * P.logBase 2 chc
178 chc = fromIntegral (_node_count child) / fromIntegral c
179 ------------------------------------------------------------------------
181 normalizeLevel :: Entropy e => [e] -> e -> e
182 normalizeLevel = checkDiff (go . noNaNs)
185 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
187 -- go [] = panic "normalizeLevel: impossible"
188 -- trace "normalizeLevel"
190 go es = \e -> (e - m) / v
193 then trace ("normalizeLevel " <> show (e,m,v,es))
203 nodeChildren :: Trie k e -> Map k (Trie k e)
204 nodeChildren (Node _ _ cs) = cs
205 nodeChildren (Leaf _) = Map.empty
210 class IsTrie trie where
211 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
212 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
213 nodeChild :: Ord k => k -> trie k e -> trie k e
214 findTrie :: Ord k => [k] -> trie k e -> trie k e
215 normalizeEntropy :: Entropy e
216 => Getting e i e -> ModEntropy i o e
217 -> trie k i -> trie k o
220 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
221 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
223 instance IsTrie Trie where
224 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
226 nodeEntropy inE (Node _ e _) = e ^. inE
227 nodeEntropy _ (Leaf _) = nan
229 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
230 nodeChild _ (Leaf _) = emptyTrie
232 findTrie ks t = L.foldl (flip nodeChild) t ks
234 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
236 go _ [] _ = panic "normalizeEntropy' empty levels"
237 go _ _ (Leaf c) = Leaf c
238 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
239 go f (es : ess) (Node c i children)
240 -- | any (sim (i ^. inE)) es
241 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
243 -- = panic "NOT an elem"
247 This is only normalizing a node with respect to its brothers (unlike all the
248 nodes of the same level).
250 normalizeEntropy inE modE = go $ modE identity
252 go _ (Leaf c) = Leaf c
253 go f (Node c i children)
254 | Map.null children =
255 panic "normalizeEntropy: impossible"
257 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
259 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
261 ------------------------------------------------------------------------
263 levels :: Trie k e -> [[Trie k e]]
264 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
266 subForest :: Trie k e -> [Trie k e]
267 subForest (Leaf _) = []
268 subForest (Node _ _ children) = Map.elems children
270 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
271 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
273 ------------------------------------------------------------------------
275 data Tries k e = Tries
280 instance IsTrie Tries where
281 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
282 , _bwd = buildTrie to n (map reverse $ tts)
285 nodeEntropy inE (Tries fwd bwd) =
286 -- VETODO reverse the query for bwd here
287 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
288 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
290 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
292 -- TODO: here this is tempting to reverse but this is not always what we
293 -- want. See also nodeAutonomy.
294 -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
295 -- since recursivity of the function makes the reverse multiple times (I guess)
297 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
299 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
301 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
302 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
304 ------------------------------------------------------------------------
305 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
307 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
308 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
311 consRev xs xss = reverse xs : xss
313 go _ pref [] = [reverse pref]
314 go _ pref (Terminal Stop:_) = [reverse pref]
315 go t pref (Terminal Start:xs) = go t pref xs
317 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
319 then go xt (x:pref) xs
320 else consRev pref $ go xt0 [x] xs
325 -- ^ entropy of the current prefix
329 -- ^ entropy of the current prefix plus x
330 acc = ext > et + ext0
331 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
333 ne d t = if P.isNaN e then d else e
334 where e = nodeEntropy inE t
337 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
339 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
342 ------------------------------------------------------------------------
343 ------------------------------------------------------------------------
345 mainEleve :: Int -> [[Text]] -> [[[Text]]]
348 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
350 inp = toToken <$> input
351 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
354 sim :: Entropy e => e -> e -> Bool
355 sim x y = x == y || (P.isNaN x && P.isNaN y)
357 chunkAlongEleve :: Int -> [a] -> [[a]]
358 chunkAlongEleve n xs = L.take n <$> L.tails xs
360 toToken' :: Int -> [[Text]] -> [[Token]]
361 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
363 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
364 testEleve debug n output checks = do
367 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
370 , cs <- chunkAlong m 1 <$> inp
375 --res = map (map printToken) . split identity fwd <$> inp
376 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
377 res = map (map printToken) . split info_autonomy nt <$> inp
379 P.putStrLn (show input)
380 -- forM_ pss (P.putStrLn . show)
383 forM_ (entropyLevels identity (_fwd t)) $ \level ->
384 P.putStrLn $ " " <> show level
386 P.putStrLn "Forward:"
389 P.putStrLn "Backward:"
392 P.putStrLn "Splitting:"
393 P.putStrLn $ show res
395 pure $ expected == res
398 out = T.words <$> output
399 expected = fmap (T.splitOn "-") <$> out
400 input = (T.splitOn "-" =<<) <$> out
401 inp = toToken <$> input
402 t = buildTrie toToken' n input
403 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
404 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
405 nt = normalizeEntropy identity set_autonomy t
409 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
410 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
412 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
413 let ns = parseToken <$> T.words ngram
414 nsb = parseToken <$> (reverse $ T.words ngram)
417 tb' = findTrie nsb nt
418 -- TODO put this Variation Entropy at VETODO mark above maybe in nodeEntropy ?
419 ev = (mean [(nodeEntropy info_entropy (_fwd t')), (nodeEntropy info_entropy (_bwd tb'))])
421 P.putStrLn $ " " <> T.unpack ngram <> ":"
422 check (==) "count" count (_node_count (_fwd t'))
423 check sim "entropy" entropy ev
424 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
425 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
426 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
429 P.putStrLn . Tree.drawTree
431 . toTree (NonTerminal "")
433 -- | TODO real data is a list of tokenized sentences
434 example0, example1, example2, example3, example4, example5, example6 :: [Text]
435 example0 = ["New-York is New-York and New-York"]
436 example1 = ["to-be or not to-be"]
437 example2 = ["to-be-or not to-be-or NOT to-be and"]
438 example3 = example0 <> example0
439 -- > TEST: Should not have York New in the trie
440 example4 = ["a-b-c-d e a-b-c-d f"]
441 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
442 example6 = ["le-petit chat"
448 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
451 [("<start>", 1, nan, nan, nan, nan, 0.0)
452 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
453 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
454 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
455 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
456 ,("<stop>", 0, nan, nan, nan, 0.0, nan)
459 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
460 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
461 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
462 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
463 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
464 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
465 ,("York <stop>", 1, nan, nan, nan, nan, nan)
467 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
468 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
469 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
470 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
471 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
472 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
473 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
474 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
481 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
482 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
483 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
484 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
485 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
486 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
487 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
494 [("example0", 2, example0, checks0)
495 ,("example1", 2, example1, [])
496 ,("example2", 3, example2, checks2)
497 ,("example3", 2, example3, [])
498 ,("example4", 4, example4, [])
499 ,("example5", 5, example5, [])
501 (\(name, n, ex, checks) -> do
502 P.putStrLn $ name <> " " <> show n
503 b <- testEleve False n ex checks
504 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"