2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
48 import Control.Monad (forM_)
50 import qualified Data.List as L
52 import Data.Text (Text)
53 import qualified Data.Text as T
55 import Data.Maybe (fromMaybe)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
67 -- ^ TODO: only used for debugging
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
81 type ModEntropy i o e = (e -> e) -> i -> o
83 set_autonomy :: ModEntropy e (I e) e
84 set_autonomy f e = I e (f e)
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
89 data Token = NonTerminal Text
91 deriving (Ord, Eq, Show)
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
97 parseToken :: Text -> Token
98 parseToken "<start>" = Terminal Start
99 parseToken "<stop>" = Terminal Stop
100 parseToken t = NonTerminal t
103 toToken :: [Text] -> [Token]
104 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
106 printToken :: Token -> Text
109 f (NonTerminal x) = x
110 f (Terminal Start) = "<start>"
111 f (Terminal Stop) = "<stop>"
113 ------------------------------------------------------------------------
116 = Node { _node_count :: Int
118 , _node_children :: Map k (Trie k e)
120 | Leaf { _node_count :: Int }
125 insertTries :: Ord k => [[k]] -> Trie k ()
126 insertTries = L.foldr insertTrie emptyTrie
128 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
129 insertTrie [] n = n { _node_count = _node_count n +1}
130 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
131 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
133 f = Just . insertTrie xs . fromMaybe emptyTrie
135 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
136 -- emptyTrie = Node 0 mempty mempty
137 emptyTrie :: Trie k e
140 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
142 | Map.null children = Leaf c
143 | otherwise = Node c mempty children
145 -----------------------------
147 -- | Trie to Tree since Tree as nice print function
148 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
149 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
150 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
152 ------------------------------------------------------------------------
153 ------------------------------------------------------------------------
155 nan :: Floating e => e
158 noNaNs :: P.RealFloat e => [e] -> [e]
159 noNaNs = filter (not . P.isNaN)
161 updateIfDefined :: P.RealFloat e => e -> e -> e
162 updateIfDefined e0 e | P.isNaN e = e0
165 subst :: Entropy e => (e, e) -> e -> e
166 subst (src, dst) x | sim src x = dst
169 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
170 entropyTrie _ (Leaf c) = Leaf c
171 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
173 e = sum $ map f $ Map.toList children
174 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
175 else - chc * P.logBase 2 chc
177 chc = fromIntegral (_node_count child) / fromIntegral c
178 ------------------------------------------------------------------------
180 normalizeLevel :: Entropy e => [e] -> e -> e
181 normalizeLevel = checkDiff (go . noNaNs)
184 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
186 -- go [] = panic "normalizeLevel: impossible"
187 -- trace "normalizeLevel"
189 go es = \e -> (e - m) / v
192 then trace ("normalizeLevel " <> show (e,m,v,es))
202 nodeChildren :: Trie k e -> Map k (Trie k e)
203 nodeChildren (Node _ _ cs) = cs
204 nodeChildren (Leaf _) = Map.empty
209 class IsTrie trie where
210 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
211 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
212 nodeChild :: Ord k => k -> trie k e -> trie k e
213 findTrie :: Ord k => [k] -> trie k e -> trie k e
214 normalizeEntropy :: Entropy e
215 => Getting e i e -> ModEntropy i o e
216 -> trie k i -> trie k o
219 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
220 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
222 instance IsTrie Trie where
223 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
225 nodeEntropy inE (Node _ e _) = e ^. inE
226 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
229 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
230 nodeChild _ (Leaf _) = emptyTrie
232 findTrie ks t = L.foldl (flip nodeChild) t ks
234 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
236 go _ [] _ = panic "normalizeEntropy' empty levels"
237 go _ _ (Leaf c) = Leaf c
238 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
239 go f (es : ess) (Node c i children)
240 -- | any (sim (i ^. inE)) es
241 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
243 -- = panic "NOT an elem"
247 This is only normalizing a node with respect to its brothers (unlike all the
248 nodes of the same level).
250 normalizeEntropy inE modE = go $ modE identity
252 go _ (Leaf c) = Leaf c
253 go f (Node c i children)
254 | Map.null children =
255 panic "normalizeEntropy: impossible"
257 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
259 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
261 ------------------------------------------------------------------------
263 levels :: Trie k e -> [[Trie k e]]
264 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
266 subForest :: Trie k e -> [Trie k e]
267 subForest (Leaf _) = []
268 subForest (Node _ _ children) = Map.elems children
270 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
271 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
273 ------------------------------------------------------------------------
275 data Tries k e = Tries
280 instance IsTrie Tries where
281 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
282 , _bwd = buildTrie to n (map reverse $ tts)
285 nodeEntropy inE (Tries fwd bwd) =
286 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
288 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
290 -- TODO: here this is tempting to reverse but this is not always what we
291 -- want. See also nodeAutonomy.
293 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
295 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
297 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
298 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
300 ------------------------------------------------------------------------
301 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
303 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
304 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
307 consRev xs xss = reverse xs : xss
309 go _ pref [] = [reverse pref]
310 go _ pref (Terminal Stop:_) = [reverse pref]
311 go t pref (Terminal Start:xs) = go t pref xs
313 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
315 then go xt (x:pref) xs
316 else consRev pref $ go xt0 [x] xs
321 -- ^ entropy of the current prefix
325 -- ^ entropy of the current prefix plus x
326 acc = ext > et + ext0
327 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
329 ne d t = if P.isNaN e then d else e
330 where e = nodeEntropy inE t
333 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
335 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
338 ------------------------------------------------------------------------
339 ------------------------------------------------------------------------
341 mainEleve :: Int -> [[Text]] -> [[[Text]]]
344 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
346 inp = toToken <$> input
347 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
350 sim :: Entropy e => e -> e -> Bool
351 sim x y = x == y || (P.isNaN x && P.isNaN y)
353 chunkAlongEleve :: Int -> [a] -> [[a]]
354 chunkAlongEleve n xs = L.take n <$> L.tails xs
356 toToken' :: Int -> [[Text]] -> [[Token]]
357 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
359 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
360 testEleve debug n output checks = do
363 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
366 , cs <- chunkAlong m 1 <$> inp
371 --res = map (map printToken) . split identity fwd <$> inp
372 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
373 res = map (map printToken) . split info_autonomy nt <$> inp
375 P.putStrLn (show input)
376 -- forM_ pss (P.putStrLn . show)
379 forM_ (entropyLevels identity (_fwd t)) $ \level ->
380 P.putStrLn $ " " <> show level
382 P.putStrLn "Forward:"
385 P.putStrLn "Backward:"
388 P.putStrLn "Splitting:"
389 P.putStrLn $ show res
391 pure $ expected == res
394 out = T.words <$> output
395 expected = fmap (T.splitOn "-") <$> out
396 input = (T.splitOn "-" =<<) <$> out
397 inp = toToken <$> input
398 t = buildTrie toToken' n input
399 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
400 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
401 nt = normalizeEntropy identity set_autonomy t
405 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
406 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
408 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
409 let ns = parseToken <$> T.words ngram
411 nsb = parseToken <$> (reverse $ T.words ngram)
412 tb' = findTrie nsb nt
414 P.putStrLn $ " " <> T.unpack ngram <> ":"
415 check (==) "count" count (_node_count (_fwd t'))
416 check sim "entropy" entropy (mean [(nodeEntropy info_entropy (_fwd t')), (nodeEntropy info_entropy (_bwd tb'))])
418 -- (nodeEntropy info_entropy t')
419 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
420 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
421 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
424 P.putStrLn . Tree.drawTree
426 . toTree (NonTerminal "")
428 -- | TODO real data is a list of tokenized sentences
429 example0, example1, example2, example3, example4, example5, example6 :: [Text]
430 example0 = ["New-York is New-York and New-York"]
431 example1 = ["to-be or not to-be"]
432 example2 = ["to-be-or not to-be-or NOT to-be and"]
433 example3 = example0 <> example0
434 -- > TEST: Should not have York New in the trie
435 example4 = ["a-b-c-d e a-b-c-d f"]
436 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
437 example6 = ["le-petit chat"
443 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
446 [("<start>", 1, nan, nan, nan, nan, 0.0)
447 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
448 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
449 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
450 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
451 ,("<stop>", 0, nan, nan, nan, 0.0, nan)
454 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
455 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
456 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
457 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
458 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
459 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
460 ,("York <stop>", 1, nan, nan, nan, nan, nan)
462 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
463 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
464 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
465 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
466 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
467 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
468 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
469 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
476 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
477 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
478 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
479 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
480 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
481 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
482 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
489 [("example0", 2, example0, checks0)
490 ,("example1", 2, example1, [])
491 ,("example2", 3, example2, checks2)
492 ,("example3", 2, example3, [])
493 ,("example4", 4, example4, [])
494 ,("example5", 5, example5, [])
496 (\(name, n, ex, checks) -> do
497 P.putStrLn $ name <> " " <> show n
498 b <- testEleve False n ex checks
499 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"