2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
48 import Control.Monad (forM_)
50 import qualified Data.List as L
52 import Data.Text (Text)
53 import qualified Data.Text as T
55 import Data.Maybe (fromMaybe)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
67 -- ^ TODO: only used for debugging
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
81 type ModEntropy i o e = (e -> e) -> i -> o
83 set_autonomy :: ModEntropy e (I e) e
84 set_autonomy f e = I e (f e)
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
89 data Token = NonTerminal Text
91 deriving (Ord, Eq, Show)
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
97 parseToken :: Text -> Token
98 parseToken "<start>" = Terminal Start
99 parseToken "<stop>" = Terminal Stop
100 parseToken t = NonTerminal t
102 toToken :: [Text] -> [Token]
103 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
105 printToken :: Token -> Text
108 f (NonTerminal x) = x
109 f (Terminal Start) = "<start>"
110 f (Terminal Stop) = "<stop>"
112 ------------------------------------------------------------------------
115 = Node { _node_count :: Int
117 , _node_children :: Map k (Trie k e)
119 | Leaf { _node_count :: Int }
124 insertTries :: Ord k => [[k]] -> Trie k ()
125 insertTries = L.foldr insertTrie emptyTrie
127 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
128 insertTrie [] n = n { _node_count = _node_count n +1}
129 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
130 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
132 f = Just . insertTrie xs . fromMaybe emptyTrie
134 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
135 -- emptyTrie = Node 0 mempty mempty
136 emptyTrie :: Trie k e
139 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
141 | Map.null children = Leaf c
142 | otherwise = Node c mempty children
144 -----------------------------
146 -- | Trie to Tree since Tree as nice print function
147 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
148 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
149 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
151 ------------------------------------------------------------------------
152 ------------------------------------------------------------------------
154 nan :: Floating e => e
157 noNaNs :: P.RealFloat e => [e] -> [e]
158 noNaNs = filter (not . P.isNaN)
160 updateIfDefined :: P.RealFloat e => e -> e -> e
161 updateIfDefined e0 e | P.isNaN e = e0
164 subst :: Entropy e => (e, e) -> e -> e
165 subst (src, dst) x | sim src x = dst
168 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
169 entropyTrie _ (Leaf c) = Leaf c
170 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
172 e = sum $ map f $ Map.toList children
173 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
174 else - chc * P.logBase 2 chc
176 chc = fromIntegral (_node_count child) / fromIntegral c
177 ------------------------------------------------------------------------
179 normalizeLevel :: Entropy e => [e] -> e -> e
180 normalizeLevel = checkDiff (go . noNaNs)
183 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
185 -- go [] = panic "normalizeLevel: impossible"
186 -- trace "normalizeLevel"
188 go es = \e -> (e - m) / v
191 then trace ("normalizeLevel " <> show (e,m,v,es))
201 nodeChildren :: Trie k e -> Map k (Trie k e)
202 nodeChildren (Node _ _ cs) = cs
203 nodeChildren (Leaf _) = Map.empty
207 class IsTrie trie where
208 buildTrie :: Entropy e => [[Token]] -> trie Token e
209 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
210 nodeChild :: Ord k => k -> trie k e -> trie k e
211 findTrie :: Ord k => [k] -> trie k e -> trie k e
212 normalizeEntropy :: Entropy e
213 => Getting e i e -> ModEntropy i o e
214 -> trie k i -> trie k o
217 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
218 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
220 instance IsTrie Trie where
221 buildTrie = entropyTrie isTerminal . insertTries
223 nodeEntropy inE (Node _ e _) = e ^. inE
224 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
227 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
228 nodeChild _ (Leaf _) = emptyTrie
230 findTrie ks t = L.foldl (flip nodeChild) t ks
232 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
234 go _ [] _ = panic "normalizeEntropy' empty levels"
235 go _ _ (Leaf c) = Leaf c
236 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
237 go f (es : ess) (Node c i children)
238 -- | any (sim (i ^. inE)) es
239 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
241 -- = panic "NOT an elem"
245 This is only normalizing a node with respect to its brothers (unlike all the
246 nodes of the same level).
248 normalizeEntropy inE modE = go $ modE identity
250 go _ (Leaf c) = Leaf c
251 go f (Node c i children)
252 | Map.null children =
253 panic "normalizeEntropy: impossible"
255 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
257 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
259 ------------------------------------------------------------------------
261 levels :: Trie k e -> [[Trie k e]]
262 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
264 subForest :: Trie k e -> [Trie k e]
265 subForest (Leaf _) = []
266 subForest (Node _ _ children) = Map.elems children
268 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
269 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
271 ------------------------------------------------------------------------
273 data Tries k e = Tries
278 instance IsTrie Tries where
279 buildTrie tts = Tries { _fwd = buildTrie tts
280 , _bwd = buildTrie (reverse <$> tts)
283 nodeEntropy inE (Tries fwd bwd) =
284 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
286 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
288 -- TODO: here this is tempting to reverse but this is not always what we
289 -- want. See also nodeAutonomy.
291 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
293 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
295 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
296 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
298 ------------------------------------------------------------------------
299 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
301 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
302 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
305 consRev xs xss = reverse xs : xss
307 go _ pref [] = [reverse pref]
308 go _ pref (Terminal Stop:_) = [reverse pref]
309 go t pref (Terminal Start:xs) = go t pref xs
311 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
313 then go xt (x:pref) xs
314 else consRev pref $ go xt0 [x] xs
319 -- ^ entropy of the current prefix
323 -- ^ entropy of the current prefix plus x
324 acc = ext > et + ext0
325 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
327 ne d t = if P.isNaN e then d else e
328 where e = nodeEntropy inE t
331 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
333 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
336 ------------------------------------------------------------------------
337 ------------------------------------------------------------------------
339 mainEleve :: Int -> [[Text]] -> [[[Text]]]
342 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
344 inp = toToken <$> input
345 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
348 sim :: Entropy e => e -> e -> Bool
349 sim x y = x == y || (P.isNaN x && P.isNaN y)
351 chunkAlongEleve :: Int -> [a] -> [[a]]
352 chunkAlongEleve n xs = L.take n <$> L.tails xs
354 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
355 testEleve debug n output checks = do
358 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
361 , cs <- chunkAlong m 1 <$> inp
366 --res = map (map printToken) . split identity fwd <$> inp
367 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
368 res = map (map printToken) . split info_autonomy nt <$> inp
370 P.putStrLn (show input)
371 -- forM_ pss (P.putStrLn . show)
374 forM_ (entropyLevels identity (_fwd t)) $ \level ->
375 P.putStrLn $ " " <> show level
377 P.putStrLn "Forward:"
380 P.putStrLn "Backward:"
383 P.putStrLn "Splitting:"
384 P.putStrLn $ show res
386 pure $ expected == res
389 out = T.words <$> output
390 expected = fmap (T.splitOn "-") <$> out
391 input = (T.splitOn "-" =<<) <$> out
392 inp = toToken <$> input
393 t = buildTrie $ L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> inp
394 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
395 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
396 nt = normalizeEntropy identity set_autonomy t
400 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
401 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
403 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
404 let ns = parseToken <$> T.words ngram
406 P.putStrLn $ " " <> T.unpack ngram <> ":"
407 check (==) "count" count (_node_count (_fwd t'))
408 check sim "entropy" entropy (nodeEntropy info_entropy t')
409 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
410 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
411 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
414 P.putStrLn . Tree.drawTree
416 . toTree (NonTerminal "")
418 -- | TODO real data is a list of tokenized sentences
419 example0, example1, example2, example3, example4, example5, example6 :: [Text]
420 example0 = ["New-York is New-York and New-York"]
421 example1 = ["to-be or not to-be"]
422 example2 = ["to-be-or not to-be-or NOT to-be and"]
423 example3 = example0 <> example0
424 -- > TEST: Should not have York New in the trie
425 example4 = ["a-b-c-d e a-b-c-d f"]
426 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
427 example6 = ["le-petit chat"
433 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
436 [("<start>", 1, nan, nan, nan, nan, 0.0)
437 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
438 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
439 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
440 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
441 ,("<stop>", 0, nan, nan, nan, 0.0, nan)
443 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
444 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
445 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
446 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
447 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
448 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
449 ,("York <stop>", 1, nan, nan, nan, nan, nan)
451 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
452 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
453 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
454 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
455 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
456 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
457 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
458 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
464 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
465 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
466 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
467 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
468 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
469 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
470 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
477 [("example0", 2, example0, checks0)
478 ,("example1", 2, example1, [])
479 ,("example2", 3, example2, checks2)
480 ,("example3", 2, example3, [])
481 ,("example4", 4, example4, [])
482 ,("example5", 5, example5, [])
484 (\(name, n, ex, checks) -> do
485 P.putStrLn $ name <> " " <> show n
486 b <- testEleve False n ex checks
487 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"