2 Module : Gargantext.Text.Terms.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26 - AD: Real ngrams extraction test
27 from Gargantext.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ Gargantext.map _hyperdataDocument_abstract docs
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE TemplateHaskell #-}
36 {-# LANGUAGE TypeFamilies #-}
38 module Gargantext.Text.Terms.Eleve where
40 -- import Debug.Trace (trace)
41 -- import Debug.SimpleReflect
43 import Control.Lens hiding (levels, children)
44 import Control.Monad (forM_)
46 import qualified Data.List as L
48 import Data.Text (Text)
49 import qualified Data.Text as T
51 import Data.Maybe (fromMaybe)
52 import qualified Data.Map as Map
53 import Gargantext.Prelude hiding (cs)
54 import qualified Data.Tree as Tree
55 import Data.Tree (Tree)
56 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
58 nan :: Floating e => e
61 noNaNs :: P.RealFloat e => [e] -> [e]
62 noNaNs = filter (not . P.isNaN)
64 updateIfDefined :: P.RealFloat e => e -> e -> e
65 updateIfDefined e0 e | P.isNaN e = e0
68 sim :: Entropy e => e -> e -> Bool
69 sim x y = x == y || (P.isNaN x && P.isNaN y)
71 subst :: Entropy e => (e, e) -> e -> e
72 subst (src, dst) x | sim src x = dst
74 ------------------------------------------------------------------------
76 -- | TODO: Show Instance only used for debugging
83 ------------------------------------------------------------------------
84 -- | Example and tests for development
87 , _info_entropy_var :: e
91 instance Show e => Show (I e) where
92 show (I e ev a) = show (e, ev, a)
96 type ModEntropy i o e = (e -> e) -> i -> o
98 set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
99 set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)
101 set_entropy_var :: Entropy e => Setter e (I e) e e
102 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
104 data StartStop = Start | Stop
105 deriving (Ord, Eq, Show)
107 data Token = NonTerminal Text
109 deriving (Ord, Eq, Show)
111 isTerminal :: Token -> Bool
112 isTerminal (Terminal _) = True
113 isTerminal (NonTerminal _) = False
115 nonTerminals :: [Token] -> [Text]
116 nonTerminals ts = [nt | NonTerminal nt <- ts]
118 parseToken :: Text -> Token
119 parseToken "<start>" = Terminal Start
120 parseToken "<stop>" = Terminal Stop
121 parseToken t = NonTerminal t
123 toToken :: [Text] -> [Token]
124 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
126 printToken :: Token -> Text
129 f (NonTerminal x) = x
130 f (Terminal Start) = "<start>"
131 f (Terminal Stop) = "<stop>"
132 ------------------------------------------------------------------------
135 = Node { _node_count :: Int
137 , _node_children :: Map k (Trie k e)
139 | Leaf { _node_count :: Int }
144 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
145 insertTrie [] n = n { _node_count = _node_count n +1}
146 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
147 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
149 f = Just . insertTrie xs . fromMaybe emptyTrie
151 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
152 -- emptyTrie = Node 0 mempty mempty
153 emptyTrie :: Trie k e
156 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
158 | Map.null children = Leaf c
159 | otherwise = Node c mempty children
161 -----------------------------
162 -- | Trie to Tree since Tree as nice print function
163 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
164 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
165 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
167 ------------------------------------------------------------------------
168 ------------------------------------------------------------------------
169 normalizeLevel :: Entropy e => e -> e -> e -> e
170 normalizeLevel m v e = (e - m) / v
174 nodeChildren :: Trie k e -> Map k (Trie k e)
175 nodeChildren (Node _ _ cs) = cs
176 nodeChildren (Leaf _) = Map.empty
180 chunkAlongEleve :: Int -> [a] -> [[a]]
181 chunkAlongEleve n xs = L.take n <$> L.tails xs
183 data Direction = Backward | Forward
185 buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
186 buildTrie d n sentences
187 = L.foldr insertTrie emptyTrie
189 $ ( filter (/= [Terminal (term d)])
190 . chunkAlongEleve (n + 1)
195 order Forward = identity
196 order Backward = reverse
198 term Backward = Start
200 class IsTrie trie where
201 entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e
202 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
203 nodeChild :: Ord k => k -> trie k e -> trie k e
204 findTrie :: Ord k => [k] -> trie k e -> trie k e
205 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
206 evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
207 normalizeEntropy :: Entropy e
208 => Getting e i e -> ModEntropy i o e
209 -> trie k i -> trie k o
211 instance IsTrie Trie where
213 entropyTrie _ (Leaf c) = Leaf c
214 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
216 children' = Map.toList children
217 sum_count = sum $ _node_count . snd <$> children'
218 e | sum_count == 0 = nan
219 | otherwise = sum $ f <$> children'
220 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
221 else - chc * P.logBase 2 chc
223 chc = fromIntegral (_node_count child) / fromIntegral c
225 nodeEntropy inE (Node _ e _) = e ^. inE
226 nodeEntropy _ (Leaf _) = nan
228 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
229 nodeChild _ (Leaf _) = emptyTrie
231 findTrie ks t = L.foldl (flip nodeChild) t ks
234 P.putStrLn . Tree.drawTree
236 $ toTree (NonTerminal "") t
237 P.putStrLn " Levels:"
238 forM_ (normalizationLevels inE t) $ \level ->
239 P.putStrLn $ " " <> show level
241 evTrie inE setEV = go nan
243 go _ (Leaf c) = Leaf c
244 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
250 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
252 go _ _ (Leaf c) = Leaf c
253 go _ [] _ = panic "normalizeEntropy' empty levels"
254 go f ((m, v, _) : ess) (Node c i children)
255 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
256 ------------------------------------------------------------------------
258 levels :: Trie k e -> [[Trie k e]]
259 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
261 subForest :: Trie k e -> [Trie k e]
262 subForest (Leaf _) = []
263 subForest (Node _ _ children) = Map.elems children
265 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
266 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
268 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
269 normalizationLevels inE = fmap f . entropyLevels inE
271 f es = (mean es, deviation es, length es)
273 ------------------------------------------------------------------------
275 data Tries k e = Tries
282 buildTries :: Int -> [[Token]] -> Tries Token ()
283 buildTries n sentences = Tries
284 { _fwd = buildTrie Forward n sentences
285 , _bwd = buildTrie Backward n sentences
288 instance IsTrie Tries where
290 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
292 findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
294 nodeChild = onTries . nodeChild
296 entropyTrie = onTries . entropyTrie
298 evTrie inE setEV = onTries $ evTrie inE setEV
300 normalizeEntropy inE = onTries . normalizeEntropy inE
302 printTrie inE (Tries f b) = do
303 P.putStrLn "Forward:"
306 P.putStrLn "Backward:"
309 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
310 onTries h (Tries f b) = Tries (h f) (h b)
312 ------------------------------------------------------------------------
313 mayCons :: [a] -> [[a]] -> [[a]]
315 mayCons xs xss = xs : xss
318 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
320 split inE t (Terminal Start:xs) = split inE t xs
321 split inE t (x0:xs0) = go [x0] xs0
324 go pref (Terminal Stop:_) = [pref]
325 go _ (Terminal Start:_) = panic "split impossible"
327 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
330 else mayCons pref $ go [x] xs
334 pxt = findTrie prefx t
337 -- ^ entropy of the current prefix
341 -- ^ entropy of the current prefix plus x
342 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
344 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
349 split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
351 split _ _ _ [t] = pure <$> nonTerminals [t]
352 split n inE t ts = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
354 pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
355 (L.tail . L.inits . take n $ ts)
359 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
361 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
364 ------------------------------------------------------------------------
366 mainEleve :: Int -> [[Text]] -> [[[Text]]]
367 mainEleve n x = mainEleve' n x x
369 mainEleve' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
370 mainEleve' n x y = mainEleveWith x' n y
372 x' = buildTries n (fmap toToken x)
373 -- (fmap toToken i) is computed twice, since mainEleveWith is computing it too
375 -- | This function should take the longest possible chain of:
376 -- mainEleve'' n x y = maxChainSizeOf [ mainEleve' n x y
377 -- , mainEleve' n x x
378 -- , mainEleve' n y y
380 mainEleve'' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
381 mainEleve'' = undefined
383 mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
384 mainEleveWith m n i = fmap (split n info_autonomy t) (fmap toToken i)
386 t :: Tries Token (I Double)
387 t = normalizeEntropy info_entropy_var set_autonomy
388 $ evTrie identity set_entropy_var
389 $ entropyTrie isTerminal m
391 ------------------------------------------------------------------------
393 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
395 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
396 testEleve debug n output checks = do
398 res = split (1 + n) info_autonomy nt <$> input
400 P.putStrLn . show $ (printToken <$>) <$> input
402 printTrie info_entropy nt
404 P.putStrLn "Splitting:"
405 P.putStrLn $ show res
407 pure $ expected == res
410 out = T.words <$> output
411 expected = fmap (T.splitOn "-") <$> out
412 input = toToken . (T.splitOn "-" =<<) <$> out
414 nt :: Tries Token (I Double)
415 nt = normalizeEntropy info_entropy_var set_autonomy
416 . evTrie identity set_entropy_var
417 . entropyTrie isTerminal
422 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
423 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
425 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
426 let ns = parseToken <$> T.words ngram
429 P.putStrLn $ " " <> T.unpack ngram <> ":"
430 check (==) "count" count (_node_count (_fwd nt'))
432 check sim "entropy" entropy (nodeEntropy info_entropy nt' )
433 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
434 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
436 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
437 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
438 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
440 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
441 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
442 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
444 -- | TODO real data is a list of tokenized sentences
445 example0, example1, example2, example3, example4, example5, example6, example7, example8, example9 :: [Text]
446 example0 = ["New-York is New-York and New-York"]
447 example1 = ["to-be or not to-be"]
448 example2 = ["to-be-or not to-be-or NOT to-be and"]
449 example3 = example0 <> example0
450 -- > TEST: Should not have York New in the trie
451 example4 = ["a-b-c-d e a-b-c-d f"]
452 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
453 example6 = ["le-petit chat"
458 example7 = ["a-b d", "a-c e", "a-c", "a-b", "a-b", "a-c", "a-c", "a-b"]
459 -- example8 = ["z f", "z", "z", "z"] <> example7
460 example8 = ["z", "z", "z", "z"] <> example7 <> example7 <> example7
461 example9 = (T.replace "z" "a") <$> example8
462 --example8 = ["a-b d", "a-c e", "a f", "a-c g", "a-b h", "a i", "a j", "a-b k", "a-c l", "a-c m", "a n", "a-b o"]
464 checks0, checks2, checks7, checks8, checks9 :: Checks Double
467 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
468 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
469 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
470 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
471 , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
472 , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
473 , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
474 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
475 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
476 , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
477 , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
478 , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
479 , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
480 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
481 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
482 , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
483 , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
484 , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
485 , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
486 , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
487 , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
488 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
493 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
494 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
495 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
496 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
497 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
498 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
499 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
504 [ ("a b", 4, 2, 1.5, 1.0106455960380136, 2, 1, 0.7302967433402215, 2, 2, 1.2909944487358056)
505 , ("a c", 4, 2, 1.5, 1.0106455960380136, 2, 1, 0.7302967433402215, 2, 2, 1.2909944487358056)
506 , ("a", 8, 2, -0.7139421727208477, 0.9315597394596105, 1, -1.7139421727208477, 0.1695158759052029, 3, 0.2860578272791523, 1.693603603014018)
510 [ ("a b", 4, 2, 1.5, 1.2384061243840367, 2, 1, 0.9190418024406298, 2, 2, 1.5577704463274435)
511 , ("a c", 4, 2, 1.5, 1.2384061243840367, 2, 1, 0.9190418024406298, 2, 2, 1.5577704463274435)
512 , ("a", 8, 2, -1.1151193576322829, 0.8012882295122719, 1, -2.115119357632283, 1.1025957503820932e-2, 3, -0.11511935763228287, 1.5915505015207227)
513 , ("z", 4, 2, -1.1151193576322829, 0.9576679529201777, 2, -1.1151193576322829, 1.0906240295212841, 2, -1.1151193576322829, 0.8247118763190712)
517 [ ("a b", 4, 2, 0.8741854163060885, 0.9234576822288185, 2, -0.25162916738782304, 0.2891449181301934, 2, 2, 1.5577704463274435)
518 , ("a c", 4, 2, 0.8741854163060885, 0.9234576822288185, 2, -0.25162916738782304, 0.2891449181301934, 2, 2, 1.5577704463274435)
519 , ("a", 12, 2.91829583405449, 3.763498724462999e-2, 1.518835832034022, 2.251629167387823, -0.6290316794220367, 1.2162041043595873, 3.5849625007211565, 0.7043016539112967, 1.8214675597084569)
522 runTestsEleve :: Bool -> IO ()
523 runTestsEleve doChecks =
525 [("example0", 3, example0, checks0)
526 ,("example0", 2, example0, [])
527 ,("example1", 2, example1, [])
528 ,("example2", 3, example2, checks2)
529 ,("example3", 2, example3, [])
530 ,("example4", 4, example4, [])
531 ,("example5", 5, example5, [])
532 ,("example6", 2, example6, [])
533 ,("example7", 2, example7, checks7)
534 ,("example8", 2, example8, checks8)
535 ,("example9", 2, example9, checks9)
537 (\(name, n, ex, checks) -> do
538 P.putStrLn $ name <> " " <> show n
539 b <- testEleve False n ex (if doChecks then checks else [])
540 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"