2 Module : Gargantext.Text.Terms.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26 - AD: Real ngrams extraction test
27 from Gargantext.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ Gargantext.map _hyperdataDocument_abstract docs
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE NoImplicitPrelude #-}
36 {-# LANGUAGE OverloadedStrings #-}
37 {-# LANGUAGE RankNTypes #-}
38 {-# LANGUAGE TemplateHaskell #-}
39 {-# LANGUAGE TypeFamilies #-}
41 module Gargantext.Text.Terms.Eleve where
43 -- import Debug.Trace (trace)
44 -- import Debug.SimpleReflect
46 import Control.Lens hiding (levels, children)
47 import Control.Monad (forM_)
49 import qualified Data.List as L
51 import Data.Text (Text)
52 import qualified Data.Text as T
54 import Data.Maybe (fromMaybe)
55 import qualified Data.Map as Map
56 import Gargantext.Prelude hiding (cs)
57 import qualified Data.Tree as Tree
58 import Data.Tree (Tree)
59 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
61 nan :: Floating e => e
64 noNaNs :: P.RealFloat e => [e] -> [e]
65 noNaNs = filter (not . P.isNaN)
67 updateIfDefined :: P.RealFloat e => e -> e -> e
68 updateIfDefined e0 e | P.isNaN e = e0
71 sim :: Entropy e => e -> e -> Bool
72 sim x y = x == y || (P.isNaN x && P.isNaN y)
74 subst :: Entropy e => (e, e) -> e -> e
75 subst (src, dst) x | sim src x = dst
77 ------------------------------------------------------------------------
84 -- ^ TODO: only used for debugging
86 ------------------------------------------------------------------------
87 -- | Example and tests for development
90 , _info_entropy_var :: e
94 instance Show e => Show (I e) where
95 show (I e ev a) = show (e, ev, a)
99 type ModEntropy i o e = (e -> e) -> i -> o
101 set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
102 set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)
104 set_entropy_var :: Entropy e => Setter e (I e) e e
105 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
107 data StartStop = Start | Stop
108 deriving (Ord, Eq, Show)
110 data Token = NonTerminal Text
112 deriving (Ord, Eq, Show)
114 isTerminal :: Token -> Bool
115 isTerminal (Terminal _) = True
116 isTerminal (NonTerminal _) = False
118 nonTerminals :: [Token] -> [Text]
119 nonTerminals ts = [nt | NonTerminal nt <- ts]
121 parseToken :: Text -> Token
122 parseToken "<start>" = Terminal Start
123 parseToken "<stop>" = Terminal Stop
124 parseToken t = NonTerminal t
126 toToken :: [Text] -> [Token]
127 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
129 printToken :: Token -> Text
132 f (NonTerminal x) = x
133 f (Terminal Start) = "<start>"
134 f (Terminal Stop) = "<stop>"
135 ------------------------------------------------------------------------
138 = Node { _node_count :: Int
140 , _node_children :: Map k (Trie k e)
142 | Leaf { _node_count :: Int }
147 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
148 insertTrie [] n = n { _node_count = _node_count n +1}
149 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
150 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
152 f = Just . insertTrie xs . fromMaybe emptyTrie
154 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
155 -- emptyTrie = Node 0 mempty mempty
156 emptyTrie :: Trie k e
159 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
161 | Map.null children = Leaf c
162 | otherwise = Node c mempty children
164 -----------------------------
165 -- | Trie to Tree since Tree as nice print function
166 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
167 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
168 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
170 ------------------------------------------------------------------------
171 ------------------------------------------------------------------------
172 normalizeLevel :: Entropy e => e -> e -> e -> e
173 normalizeLevel m v e = (e - m) / v
177 nodeChildren :: Trie k e -> Map k (Trie k e)
178 nodeChildren (Node _ _ cs) = cs
179 nodeChildren (Leaf _) = Map.empty
183 chunkAlongEleve :: Int -> [a] -> [[a]]
184 chunkAlongEleve n xs = L.take n <$> L.tails xs
186 data Direction = Backward | Forward
188 buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
189 buildTrie d n sentences
190 = L.foldr insertTrie emptyTrie
192 $ ( filter (/= [Terminal (term d)])
193 . chunkAlongEleve (n + 1)
198 order Forward = identity
199 order Backward = reverse
201 term Backward = Start
203 class IsTrie trie where
204 entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e
205 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
206 nodeChild :: Ord k => k -> trie k e -> trie k e
207 findTrie :: Ord k => [k] -> trie k e -> trie k e
208 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
209 evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
210 normalizeEntropy :: Entropy e
211 => Getting e i e -> ModEntropy i o e
212 -> trie k i -> trie k o
214 instance IsTrie Trie where
216 entropyTrie _ (Leaf c) = Leaf c
217 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
219 children' = Map.toList children
220 sum_count = sum $ _node_count . snd <$> children'
221 e | sum_count == 0 = nan
222 | otherwise = sum $ f <$> children'
223 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
224 else - chc * P.logBase 2 chc
226 chc = fromIntegral (_node_count child) / fromIntegral c
228 nodeEntropy inE (Node _ e _) = e ^. inE
229 nodeEntropy _ (Leaf _) = nan
231 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
232 nodeChild _ (Leaf _) = emptyTrie
234 findTrie ks t = L.foldl (flip nodeChild) t ks
237 P.putStrLn . Tree.drawTree
239 $ toTree (NonTerminal "") t
240 P.putStrLn " Levels:"
241 forM_ (normalizationLevels inE t) $ \level ->
242 P.putStrLn $ " " <> show level
244 evTrie inE setEV = go nan
246 go _ (Leaf c) = Leaf c
247 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
253 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
255 go _ _ (Leaf c) = Leaf c
256 go _ [] _ = panic "normalizeEntropy' empty levels"
257 go f ((m, v, _) : ess) (Node c i children)
258 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
259 ------------------------------------------------------------------------
261 levels :: Trie k e -> [[Trie k e]]
262 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
264 subForest :: Trie k e -> [Trie k e]
265 subForest (Leaf _) = []
266 subForest (Node _ _ children) = Map.elems children
268 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
269 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
271 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
272 normalizationLevels inE = fmap f . entropyLevels inE
274 f es = (mean es, deviation es, length es)
276 ------------------------------------------------------------------------
278 data Tries k e = Tries
285 buildTries :: Int -> [[Token]] -> Tries Token ()
286 buildTries n sentences = Tries
287 { _fwd = buildTrie Forward n sentences
288 , _bwd = buildTrie Backward n sentences
291 instance IsTrie Tries where
293 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
295 findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
297 nodeChild = onTries . nodeChild
299 entropyTrie = onTries . entropyTrie
301 evTrie inE setEV = onTries $ evTrie inE setEV
303 normalizeEntropy inE = onTries . normalizeEntropy inE
305 printTrie inE (Tries f b) = do
306 P.putStrLn "Forward:"
309 P.putStrLn "Backward:"
312 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
313 onTries h (Tries f b) = Tries (h f) (h b)
315 ------------------------------------------------------------------------
316 mayCons :: [a] -> [[a]] -> [[a]]
318 mayCons xs xss = xs : xss
321 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
323 split inE t (Terminal Start:xs) = split inE t xs
324 split inE t (x0:xs0) = go [x0] xs0
327 go pref (Terminal Stop:_) = [pref]
328 go _ (Terminal Start:_) = panic "split impossible"
330 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
333 else mayCons pref $ go [x] xs
337 pxt = findTrie prefx t
340 -- ^ entropy of the current prefix
344 -- ^ entropy of the current prefix plus x
345 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
347 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
352 split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
354 split _ _ _ [t] = pure <$> nonTerminals [t]
355 split n inE t ts = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
357 pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
358 (L.tail . L.inits . take n $ ts)
362 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
364 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
367 ------------------------------------------------------------------------
369 mainEleve :: Int -> [[Text]] -> [[[Text]]]
370 mainEleve n x = mainEleve' n x x
372 mainEleve' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
373 mainEleve' n x y = mainEleveWith x' n y
375 x' = buildTries n (fmap toToken x)
377 -- | This function should take the longest possible chain of:
378 -- mainEleve'' n x y = maxChainWith [mainEleve' n x y, mainEleve' n x x]
379 mainEleve'' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
380 mainEleve'' = undefined
382 mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
383 mainEleveWith m n i = fmap (split n info_autonomy t) (fmap toToken i)
385 t :: Tries Token (I Double)
386 t = normalizeEntropy info_entropy_var set_autonomy
387 $ evTrie identity set_entropy_var
388 $ entropyTrie isTerminal m
390 ------------------------------------------------------------------------
392 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
394 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
395 testEleve debug n output checks = do
397 res = split n info_autonomy nt <$> inp
399 P.putStrLn $ show input
401 printTrie info_entropy nt
403 P.putStrLn "Splitting:"
404 P.putStrLn $ show res
406 pure $ expected == res
409 out = T.words <$> output
410 expected = fmap (T.splitOn "-") <$> out
411 input = (T.splitOn "-" =<<) <$> out
412 inp = toToken <$> input
414 nt :: Tries Token (I Double)
415 nt = normalizeEntropy info_entropy_var set_autonomy
416 . evTrie identity set_entropy_var
417 . entropyTrie isTerminal
422 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
423 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
425 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
426 let ns = parseToken <$> T.words ngram
429 P.putStrLn $ " " <> T.unpack ngram <> ":"
430 check (==) "count" count (_node_count (_fwd nt'))
432 check sim "entropy" entropy (nodeEntropy info_entropy nt' )
433 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
434 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
436 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
437 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
438 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
440 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
441 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
442 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
444 -- | TODO real data is a list of tokenized sentences
445 example0, example1, example2, example3, example4, example5, example6 :: [Text]
446 example0 = ["New-York is New-York and New-York"]
447 example1 = ["to-be or not to-be"]
448 example2 = ["to-be-or not to-be-or NOT to-be and"]
449 example3 = example0 <> example0
450 -- > TEST: Should not have York New in the trie
451 example4 = ["a-b-c-d e a-b-c-d f"]
452 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
453 example6 = ["le-petit chat"
459 checks0, checks2 :: Checks Double
462 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
463 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
464 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
465 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
466 , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
467 , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
468 , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
469 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
470 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
471 , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
472 , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
473 , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
474 , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
475 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
476 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
477 , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
478 , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
479 , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
480 , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
481 , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
482 , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
483 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
488 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
489 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
490 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
491 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
492 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
493 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
494 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
498 runTestsEleve :: IO ()
501 [("example0", 3, example0, checks0)
502 ,("example0", 2, example0, [])
503 ,("example1", 2, example1, [])
504 ,("example2", 3, example2, checks2)
505 ,("example3", 2, example3, [])
506 ,("example4", 4, example4, [])
507 ,("example5", 5, example5, [])
508 ,("example6", 2, example6, [])
510 (\(name, n, ex, checks) -> do
511 P.putStrLn $ name <> " " <> show n
512 b <- testEleve False n ex checks
513 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"