{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervized Word segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

# Implementation of Unsupervized Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese. Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics,
  pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for the current implementation:
- TODO extract longer ngrams (see paper above, a Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  from Gargantext.Text.Terms import extractTermsUnsupervised
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ Gargantext.map _hyperdataDocument_abstract docs
-}

{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeFamilies      #-}

module Gargantext.Text.Eleve where

-- import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens hiding (levels, children)
import Control.Monad (forM_)
import qualified Data.List as L
import Data.Text (Text)
import qualified Data.Text as T
import Data.Maybe (fromMaybe)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)

nan :: Floating e => e
nan = 0 / 0

noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs = filter (not . P.isNaN)

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e

sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)

subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x | sim src x = dst
                   | otherwise = x

------------------------------------------------------------------------
type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy     :: e
  , _info_entropy_var :: e
  , _info_autonomy    :: e
  }

makeLenses ''I

instance Show e => Show (I e) where
  show (I e ev a) = show (e, ev, a)

type ModEntropy i o e = (e -> e) -> i -> o

set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)

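-- Illustrative example (not from the original module): 'set_autonomy' rewrites
-- only the autonomy field, computing it from the entropy variation.
--
-- >>> set_autonomy (*2) (I 1 2 nan) :: I Double
-- (1.0,2.0,4.0)
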
set_entropy_var :: Entropy e => Setter e (I e) e e
set_entropy_var f e = (\ev -> I e ev nan) <$> f e

data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

nonTerminals :: [Token] -> [Text]
nonTerminals ts = [nt | NonTerminal nt <- ts]

parseToken :: Text -> Token
parseToken "<start>" = Terminal Start
parseToken "<stop>"  = Terminal Stop
parseToken t         = NonTerminal t

toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]

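-- Illustrative example (not from the original module): sentences are wrapped
-- with the two terminals before being inserted into the tries.
--
-- >>> toToken ["new", "york", "is", "new", "york"]
-- [Terminal Start,NonTerminal "new",NonTerminal "york",NonTerminal "is",NonTerminal "new",NonTerminal "york",Terminal Stop]
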
printToken :: Token -> Text
printToken = f
  where
    f (NonTerminal x)  = x
    f (Terminal Start) = "<start>"
    f (Terminal Stop)  = "<stop>"
------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count :: Int }
  deriving (Show)

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie

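-- Illustrative example (not from the original module): the root count tracks
-- how many sequences were inserted, while each child counts its own suffixes.
--
-- >>> _node_count (L.foldr insertTrie emptyTrie [[NonTerminal "a"], [NonTerminal "a"], [NonTerminal "b"]])
-- 3
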
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

-----------------------------
-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------
normalizeLevel :: Entropy e => e -> e -> e -> e
normalizeLevel m v e = (e - m) / v

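-- Illustrative example (not from the original module): entropies are z-scored
-- against the mean and deviation of their trie level, so a value one deviation
-- above the level mean normalizes to 1.
--
-- >>> normalizeLevel 1.0 0.5 1.5 :: Double
-- 1.0
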
nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = L.take n <$> L.tails xs

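-- Illustrative example (not from the original module): unlike a strict sliding
-- window, the trailing shorter suffixes (and the empty one) are kept.
--
-- >>> chunkAlongEleve 3 [1, 2, 3, 4, 5 :: Int]
-- [[1,2,3],[2,3,4],[3,4,5],[4,5],[5],[]]
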
data Direction = Backward | Forward

buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
buildTrie d n sentences
  = L.foldr insertTrie emptyTrie
  . L.concat
  $ ( filter (/= [Terminal (term d)])
    . chunkAlongEleve (n + 1)
    . order d
    ) <$> sentences
  where
    order Forward  = identity
    order Backward = reverse

    term Forward  = Stop
    term Backward = Start

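-- Illustrative example (not from the original module): for n = 2, each sentence
-- feeds its (n + 1)-sized forward chunks into the trie; the chunk made of the
-- closing terminal alone is filtered out, and the leftover empty chunk only
-- bumps the root count.
--
-- >>> filter (/= [Terminal Stop]) . chunkAlongEleve 3 $ toToken ["a", "b"]
-- [[Terminal Start,NonTerminal "a",NonTerminal "b"],[NonTerminal "a",NonTerminal "b",Terminal Stop],[NonTerminal "b",Terminal Stop],[]]
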
class IsTrie trie where
  entropyTrie      :: Entropy e => (k -> Bool) -> trie k () -> trie k e
  nodeEntropy      :: Entropy e => Getting e i e -> trie k i -> e
  nodeChild        :: Ord k => k -> trie k e -> trie k e
  findTrie         :: Ord k => [k] -> trie k e -> trie k e
  printTrie        :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
  evTrie           :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

instance IsTrie Trie where

  entropyTrie _    (Leaf c)             = Leaf c
  entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
    where
      children' = Map.toList children
      sum_count = sum $ _node_count . snd <$> children'
      e | sum_count == 0 = nan
        | otherwise      = sum $ f <$> children'
      f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                               else - chc * P.logBase 2 chc
        where
          chc = fromIntegral (_node_count child) / fromIntegral c

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  printTrie inE t = do
    P.putStrLn . Tree.drawTree
               . fmap show
               $ toTree (NonTerminal "") t
    P.putStrLn " Levels:"
    forM_ (normalizationLevels inE t) $ \level ->
      P.putStrLn $ " " <> show level

  evTrie inE setEV = go nan
    where
      go _  (Leaf c)            = Leaf c
      go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
        where
          e1 = i ^. inE
      ev 0  0  = nan
      ev i0 i1 = i1 - i0

  normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
    where
      go _ _                 (Leaf c) = Leaf c
      go _ []                _        = panic "normalizeEntropy' empty levels"
      go f ((m, v, _) : ess) (Node c i children)
        = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children

------------------------------------------------------------------------

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure

subForest :: Trie k e -> [Trie k e]
subForest (Leaf _)            = []
subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels

normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
normalizationLevels inE = fmap f . entropyLevels inE
  where
    f es = (mean es, deviation es, length es)

------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

buildTries :: Int -> [[Token]] -> Tries Token ()
buildTries n sentences = Tries
  { _fwd = buildTrie Forward  n sentences
  , _bwd = buildTrie Backward n sentences
  }

instance IsTrie Tries where

  nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]

  findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)

  nodeChild = onTries . nodeChild

  entropyTrie = onTries . entropyTrie

  evTrie inE setEV = onTries $ evTrie inE setEV

  normalizeEntropy inE = onTries . normalizeEntropy inE

  printTrie inE (Tries f b) = do
    P.putStrLn "Forward:"
    printTrie inE f
    P.putStrLn "Backward:"
    printTrie inE b

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries h (Tries f b) = Tries (h f) (h b)

------------------------------------------------------------------------
mayCons :: [a] -> [[a]] -> [[a]]
mayCons [] xss = xss
mayCons xs xss = xs : xss

-- An earlier, entropy-only variant of 'split', kept for reference:
{-
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _ []                  = []
split inE t (Terminal Start:xs) = split inE t xs
split inE t (x0:xs0)            = go [x0] xs0
  where
    go pref (Terminal Stop:_)  = [pref]
    go _    (Terminal Start:_) = panic "split impossible"
    go pref (x:xs) =
      -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
      if acc then go prefx xs
             else mayCons pref $ go [x] xs
      where
        prefx = pref <> [x]
        pxt   = findTrie prefx t
        ept   = nodeEntropy inE (findTrie pref t)
        -- ^ entropy of the current prefix
        ext   = nodeEntropy inE (findTrie [x] t)
        epxt  = nodeEntropy inE pxt
        -- ^ entropy of the current prefix plus x
        acc   = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
-}
-- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
split _ _   _ []  = []
split _ _   _ [t] = pure <$> nonTerminals [t]
split n inE t ts  = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
  where
    pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
                       (L.tail . L.inits . take n $ ts)

-- A sketch of an exhaustive variant (pseudocode):
{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = split n info_autonomy (t :: Tries Token (I Double)) <$> inp
  where
    inp = toToken <$> input
    t   = normalizeEntropy info_entropy_var set_autonomy
        . evTrie identity set_entropy_var
        . entropyTrie isTerminal
        $ buildTries n inp

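-- Illustrative usage sketch ('exampleEleve' is a hypothetical helper, not part
-- of the original module): 'mainEleve' expects pre-tokenized sentences and
-- returns each sentence segmented into multi-word terms; the sentences below
-- are made up.
exampleEleve :: [[[Text]]]
exampleEleve = mainEleve 3 (T.words <$> ["the new york times is a newspaper", "new york is a city"])
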
---------------------------------------------
type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]

testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
testEleve debug n output checks = do
  let
    res = split n info_autonomy nt <$> inp
  when debug $ do
    P.putStrLn $ show input
    printTrie info_entropy nt
    P.putStrLn "Splitting:"
    P.putStrLn $ show res
  forM_ checks checker
  pure $ expected == res

  where
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken <$> input

    nt :: Tries Token (I Double)
    nt = normalizeEntropy info_entropy_var set_autonomy
       . evTrie identity set_entropy_var
       . entropyTrie isTerminal
       $ buildTries n inp

    check f msg ref my =
      if f ref my
        then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
        else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my

    checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
      let ns  = parseToken <$> T.words ngram
          nt' = findTrie ns nt

      P.putStrLn $ " " <> T.unpack ngram <> ":"
      check (==) "count"        count        (_node_count (_fwd nt'))

      check sim "entropy"       entropy      (nodeEntropy info_entropy     nt')
      check sim "ev"            ev           (nodeEntropy info_entropy_var nt')
      check sim "autonomy"      autonomy     (nodeEntropy info_autonomy    nt')

      check sim "fwd_entropy"   fwd_entropy  (nodeEntropy info_entropy     (_fwd nt'))
      check sim "fwd_ev"        fwd_ev       (nodeEntropy info_entropy_var (_fwd nt'))
      check sim "fwd_autonomy"  fwd_autonomy (nodeEntropy info_autonomy    (_fwd nt'))

      check sim "bwd_entropy"   bwd_entropy  (nodeEntropy info_entropy     (_bwd nt'))
      check sim "bwd_ev"        bwd_ev       (nodeEntropy info_entropy_var (_bwd nt'))
      check sim "bwd_autonomy"  bwd_autonomy (nodeEntropy info_autonomy    (_bwd nt'))

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- ^ TEST: Should not have York New in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"]

checks0, checks2 :: Checks Double

checks0 =
-- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
  [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
  , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
  , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
  , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
  , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
  , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
  , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
  , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
  , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
  , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
  , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
  , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
  , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
  , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
  , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
  , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
  , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
  ]

-- The tuples below do not match the current 11-field 'Checks' format, so they
-- are kept commented out.
checks2 = []
{-
  [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
  ,("be or", 2, 0.5, nan, nan, nan, 1.0)
  ,("or not", 1, 0.0, nan, nan, nan, 0.0)
  ,("not to", 1, 0.0, nan, nan, nan, 0.0)
  ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
  ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
  ,("be and", 1, 0.0, nan, nan, nan, 0.0)
  ]
-}

-- | Runs 'testEleve' over the examples above (runner name is assumed here).
runTestsEleve :: IO ()
runTestsEleve =
  forM_
    [("example0", 3, example0, checks0)
    ,("example0", 2, example0, [])
    ,("example1", 2, example1, [])
    ,("example2", 3, example2, checks2)
    ,("example3", 2, example3, [])
    ,("example4", 4, example4, [])
    ,("example5", 5, example5, [])
    ,("example6", 2, example6, [])
    ]
    (\(name, n, ex, checks) -> do
      P.putStrLn $ name <> " " <> show n
      b <- testEleve False n ex checks
      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
    )
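
-- Illustrative note (not from the original file): a single example can also be
-- checked from GHCi, e.g. `testEleve False 3 example0 checks0` prints a
-- PASS/FAIL line per checked field of checks0 and returns whether the computed
-- splitting matches the expected segmentation.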