2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
48 import Control.Monad (foldM, mapM_, forM_)
50 import qualified Data.List as L
52 import Data.Text (Text)
53 import qualified Data.Text as T
55 import Data.Maybe (fromMaybe, catMaybes)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
67 -- ^ TODO: only used for debugging
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
81 type ModEntropy i o e = (e -> e) -> i -> o
83 setNormEntropy :: ModEntropy e (I e) e
84 setNormEntropy f e = I e (f e)
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
89 data Token = NonTerminal Text
91 deriving (Ord, Eq, Show)
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
97 toToken :: Int -> [Text] -> [Token]
98 toToken n xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
100 unToken :: [Token] -> [Text]
103 f (NonTerminal x) = x
106 ------------------------------------------------------------------------
109 = Node { _node_count :: Int
111 , _node_children :: Map k (Trie k e)
113 | Leaf { _node_count :: Int }
118 insertTries :: Ord k => [[k]] -> Trie k ()
119 insertTries = L.foldr insertTrie emptyTrie
121 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
122 insertTrie [] n = n { _node_count = _node_count n +1}
123 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
124 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
126 f = Just . insertTrie xs . fromMaybe emptyTrie
128 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
129 -- emptyTrie = Node 0 mempty mempty
130 emptyTrie :: Trie k e
133 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
135 | Map.null children = Leaf c
136 | otherwise = Node c mempty children
138 -----------------------------
140 -- | Trie to Tree since Tree as nice print function
141 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
142 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
143 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
145 ------------------------------------------------------------------------
146 ------------------------------------------------------------------------
148 nan :: Floating e => e
151 updateIfDefined :: P.RealFloat e => e -> e -> e
152 updateIfDefined e0 e | P.isNaN e = e0
155 entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
156 entropyTrie _ (Leaf c) = Leaf c
157 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
159 e = sum $ map f $ Map.toList children
160 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
161 else - chc * P.logBase 2 chc
163 chc = fromIntegral (_node_count child) / fromIntegral c
164 ------------------------------------------------------------------------
166 normalizeLevel :: Entropy e => [e] -> e -> e
167 normalizeLevel = checkDiff (go . filter (not . P.isNaN))
170 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
172 go [] = panic "normalizeLevel: impossible"
173 -- trace "normalizeLevel"
175 go es = \e -> (e - m) / v
178 then trace ("normalizeLevel " <> show (e,m,v,es))
188 nodeChildren :: Trie k e -> Map k (Trie k e)
189 nodeChildren (Node _ _ cs) = cs
190 nodeChildren (Leaf _) = Map.empty
194 class IsTrie trie where
195 buildTrie :: Floating e => [[Token]] -> trie Token e
196 nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
197 nodeChild :: Ord k => k -> trie k e -> trie k e
198 findTrie :: Ord k => [k] -> trie k e -> trie k e
199 normalizeEntropy :: Entropy e
200 => Getting e i e -> ModEntropy i o e
201 -> trie k i -> trie k o
203 nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
204 nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
206 instance IsTrie Trie where
207 buildTrie = entropyTrie isTerminal . insertTries
209 nodeEntropy inE (Node _ e _) = e ^. inE
210 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
213 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
214 nodeChild _ (Leaf _) = emptyTrie
216 findTrie ks t = L.foldl (flip nodeChild) t ks
218 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
220 go _ [] _ = panic "normalizeEntropy' empty levels"
221 go _ _ (Leaf c) = Leaf c
222 go _ ([] : _) _ = panic "normalizeEntropy': empty level"
223 go f (es : ess) (Node c i children) =
224 Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
228 This is only normalizing a node with respect to its brothers (unlike all the
229 nodes of the same level).
231 normalizeEntropy inE modE = go $ modE identity
233 go _ (Leaf c) = Leaf c
234 go f (Node c i children)
235 | Map.null children =
236 panic "normalizeEntropy: impossible"
238 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
240 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
242 ------------------------------------------------------------------------
244 levels :: Trie k e -> [[Trie k e]]
245 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
247 subForest :: Trie k e -> [Trie k e]
248 subForest (Leaf _) = []
249 subForest (Node _ _ children) = Map.elems children
251 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
252 entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
254 ------------------------------------------------------------------------
256 data Tries k e = Tries
261 instance IsTrie Tries where
262 buildTrie tts = Tries { _fwd = buildTrie tts
263 , _bwd = buildTrie (reverse <$> tts)
266 nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]
268 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)
270 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
272 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
274 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
275 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
277 ------------------------------------------------------------------------
278 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
280 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
281 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
284 consRev xs xss = reverse xs : xss
286 go _ pref [] = [reverse pref]
287 go _ pref (Terminal Stop:_) = [reverse pref]
288 go t pref (Terminal Start:xs) = go t pref xs
290 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
292 then go xt (x:pref) xs
293 else consRev pref $ go xt0 [x] xs
298 -- ^ entropy of the current prefix
302 -- ^ entropy of the current prefix plus x
303 acc = ext > et + ext0
304 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
306 ne d t = if P.isNaN e then d else e
307 where e = nodeEntropy inE t
310 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
312 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
315 ------------------------------------------------------------------------
316 ------------------------------------------------------------------------
318 mainEleve :: Int -> [[Text]] -> [[[Text]]]
321 mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
323 inp = toToken (n - 1) <$> input
324 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
327 sim :: Entropy e => e -> e -> Bool
328 sim x y = x == y || (P.isNaN x && P.isNaN y)
330 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
331 testEleve debug n output checks = do
334 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
337 , cs <- chunkAlong m 1 <$> inp
342 --res = map unToken . split identity fwd <$> inp
343 --res = map unToken . split info_norm_entropy' nt' <$> inp
344 res = map unToken . split info_autonomy nt <$> inp
346 P.putStrLn (show input)
347 -- mapM_ (P.putStrLn . show) pss
352 P.putStrLn $ show res
354 pure $ expected == res
357 out = T.words <$> output
358 expected = fmap (T.splitOn "-") <$> out
359 input = (T.splitOn "-" =<<) <$> out
360 inp = toToken (n - 1) <$> input
361 t = buildTrie $ L.concat $ chunkAlong (n + 1) 1 <$> inp
362 -- nt = normalizeEntropy identity setNormEntropy (fwd :: Trie Token Double)
363 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
364 nt = normalizeEntropy identity setNormEntropy t
368 then P.putStrLn $ " PASS " <> msg <> " " <> show x <> " ~= " <> show y
369 else P.putStrLn $ " FAIL " <> msg <> " " <> show x <> " /= " <> show y
371 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
372 let ns = NonTerminal <$> T.words ngram
374 P.putStrLn $ " " <> T.unpack ngram <> ":"
375 check (==) "count" count (_node_count (_fwd t'))
376 check sim "entropy" entropy (nodeEntropy info_entropy t')
377 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
378 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
379 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (findTrie ns (_bwd nt)))
382 P.putStrLn . Tree.drawTree
384 . toTree (NonTerminal "")
386 -- | TODO real data is a list of tokenized sentences
387 example0, example1, example2, example3, example4, example5, example6 :: [Text]
388 example0 = ["New-York is New-York and New-York"]
389 example1 = ["to-be or not to-be"]
390 example2 = ["to-be-or not to-be-or NOT to-be and"]
391 example3 = example0 <> example0
392 -- > TEST: Should not have York New in the trie
393 example4 = ["a-b-c-d e a-b-c-d f"]
394 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
395 example6 = ["le-petit chat"
401 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
403 checks0 = [("\ue02b New", 1, nan, nan, nan, nan, 0.0), ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156), ("York is", 1, 0.0, nan, nan, nan, 0.0), ("is New", 1, 0.0, nan, nan, nan, 0.0), ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156), ("York and", 1, 0.0, nan, nan, nan, 0.0), ("and New", 1, 0.0, nan, nan, nan, 0.0), ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156), ("York \ue02d", 1, nan, nan, nan, nan, nan)]
406 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
407 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
408 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
409 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
410 ,("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
411 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
412 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
413 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
414 ,("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
415 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
422 [("example0", 2, example0, checks0)
423 ,("example1", 2, example1, [])
424 ,("example2", 3, example2, checks2)
425 ,("example3", 2, example3, [])
426 ,("example4", 4, example4, [])
427 ,("example5", 5, example5, [])
429 (\(name, n, ex, checks) -> do
430 P.putStrLn $ name <> " " <> show n
431 b <- testEleve False n ex checks
432 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"