2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
32 $ Gargantext.map _hyperdataDocument_abstract docs
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
42 module Gargantext.Text.Eleve where
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
48 import Control.Monad (foldM, mapM_, forM_)
50 import qualified Data.List as L
52 import Data.Text (Text)
53 import qualified Data.Text as T
55 import Data.Maybe (fromMaybe, catMaybes)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
67 -- ^ TODO: only used for debugging
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
81 type ModEntropy i o e = (e -> e) -> i -> o
83 set_autonomy :: ModEntropy e (I e) e
84 set_autonomy f e = I e (f e)
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
89 data Token = NonTerminal Text
91 deriving (Ord, Eq, Show)
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
97 parseToken :: Text -> Token
98 parseToken "<start>" = Terminal Start
99 parseToken "<stop>" = Terminal Stop
100 parseToken t = NonTerminal t
102 toToken :: [Text] -> [Token]
103 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
105 printToken :: Token -> Text
108 f (NonTerminal x) = x
109 f (Terminal Start) = "<start>"
110 f (Terminal Stop) = "<stop>"
112 ------------------------------------------------------------------------
115 = Node { _node_count :: Int
117 , _node_children :: Map k (Trie k e)
119 | Leaf { _node_count :: Int }
124 insertTries :: Ord k => [[k]] -> Trie k ()
125 insertTries = L.foldr insertTrie emptyTrie
127 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
128 insertTrie [] n = n { _node_count = _node_count n +1}
129 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
130 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
132 f = Just . insertTrie xs . fromMaybe emptyTrie
134 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
135 -- emptyTrie = Node 0 mempty mempty
136 emptyTrie :: Trie k e
139 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
141 | Map.null children = Leaf c
142 | otherwise = Node c mempty children
144 -----------------------------
146 -- | Trie to Tree since Tree as nice print function
147 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
148 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
149 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
151 ------------------------------------------------------------------------
152 ------------------------------------------------------------------------
154 nan :: Floating e => e
157 updateIfDefined :: P.RealFloat e => e -> e -> e
158 updateIfDefined e0 e | P.isNaN e = e0
161 entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
162 entropyTrie _ (Leaf c) = Leaf c
163 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
165 e = sum $ map f $ Map.toList children
166 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
167 else - chc * P.logBase 2 chc
169 chc = fromIntegral (_node_count child) / fromIntegral c
170 ------------------------------------------------------------------------
172 normalizeLevel :: Entropy e => [e] -> e -> e
173 normalizeLevel = checkDiff (go . filter (not . P.isNaN))
176 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
178 go [] = panic "normalizeLevel: impossible"
179 -- trace "normalizeLevel"
181 go es = \e -> (e - m) / v
184 then trace ("normalizeLevel " <> show (e,m,v,es))
194 nodeChildren :: Trie k e -> Map k (Trie k e)
195 nodeChildren (Node _ _ cs) = cs
196 nodeChildren (Leaf _) = Map.empty
200 class IsTrie trie where
201 buildTrie :: Floating e => [[Token]] -> trie Token e
202 nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
203 nodeChild :: Ord k => k -> trie k e -> trie k e
204 findTrie :: Ord k => [k] -> trie k e -> trie k e
205 normalizeEntropy :: Entropy e
206 => Getting e i e -> ModEntropy i o e
207 -> trie k i -> trie k o
209 nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
210 nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
212 instance IsTrie Trie where
213 buildTrie = entropyTrie isTerminal . insertTries
215 nodeEntropy inE (Node _ e _) = e ^. inE
216 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
219 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
220 nodeChild _ (Leaf _) = emptyTrie
222 findTrie ks t = L.foldl (flip nodeChild) t ks
224 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
226 go _ [] _ = panic "normalizeEntropy' empty levels"
227 go _ _ (Leaf c) = Leaf c
228 go _ ([] : _) _ = panic "normalizeEntropy': empty level"
229 go f (es : ess) (Node c i children) =
230 Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
234 This is only normalizing a node with respect to its brothers (unlike all the
235 nodes of the same level).
237 normalizeEntropy inE modE = go $ modE identity
239 go _ (Leaf c) = Leaf c
240 go f (Node c i children)
241 | Map.null children =
242 panic "normalizeEntropy: impossible"
244 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
246 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
248 ------------------------------------------------------------------------
250 levels :: Trie k e -> [[Trie k e]]
251 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
253 subForest :: Trie k e -> [Trie k e]
254 subForest (Leaf _) = []
255 subForest (Node _ _ children) = Map.elems children
257 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
258 entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
260 ------------------------------------------------------------------------
262 data Tries k e = Tries
267 instance IsTrie Tries where
268 buildTrie tts = Tries { _fwd = buildTrie tts
269 , _bwd = buildTrie (reverse <$> tts)
272 nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]
274 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)
276 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
278 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
280 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
281 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
283 ------------------------------------------------------------------------
284 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
286 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
287 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
290 consRev xs xss = reverse xs : xss
292 go _ pref [] = [reverse pref]
293 go _ pref (Terminal Stop:_) = [reverse pref]
294 go t pref (Terminal Start:xs) = go t pref xs
296 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
298 then go xt (x:pref) xs
299 else consRev pref $ go xt0 [x] xs
304 -- ^ entropy of the current prefix
308 -- ^ entropy of the current prefix plus x
309 acc = ext > et + ext0
310 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
312 ne d t = if P.isNaN e then d else e
313 where e = nodeEntropy inE t
316 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
318 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
321 ------------------------------------------------------------------------
322 ------------------------------------------------------------------------
324 mainEleve :: Int -> [[Text]] -> [[[Text]]]
327 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
329 inp = toToken <$> input
330 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
333 sim :: Entropy e => e -> e -> Bool
334 sim x y = x == y || (P.isNaN x && P.isNaN y)
336 chunkAlongEleve :: Int -> [a] -> [[a]]
337 chunkAlongEleve n xs = L.take n <$> L.tails xs
339 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
340 testEleve debug n output checks = do
343 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
346 , cs <- chunkAlong m 1 <$> inp
351 --res = map (map printToken) . split identity fwd <$> inp
352 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
353 res = map (map printToken) . split info_autonomy nt <$> inp
355 P.putStrLn (show input)
356 -- mapM_ (P.putStrLn . show) pss
361 P.putStrLn $ show res
363 pure $ expected == res
366 out = T.words <$> output
367 expected = fmap (T.splitOn "-") <$> out
368 input = (T.splitOn "-" =<<) <$> out
369 inp = toToken <$> input
370 t = buildTrie $ L.concat $ chunkAlongEleve (n + 2) <$> inp
371 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
372 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
373 nt = normalizeEntropy identity set_autonomy t
377 then P.putStrLn $ " PASS " <> msg <> " " <> show x <> " ~= " <> show y
378 else P.putStrLn $ " FAIL " <> msg <> " " <> show x <> " /= " <> show y
380 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
381 let ns = parseToken <$> T.words ngram
383 P.putStrLn $ " " <> T.unpack ngram <> ":"
384 check (==) "count" count (_node_count (_fwd t'))
385 check sim "entropy" entropy (nodeEntropy info_entropy t')
386 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
387 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
388 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (findTrie ns (_bwd nt)))
391 P.putStrLn . Tree.drawTree
393 . toTree (NonTerminal "")
395 -- | TODO real data is a list of tokenized sentences
396 example0, example1, example2, example3, example4, example5, example6 :: [Text]
397 example0 = ["New-York is New-York and New-York"]
398 example1 = ["to-be or not to-be"]
399 example2 = ["to-be-or not to-be-or NOT to-be and"]
400 example3 = example0 <> example0
401 -- > TEST: Should not have York New in the trie
402 example4 = ["a-b-c-d e a-b-c-d f"]
403 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
404 example6 = ["le-petit chat"
410 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
413 [("<start> New", 1, nan, nan, nan, nan, 0.0)
414 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
415 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
416 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
417 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
418 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
419 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
420 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
421 ,("York <stop>", 1, nan, nan, nan, nan, nan)
425 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
426 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
427 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
428 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
429 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
430 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
431 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
438 [("example0", 2, example0, checks0)
439 ,("example1", 2, example1, [])
440 ,("example2", 3, example2, checks2)
441 ,("example3", 2, example3, [])
442 ,("example4", 4, example4, [])
443 ,("example5", 5, example5, [])
445 (\(name, n, ex, checks) -> do
446 P.putStrLn $ name <> " " <> show n
447 b <- testEleve False n ex checks
448 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"