2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 # Implementation of Unsupervized Word Segmentation
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
- Unsupervized Word Segmentation: the case for Mandarin Chinese, Pierre
Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
the 50th Annual Meeting of the Association for Computational Linguistics,
pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26 - AD: Real ngrams extraction test
27 from Gargantext.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ Gargantext.map _hyperdataDocument_abstract docs
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE NoImplicitPrelude #-}
36 {-# LANGUAGE OverloadedStrings #-}
37 {-# LANGUAGE RankNTypes #-}
38 {-# LANGUAGE TemplateHaskell #-}
39 {-# LANGUAGE TypeFamilies #-}
41 module Gargantext.Text.Eleve where
43 -- import Debug.Trace (trace)
44 -- import Debug.SimpleReflect
46 import Control.Lens hiding (levels, children)
47 import Control.Monad (forM_)
49 import qualified Data.List as L
51 import Data.Text (Text)
52 import qualified Data.Text as T
54 import Data.Maybe (fromMaybe)
55 import qualified Data.Map as Map
56 import Gargantext.Prelude hiding (cs)
57 import qualified Data.Tree as Tree
58 import Data.Tree (Tree)
59 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
-- | Not-a-number sentinel, used in this module to mark an undefined
-- entropy (e.g. for leaves and for nodes whose children have no counts).
-- Built as @0 / 0@; this is NaN for IEEE floating-point types such as
-- 'Double' (assumed for every instantiation of @e@).
nan :: Floating e => e
nan = 0 / 0
-- | Keep only the defined values of a list, i.e. drop every NaN.
-- The relative order of the remaining elements is preserved.
noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs values = [v | v <- values, not (P.isNaN v)]
-- | Replace a value only when the replacement is defined.
--
-- @updateIfDefined e0 e@ returns the fallback @e0@ when @e@ is NaN and
-- @e@ itself otherwise. The @otherwise@ branch makes the function total;
-- without it any non-NaN @e@ would hit a pattern-match failure.
updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e
  | P.isNaN e = e0
  | otherwise = e
-- | Equality that also treats two NaNs as equal.
-- Plain '==' is used in every other case, so a NaN never matches a
-- defined value.
sim :: Entropy e => e -> e -> Bool
sim a b
  | P.isNaN a && P.isNaN b = True
  | otherwise              = a == b
-- | Substitute one value for another.
--
-- @subst (src, dst) x@ returns @dst@ when @x@ matches @src@ according to
-- 'sim' (so a NaN source matches a NaN input) and leaves @x@ untouched
-- otherwise. The @otherwise@ branch makes the function total; without it
-- every non-matching input would hit a pattern-match failure.
subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x
  | sim src x = dst
  | otherwise = x
77 ------------------------------------------------------------------------
84 -- ^ TODO: only used for debugging
86 ------------------------------------------------------------------------
87 -- | Example and tests for development
90 , _info_entropy_var :: e
-- | Render an 'I' record as the plain triple of its three fields, in
-- constructor order.
instance Show e => Show (I e) where
  show (I ent var aut) = show (ent, var, aut)
-- | A transformation from @inp@ to @out@ that is parameterised by a
-- function over entropy values of type @ent@.
type ModEntropy inp out ent = (ent -> ent) -> inp -> out
-- | Fill the autonomy field of an 'I' record: read the value behind
-- 'info_entropy_var', pass it through @fe@ and store the result behind
-- 'info_autonomy'. The other fields are left unchanged.
set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
set_autonomy fe i = i & info_autonomy .~ autonomy
  where
    autonomy = fe (i ^. info_entropy_var)
-- | Setter that promotes a bare value @e@ to an 'I' record.
-- The original value is kept as the first field, the value produced by the
-- setter's function becomes the second field, and the third field is
-- initialised to 'nan'.
set_entropy_var :: Entropy e => Setter e (I e) e e
set_entropy_var f e = fmap withVariation (f e)
  where
    withVariation ev = I e ev nan
-- | Kind of sentence boundary marker: beginning or end.
data StartStop
  = Start
  | Stop
  deriving (Eq, Ord, Show)
-- | A token of a sentence: either an ordinary word ('NonTerminal') or a
-- sentence boundary marker ('Terminal').
--
-- The 'Terminal' constructor is required by 'isTerminal', 'parseToken'
-- and 'toToken'. It is declared after 'NonTerminal', so the derived 'Ord'
-- sorts words before boundary markers.
data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)
-- | Tell whether a token is a sentence boundary marker.
isTerminal :: Token -> Bool
isTerminal tok = case tok of
  Terminal _    -> True
  NonTerminal _ -> False
-- | Read a token from its textual form: @"<start>"@ and @"<stop>"@ become
-- the boundary markers, any other text becomes a 'NonTerminal'.
parseToken :: Text -> Token
parseToken txt = case txt of
  "<start>" -> Terminal Start
  "<stop>"  -> Terminal Stop
  _         -> NonTerminal txt
-- | Turn the words of a sentence into tokens, wrapping them between a
-- start marker and a stop marker.
toToken :: [Text] -> [Token]
toToken xs = [Terminal Start] <> map NonTerminal xs <> [Terminal Stop]
-- | Render a token as text: a word is printed as is, the boundary markers
-- are printed as @"<start>"@ and @"<stop>"@ (the same spellings that
-- 'parseToken' recognises).
printToken :: Token -> Text
printToken (NonTerminal x)  = x
printToken (Terminal Start) = "<start>"
printToken (Terminal Stop)  = "<stop>"
132 ------------------------------------------------------------------------
135 = Node { _node_count :: Int
137 , _node_children :: Map k (Trie k e)
139 | Leaf { _node_count :: Int }
-- | Insert a sequence of keys into a trie.
--
-- The count of every node on the path is incremented, including the node
-- at which the sequence ends. Nodes are rebuilt with 'mkTrie', so a node
-- without children is always represented as a 'Leaf'.
insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie [] n = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c) = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c + 1) $ Map.alter f x children
  where
    -- Insert the remaining keys below @x@, starting from an empty trie
    -- when @x@ has no child yet. Must be local: it closes over @xs@.
    f = Just . insertTrie xs . fromMaybe emptyTrie
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
-- | The empty trie: a leaf with a count of zero. Using 'Leaf' rather than
-- the 'Node' form sketched above needs no constraint on @k@ or @e@.
emptyTrie :: Trie k e
emptyTrie = Leaf 0
-- | Smart constructor for a trie node from a count and a map of children.
-- An empty map yields a 'Leaf'; otherwise a 'Node' is built whose
-- annotation is 'mempty'.
mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children
161 -----------------------------
-- | Convert a 'Trie' to a 'Tree', since 'Tree' has a nice print function.
-- Every tree node is labelled with its key, its count and, for inner
-- nodes only, its annotation (leaves carry 'Nothing').
toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree key trie = case trie of
  Leaf c           -> Tree.Node (key, c, Nothing) []
  Node c e subtrie -> Tree.Node (key, c, Just e) (map (uncurry toTree) (Map.toList subtrie))
167 ------------------------------------------------------------------------
168 ------------------------------------------------------------------------
-- | Centre a value on @center@ and scale it by @spread@:
-- @(x - center) / spread@.
normalizeLevel :: Entropy e => e -> e -> e -> e
normalizeLevel center spread x = (x - center) / spread
-- | Children of a trie node, keyed by token; a leaf has no children.
nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren trie = case trie of
  Node _ _ kids -> kids
  Leaf _        -> Map.empty
-- | All windows of at most @n@ elements, one per suffix of the input.
-- Because every suffix is used (including the empty one), the trailing
-- windows are shorter than @n@ and the last one is empty.
chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = [L.take n suffix | suffix <- L.tails xs]
-- | Direction in which sentences are read when a trie is built.
data Direction
  = Backward
  | Forward
185 buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
186 buildTrie d n sentences
187 = L.foldr insertTrie emptyTrie
189 $ ( filter (/= [Terminal (term d)])
190 . chunkAlongEleve (n + 1)
195 order Forward = identity
196 order Backward = reverse
198 term Backward = Start
-- | Operations shared by the trie-like structures of this module.
-- @trie k a@ is a trie with keys of type @k@ and node annotations of
-- type @a@.
class IsTrie trie where
  -- | Annotate an un-annotated trie with entropies; the predicate selects
  -- the keys that receive the special treatment.
  entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e

  -- | Read the entropy of the root node through the given getter.
  nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e

  -- | Sub-trie found under one key.
  nodeChild :: Ord k => k -> trie k e -> trie k e

  -- | Sub-trie found by following a path of keys.
  findTrie :: Ord k => [k] -> trie k e -> trie k e

  -- | Print the trie (debugging helper).
  printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()

  -- | Rewrite the annotations: read an entropy with the getter and store a
  -- derived value with the setter.
  evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o

  -- | Normalise the entropies read with the getter, rewriting every
  -- annotation with the given 'ModEntropy'.
  normalizeEntropy
    :: Entropy e
    => Getting e i e
    -> ModEntropy i o e
    -> trie k i
    -> trie k o
211 instance IsTrie Trie where
213 entropyTrie _ (Leaf c) = Leaf c
214 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
216 children' = Map.toList children
217 sum_count = sum $ _node_count . snd <$> children'
218 e | sum_count == 0 = nan
219 | otherwise = sum $ f <$> children'
220 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
221 else - chc * P.logBase 2 chc
223 chc = fromIntegral (_node_count child) / fromIntegral c
225 nodeEntropy inE (Node _ e _) = e ^. inE
226 nodeEntropy _ (Leaf _) = nan
228 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
229 nodeChild _ (Leaf _) = emptyTrie
231 findTrie ks t = L.foldl (flip nodeChild) t ks
234 P.putStrLn . Tree.drawTree
236 $ toTree (NonTerminal "") t
237 P.putStrLn " Levels:"
238 forM_ (normalizationLevels inE t) $ \level ->
239 P.putStrLn $ " " <> show level
241 evTrie inE setEV = go nan
243 go _ (Leaf c) = Leaf c
244 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
250 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
252 go _ _ (Leaf c) = Leaf c
253 go _ [] _ = panic "normalizeEntropy' empty levels"
254 go f ((m, v, _) : ess) (Node c i children)
255 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
256 ------------------------------------------------------------------------
-- | Nodes of a trie grouped by depth: the first group holds the root
-- alone, the next one its children, and so on until a depth with no
-- nodes is reached.
levels :: Trie k e -> [[Trie k e]]
levels root = L.takeWhile nonEmpty (L.iterate descend [root])
  where
    nonEmpty lvl = not (L.null lvl)
    descend lvl  = L.concatMap subForest lvl
-- | Direct children of a trie node as a list (in key order); a leaf has
-- none.
subForest :: Trie k e -> [Trie k e]
subForest trie = case trie of
  Leaf _        -> []
  Node _ _ kids -> Map.elems kids
-- | Defined (non-NaN) entropies of a trie, grouped by depth.
-- The root level is skipped, so the first group belongs to the root's
-- children.
entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE trie = fmap definedEntropies (L.tail (levels trie))
  where
    definedEntropies lvl = noNaNs (map (nodeEntropy inE) lvl)
-- | Per-depth statistics used for normalisation: for every level returned
-- by 'entropyLevels', the mean of its entropies, their deviation and the
-- number of values they were computed from.
normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
normalizationLevels inE = fmap summarize . entropyLevels inE
  where
    -- Kept local so it cannot clash with other helpers of the module.
    summarize es = (mean es, deviation es, length es)
273 ------------------------------------------------------------------------
275 data Tries k e = Tries
-- | Build the forward and the backward trie of a list of tokenised
-- sentences, both with the same size parameter @n@ (passed unchanged to
-- 'buildTrie').
buildTries :: Int -> [[Token]] -> Tries Token ()
buildTries n sentences = Tries
  { _fwd = buildTrie Forward n sentences
  , _bwd = buildTrie Backward n sentences
  }
288 instance IsTrie Tries where
290 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
292 findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
294 nodeChild = onTries . nodeChild
296 entropyTrie = onTries . entropyTrie
298 evTrie inE setEV = onTries $ evTrie inE setEV
300 normalizeEntropy inE = onTries . normalizeEntropy inE
302 printTrie inE (Tries f b) = do
303 P.putStrLn "Forward:"
306 P.putStrLn "Backward:"
-- | Apply the same transformation to both tries of a 'Tries' pair.
onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries transform (Tries forward backward) =
  Tries (transform forward) (transform backward)
312 ------------------------------------------------------------------------
313 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
315 split inE t (Terminal Start:xs) = split inE t xs
316 split inE t (x0:xs0) = go [x0] xs0
319 mayCons xs xss = xs : xss
322 go pref (Terminal Stop:_) = [pref]
323 go _ (Terminal Start:_) = panic "split impossible"
325 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
328 else mayCons pref $ go [x] xs
332 pxt = findTrie prefx t
335 -- ^ entropy of the current prefix
339 -- ^ entropy of the current prefix plus x
340 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > ept + ext)
342 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
347 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
349 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
352 ------------------------------------------------------------------------
354 mainEleve :: Int -> [[Text]] -> [[[Text]]]
355 mainEleve n input = map (map printToken) . split info_autonomy (t :: Tries Token (I Double)) <$> inp
357 inp = toToken <$> input
358 t = normalizeEntropy info_entropy_var set_autonomy
359 . evTrie identity set_entropy_var
360 . entropyTrie isTerminal
364 ---------------------------------------------
366 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
368 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
369 testEleve debug n output checks = do
371 res = map (map printToken) . split info_autonomy nt <$> inp
373 P.putStrLn $ show input
375 printTrie info_entropy nt
377 P.putStrLn "Splitting:"
378 P.putStrLn $ show res
380 pure $ expected == res
383 out = T.words <$> output
384 expected = fmap (T.splitOn "-") <$> out
385 input = (T.splitOn "-" =<<) <$> out
386 inp = toToken <$> input
388 nt :: Tries Token (I Double)
389 nt = normalizeEntropy info_entropy_var set_autonomy
390 . evTrie identity set_entropy_var
391 . entropyTrie isTerminal
396 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
397 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
399 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
400 let ns = parseToken <$> T.words ngram
403 P.putStrLn $ " " <> T.unpack ngram <> ":"
404 check (==) "count" count (_node_count (_fwd nt'))
406 check sim "entropy" entropy (nodeEntropy info_entropy nt' )
407 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
408 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
410 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
411 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
412 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
414 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
415 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
416 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
418 -- | TODO real data is a list of tokenized sentences
419 example0, example1, example2, example3, example4, example5, example6 :: [Text]
420 example0 = ["New-York is New-York and New-York"]
421 example1 = ["to-be or not to-be"]
422 example2 = ["to-be-or not to-be-or NOT to-be and"]
423 example3 = example0 <> example0
424 -- > TEST: Should not have York New in the trie
425 example4 = ["a-b-c-d e a-b-c-d f"]
426 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
427 example6 = ["le-petit chat"
433 checks0, checks2 :: Checks Double
436 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
437 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
438 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
439 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
440 , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
441 , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
442 , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
443 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
444 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
445 , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
446 , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
447 , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
448 , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
449 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
450 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
451 , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
452 , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
453 , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
454 , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
455 , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
456 , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
457 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
462 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
463 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
464 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
465 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
466 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
467 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
468 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
475 [("example0", 3, example0, checks0)
476 ,("example0", 2, example0, [])
477 ,("example1", 2, example1, [])
478 ,("example2", 3, example2, checks2)
479 ,("example3", 2, example3, [])
480 ,("example4", 4, example4, [])
481 ,("example5", 5, example5, [])
482 ,("example6", 2, example6, [])
484 (\(name, n, ex, checks) -> do
485 P.putStrLn $ name <> " " <> show n
486 b <- testEleve False n ex checks
487 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"