]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
Eleve: improve splitting which passes 5/7 tests but still lacks a crucial point
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
25
26 - AD: Real ngrams extraction test
27 from Gargantext.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
30 $ catMaybes
31 $ Gargantext.map _hyperdataDocument_abstract docs
32
33 -}
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE NoImplicitPrelude #-}
36 {-# LANGUAGE OverloadedStrings #-}
37 {-# LANGUAGE RankNTypes #-}
38 {-# LANGUAGE TemplateHaskell #-}
39 {-# LANGUAGE TypeFamilies #-}
40
41 module Gargantext.Text.Eleve where
42
43 -- import Debug.Trace (trace)
44 -- import Debug.SimpleReflect
45
46 import Control.Lens hiding (levels, children)
47 import Control.Monad (forM_)
48 import Data.Ord (Ord)
49 import qualified Data.List as L
50 import Data.Monoid
51 import Data.Text (Text)
52 import qualified Data.Text as T
53 import Data.Map (Map)
54 import Data.Maybe (fromMaybe)
55 import qualified Data.Map as Map
56 import Gargantext.Prelude hiding (cs)
57 import qualified Data.Tree as Tree
58 import Data.Tree (Tree)
59 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
60
61 nan :: Floating e => e
62 nan = 0 / 0
63
64 noNaNs :: P.RealFloat e => [e] -> [e]
65 noNaNs = filter (not . P.isNaN)
66
67 updateIfDefined :: P.RealFloat e => e -> e -> e
68 updateIfDefined e0 e | P.isNaN e = e0
69 | otherwise = e
70
71 sim :: Entropy e => e -> e -> Bool
72 sim x y = x == y || (P.isNaN x && P.isNaN y)
73
74 subst :: Entropy e => (e, e) -> e -> e
75 subst (src, dst) x | sim src x = dst
76 | otherwise = x
77 ------------------------------------------------------------------------
78
79 type Entropy e =
80 ( Fractional e
81 , Floating e
82 , P.RealFloat e
83 , Show e
84 -- ^ TODO: only used for debugging
85 )
86 ------------------------------------------------------------------------
87 -- | Example and tests for development
88 data I e = I
89 { _info_entropy :: e
90 , _info_entropy_var :: e
91 , _info_autonomy :: e
92 }
93
94 instance Show e => Show (I e) where
95 show (I e ev a) = show (e, ev, a)
96
97 makeLenses ''I
98
99 type ModEntropy i o e = (e -> e) -> i -> o
100
101 set_autonomy :: Entropy e => ModEntropy e (I e) e
102 set_autonomy f e = I e nan (f e)
103
104 set_entropy_var :: Entropy e => Setter e (I e) e e
105 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
106
107 data StartStop = Start | Stop
108 deriving (Ord, Eq, Show)
109
110 data Token = NonTerminal Text
111 | Terminal StartStop
112 deriving (Ord, Eq, Show)
113
114 isTerminal :: Token -> Bool
115 isTerminal (Terminal _) = True
116 isTerminal (NonTerminal _) = False
117
118 parseToken :: Text -> Token
119 parseToken "<start>" = Terminal Start
120 parseToken "<stop>" = Terminal Stop
121 parseToken t = NonTerminal t
122
123 toToken :: [Text] -> [Token]
124 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
125
126 printToken :: Token -> Text
127 printToken = f
128 where
129 f (NonTerminal x) = x
130 f (Terminal Start) = "<start>"
131 f (Terminal Stop) = "<stop>"
132 ------------------------------------------------------------------------
133
134 data Trie k e
135 = Node { _node_count :: Int
136 , _node_entropy :: e
137 , _node_children :: Map k (Trie k e)
138 }
139 | Leaf { _node_count :: Int }
140 deriving (Show)
141
142 makeLenses ''Trie
143
144 insertTries :: Ord k => [[k]] -> Trie k ()
145 insertTries = L.foldr insertTrie emptyTrie
146
147 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
148 insertTrie [] n = n { _node_count = _node_count n +1}
149 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
150 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
151 where
152 f = Just . insertTrie xs . fromMaybe emptyTrie
153
154 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
155 -- emptyTrie = Node 0 mempty mempty
156 emptyTrie :: Trie k e
157 emptyTrie = Leaf 0
158
159 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
160 mkTrie c children
161 | Map.null children = Leaf c
162 | otherwise = Node c mempty children
163
164 -----------------------------
165 -- | Trie to Tree since Tree as nice print function
166 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
167 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
168 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
169
170 ------------------------------------------------------------------------
171 ------------------------------------------------------------------------
172 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
173 entropyTrie _ (Leaf c) = Leaf c
174 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
175 where
176 children' = Map.toList children
177 sum_count = sum $ _node_count . snd <$> children'
178 e | sum_count == 0 = nan
179 | otherwise = sum $ f <$> children'
180 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
181 else - chc * P.logBase 2 chc
182 where
183 chc = fromIntegral (_node_count child) / fromIntegral c
184 ------------------------------------------------------------------------
185 normalizeLevel :: Entropy e => e -> e -> e -> e
186 normalizeLevel m v e = (e - m) / v
187
188 {- Unused
189
190 nodeChildren :: Trie k e -> Map k (Trie k e)
191 nodeChildren (Node _ _ cs) = cs
192 nodeChildren (Leaf _) = Map.empty
193
194 -}
195
196
197 class IsTrie trie where
198 buildTrie :: Entropy e => [[Token]] -> trie Token e
199 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
200 nodeChild :: Ord k => k -> trie k e -> trie k e
201 findTrie :: Ord k => [k] -> trie k e -> trie k e
202 findTrieR :: Ord k => [k] -> trie k e -> trie k e
203 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
204 evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
205 normalizeEntropy :: Entropy e
206 => Getting e i e -> ModEntropy i o e
207 -> trie k i -> trie k o
208
209 -- UNUSED
210 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
211 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
212
213 instance IsTrie Trie where
214 buildTrie ts = entropyTrie isTerminal $ insertTries ts
215
216 nodeEntropy inE (Node _ e _) = e ^. inE
217 nodeEntropy _ (Leaf _) = nan
218
219 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
220 nodeChild _ (Leaf _) = emptyTrie
221
222 findTrie ks t = L.foldl (flip nodeChild) t ks
223 findTrieR = findTrie
224
225 printTrie inE t = do
226 P.putStrLn . Tree.drawTree
227 . fmap show
228 $ toTree (NonTerminal "") t
229 P.putStrLn " Levels:"
230 forM_ (normalizationLevels inE t) $ \level ->
231 P.putStrLn $ " " <> show level
232
233 evTrie inE setEV = go nan
234 where
235 go _ (Leaf c) = Leaf c
236 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
237 where e1 = i ^. inE
238
239 ev 0 0 = nan
240 ev i0 i1 = i1 - i0
241
242 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
243 where
244 go _ _ (Leaf c) = Leaf c
245 go _ [] _ = panic "normalizeEntropy' empty levels"
246 go f ((m, v, _) : ess) (Node c i children)
247 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
248 ------------------------------------------------------------------------
249
250 levels :: Trie k e -> [[Trie k e]]
251 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
252 where
253 subForest :: Trie k e -> [Trie k e]
254 subForest (Leaf _) = []
255 subForest (Node _ _ children) = Map.elems children
256
257 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
258 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
259
260 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
261 normalizationLevels inE = fmap f . entropyLevels inE
262 where
263 f es = (mean es, deviation es, length es)
264
265 ------------------------------------------------------------------------
266
267 data Tries k e = Tries
268 { _fwd :: Trie k e
269 , _bwd :: Trie k e
270 }
271
272 makeLenses ''Tries
273
274 nodeEntropySafe :: Entropy e => Getting e i e -> Tries k i -> e
275 nodeEntropySafe inE (Tries f b) =
276 mean $ noNaNs [nodeEntropy inE f, nodeEntropy inE b]
277
278 nodeEntropyBwdOpt :: Entropy e => Getting e i e -> Tries k i -> e
279 nodeEntropyBwdOpt inE (Tries f b) =
280 mean $ nodeEntropy inE f : noNaNs [nodeEntropy inE b]
281
282 instance IsTrie Tries where
283 buildTrie tts = Tries { _fwd = buildTrie tts
284 , _bwd = buildTrie (reverse <$> tts)
285 }
286
287 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
288
289 findTrie ks = onTries (findTrie ks)
290 findTrieR ks (Tries f b) = Tries (findTrieR ks f) (findTrieR (reverse ks) b)
291
292 nodeChild = onTries . nodeChild
293
294 evTrie inE setEV = onTries $ evTrie inE setEV
295
296 normalizeEntropy inE = onTries . normalizeEntropy inE
297
298 printTrie inE (Tries f b) = do
299 P.putStrLn "Forward:"
300 printTrie inE f
301 P.putStrLn ""
302 P.putStrLn "Backward:"
303 printTrie inE b
304
305 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
306 onTries h (Tries f b) = Tries (h f) (h b)
307
308 ------------------------------------------------------------------------
309 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
310 split _ _ [] = []
311 split inE t (Terminal Start:xs) = split inE t xs
312 split inE t (x0:xs0) = go [x0] xs0
313 where
314 mayCons [] xss = xss
315 mayCons xs xss = xs : xss
316
317 go pref [] = [pref]
318 go pref (Terminal Stop:_) = [pref]
319 go _ (Terminal Start:_) = panic "split impossible"
320 go pref (x:xs) =
321 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
322 if acc
323 then go prefx xs
324 else mayCons pref $ go [x] xs
325 where
326 prefx = pref <> [x]
327 pt = findTrieR pref t
328 pxt = findTrieR prefx t
329 xt = findTrieR [x] t
330 ept = ne pt
331 -- ^ entropy of the current prefix
332 ext = ne xt
333 -- ^ entropy of [x]
334 epxt = ne pxt
335 -- ^ entropy of the current prefix plus x
336 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > ept + ext)
337
338 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
339
340 ne = nodeEntropy inE
341
342 {-
343 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
344 split inE t0 ts =
345 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
346 -}
347
348 ------------------------------------------------------------------------
349 ------------------------------------------------------------------------
350
351 mainEleve :: Int -> [[Text]] -> [[[Text]]]
352 mainEleve _ _ = []
353 {-
354 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
355 where
356 inp = toToken <$> input
357 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
358 -}
359
360 chunkAlongEleve :: Int -> [a] -> [[a]]
361 chunkAlongEleve n xs = L.take n <$> L.tails xs
362
363 data Order = Backward | Forward
364
365 toToken' :: Order -> Int -> [[Text]] -> [[Token]]
366 toToken' o n input = L.concat
367 $ ( filter (/= [Terminal (term o)])
368 . chunkAlongEleve (n + 1)
369 . (order o)
370 )
371 <$> toToken
372 <$> input
373 where
374 order Forward = identity
375 order Backward = reverse
376 term Forward = Stop
377 term Backward = Start
378
379
380 ---------------------------------------------
381
382 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
383
384 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
385 testEleve debug n output checks = do
386 let
387 {-
388 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
389 | ps <- L.nub $ [ c
390 | m <- [1..n]
391 , cs <- chunkAlong m 1 <$> inp
392 , c <- cs
393 ]
394 ]
395 -}
396 res = map (map printToken) . split info_autonomy nt <$> inp
397 when debug $ do
398 P.putStrLn (show input)
399 -- forM_ pss (P.putStrLn . show)
400 P.putStrLn ""
401 printTrie info_entropy nt
402 -- P.putStrLn ""
403 -- P.putStrLn "Entropy Var:"
404 -- printTrie identity t''
405 P.putStrLn ""
406 P.putStrLn "Splitting:"
407 P.putStrLn $ show res
408 forM_ checks checker
409 pure $ expected == res
410
411 where
412 out = T.words <$> output
413 expected = fmap (T.splitOn "-") <$> out
414 input = (T.splitOn "-" =<<) <$> out
415 inp = toToken <$> input
416
417 t :: Tries Token Double
418 t = -- buildTrie (toToken' n input)
419 Tries { _fwd = buildTrie (toToken' Forward n input)
420 , _bwd = buildTrie (toToken' Backward n input)
421 }
422
423 evt :: Tries Token (I Double)
424 evt = evTrie identity set_entropy_var t
425
426 nt :: Tries Token (I Double)
427 nt = normalizeEntropy info_entropy_var (\fe i -> i & info_autonomy .~ fe (i ^. info_entropy_var)) evt
428
429 -- t'' :: Trie Token Double
430 -- t'' = set_entropy_vars info_autonomy (\e _i -> e) nt
431
432 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
433 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
434
435 check f msg ref my =
436 if f ref my
437 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
438 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
439
440 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
441 let ns = parseToken <$> T.words ngram
442 nt' = findTrieR ns nt
443
444 P.putStrLn $ " " <> T.unpack ngram <> ":"
445 check (==) "count" count (_node_count (_fwd nt'))
446
447 check sim "entropy" entropy (nodeEntropy info_entropy nt' )
448 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
449 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
450
451 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
452 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
453 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
454
455 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
456 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
457 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
458
459 -- | TODO real data is a list of tokenized sentences
460 example0, example1, example2, example3, example4, example5, example6 :: [Text]
461 example0 = ["New-York is New-York and New-York"]
462 example1 = ["to-be or not to-be"]
463 example2 = ["to-be-or not to-be-or NOT to-be and"]
464 example3 = example0 <> example0
465 -- > TEST: Should not have York New in the trie
466 example4 = ["a-b-c-d e a-b-c-d f"]
467 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
468 example6 = ["le-petit chat"
469 ,"le-petit chien"
470 ,"le-petit rat"
471 ,"le gros rat"
472 ]
473
474 checks0, checks2 :: Checks Double
475
476 checks0 =
477 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
478 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
479 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
480 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
481 , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
482 , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
483 , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
484 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
485 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
486 , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
487 , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
488 , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
489 , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
490 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
491 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
492 , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
493 , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
494 , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
495 , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
496 , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
497 , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
498 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
499 ]
500
501 checks2 = []
502 {-
503 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
504 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
505 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
506 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
507 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
508 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
509 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
510 ]
511 -}
512
513 runTests :: IO ()
514 runTests =
515 forM_
516 [("example0", 2, example0, checks0)
517 ,("example1", 2, example1, [])
518 ,("example2", 3, example2, checks2)
519 ,("example3", 2, example3, [])
520 ,("example4", 4, example4, [])
521 ,("example5", 5, example5, [])
522 ,("example6", 2, example6, [])
523 ]
524 (\(name, n, ex, checks) -> do
525 P.putStrLn $ name <> " " <> show n
526 b <- testEleve False n ex checks
527 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
528 )