]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
[VETODO] Entropy Variation fixed on test, needs now to be implemented in the Getting...
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ catMaybes
32 $ Gargantext.map _hyperdataDocument_abstract docs
33
34 -}
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
41
42 module Gargantext.Text.Eleve where
43
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
46
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed)
48 import Control.Monad (forM_)
49 import Data.Ord (Ord)
50 import qualified Data.List as L
51 import Data.Monoid
52 import Data.Text (Text)
53 import qualified Data.Text as T
54 import Data.Map (Map)
55 import Data.Maybe (fromMaybe)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
61
62 type Entropy e =
63 ( Fractional e
64 , Floating e
65 , P.RealFloat e
66 , Show e
67 -- ^ TODO: only used for debugging
68 )
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
71 data I e = I
72 { _info_entropy :: e
73 , _info_autonomy :: e
74 }
75
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
78
79 makeLenses ''I
80
81 type ModEntropy i o e = (e -> e) -> i -> o
82
83 set_autonomy :: ModEntropy e (I e) e
84 set_autonomy f e = I e (f e)
85
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
88
89 data Token = NonTerminal Text
90 | Terminal StartStop
91 deriving (Ord, Eq, Show)
92
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
96
97 parseToken :: Text -> Token
98 parseToken "<start>" = Terminal Start
99 parseToken "<stop>" = Terminal Stop
100 parseToken t = NonTerminal t
101
102
103 toToken :: [Text] -> [Token]
104 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
105
106 printToken :: Token -> Text
107 printToken = f
108 where
109 f (NonTerminal x) = x
110 f (Terminal Start) = "<start>"
111 f (Terminal Stop) = "<stop>"
112
113 ------------------------------------------------------------------------
114
115 data Trie k e
116 = Node { _node_count :: Int
117 , _node_entropy :: e
118 , _node_children :: Map k (Trie k e)
119 }
120 | Leaf { _node_count :: Int }
121 deriving (Show)
122
123 makeLenses ''Trie
124
125 insertTries :: Ord k => [[k]] -> Trie k ()
126 insertTries = L.foldr insertTrie emptyTrie
127
128 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
129 insertTrie [] n = n { _node_count = _node_count n +1}
130 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
131 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
132 where
133 f = Just . insertTrie xs . fromMaybe emptyTrie
134
135 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
136 -- emptyTrie = Node 0 mempty mempty
137 emptyTrie :: Trie k e
138 emptyTrie = Leaf 0
139
140 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
141 mkTrie c children
142 | Map.null children = Leaf c
143 | otherwise = Node c mempty children
144
145 -----------------------------
146
147 -- | Trie to Tree since Tree as nice print function
148 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
149 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
150 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
151
152 ------------------------------------------------------------------------
153 ------------------------------------------------------------------------
154
155 nan :: Floating e => e
156 nan = 0 / 0
157
158 noNaNs :: P.RealFloat e => [e] -> [e]
159 noNaNs = filter (not . P.isNaN)
160
161 updateIfDefined :: P.RealFloat e => e -> e -> e
162 updateIfDefined e0 e | P.isNaN e = e0
163 | otherwise = e
164
165 subst :: Entropy e => (e, e) -> e -> e
166 subst (src, dst) x | sim src x = dst
167 | otherwise = x
168
169 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
170 entropyTrie _ (Leaf c) = Leaf c
171 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
172 where
173 e = sum $ map f $ Map.toList children
174 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
175 else - chc * P.logBase 2 chc
176 where
177 chc = fromIntegral (_node_count child) / fromIntegral c
178 ------------------------------------------------------------------------
179
180 normalizeLevel :: Entropy e => [e] -> e -> e
181 normalizeLevel = checkDiff (go . noNaNs)
182
183 where
184 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
185 checkDiff = identity
186 -- go [] = panic "normalizeLevel: impossible"
187 -- trace "normalizeLevel"
188 -- go [_] = identity
189 go es = \e -> (e - m) / v
190 {-
191 in if P.isNaN e'
192 then trace ("normalizeLevel " <> show (e,m,v,es))
193 e
194 else e'
195 -}
196 where
197 m = mean es
198 v = deviation es
199
200 {- Unused
201
202 nodeChildren :: Trie k e -> Map k (Trie k e)
203 nodeChildren (Node _ _ cs) = cs
204 nodeChildren (Leaf _) = Map.empty
205
206 -}
207
208
209 class IsTrie trie where
210 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
211 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
212 nodeChild :: Ord k => k -> trie k e -> trie k e
213 findTrie :: Ord k => [k] -> trie k e -> trie k e
214 normalizeEntropy :: Entropy e
215 => Getting e i e -> ModEntropy i o e
216 -> trie k i -> trie k o
217
218 -- UNUSED
219 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
220 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
221
222 instance IsTrie Trie where
223 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
224
225 nodeEntropy inE (Node _ e _) = e ^. inE
226 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
227 nan
228
229 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
230 nodeChild _ (Leaf _) = emptyTrie
231
232 findTrie ks t = L.foldl (flip nodeChild) t ks
233
234 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
235 where
236 go _ [] _ = panic "normalizeEntropy' empty levels"
237 go _ _ (Leaf c) = Leaf c
238 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
239 go f (es : ess) (Node c i children)
240 -- | any (sim (i ^. inE)) es
241 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
242 -- | otherwise
243 -- = panic "NOT an elem"
244
245
246 {-
247 This is only normalizing a node with respect to its brothers (unlike all the
248 nodes of the same level).
249
250 normalizeEntropy inE modE = go $ modE identity
251 where
252 go _ (Leaf c) = Leaf c
253 go f (Node c i children)
254 | Map.null children =
255 panic "normalizeEntropy: impossible"
256 | otherwise =
257 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
258 where
259 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
260 -}
261 ------------------------------------------------------------------------
262
263 levels :: Trie k e -> [[Trie k e]]
264 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
265 where
266 subForest :: Trie k e -> [Trie k e]
267 subForest (Leaf _) = []
268 subForest (Node _ _ children) = Map.elems children
269
270 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
271 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
272
273 ------------------------------------------------------------------------
274
275 data Tries k e = Tries
276 { _fwd :: Trie k e
277 , _bwd :: Trie k e
278 }
279
280 instance IsTrie Tries where
281 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
282 , _bwd = buildTrie to n (map reverse $ tts)
283 }
284
285 nodeEntropy inE (Tries fwd bwd) =
286 -- VETODO reverse the query for bwd here
287 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
288 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
289
290 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
291 -- ^^
292 -- TODO: here this is tempting to reverse but this is not always what we
293 -- want. See also nodeAutonomy.
294
295 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
296
297 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
298
299 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
300 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
301
302 ------------------------------------------------------------------------
303 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
304 split _ _ [] = []
305 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
306 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
307 where
308 consRev [] xss = xss
309 consRev xs xss = reverse xs : xss
310
311 go _ pref [] = [reverse pref]
312 go _ pref (Terminal Stop:_) = [reverse pref]
313 go t pref (Terminal Start:xs) = go t pref xs
314 go t pref (x:xs) =
315 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
316 if acc
317 then go xt (x:pref) xs
318 else consRev pref $ go xt0 [x] xs
319 where
320 xt = nodeChild x t
321 xt0 = nodeChild x t0
322 et = ne 0 t
323 -- ^ entropy of the current prefix
324 ext0 = ne 0 xt0
325 -- ^ entropy of [x]
326 ext = ne 0 xt
327 -- ^ entropy of the current prefix plus x
328 acc = ext > et + ext0
329 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
330
331 ne d t = if P.isNaN e then d else e
332 where e = nodeEntropy inE t
333
334 {-
335 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
336 split inE t0 ts =
337 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
338 -}
339
340 ------------------------------------------------------------------------
341 ------------------------------------------------------------------------
342
343 mainEleve :: Int -> [[Text]] -> [[[Text]]]
344 mainEleve _ _ = []
345 {-
346 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
347 where
348 inp = toToken <$> input
349 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
350 -}
351
352 sim :: Entropy e => e -> e -> Bool
353 sim x y = x == y || (P.isNaN x && P.isNaN y)
354
355 chunkAlongEleve :: Int -> [a] -> [[a]]
356 chunkAlongEleve n xs = L.take n <$> L.tails xs
357
358 toToken' :: Int -> [[Text]] -> [[Token]]
359 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
360
361 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
362 testEleve debug n output checks = do
363 let
364 {-
365 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
366 | ps <- L.nub $ [ c
367 | m <- [1..n]
368 , cs <- chunkAlong m 1 <$> inp
369 , c <- cs
370 ]
371 ]
372 -}
373 --res = map (map printToken) . split identity fwd <$> inp
374 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
375 res = map (map printToken) . split info_autonomy nt <$> inp
376 when debug $ do
377 P.putStrLn (show input)
378 -- forM_ pss (P.putStrLn . show)
379 P.putStrLn ""
380 P.putStrLn "Levels:"
381 forM_ (entropyLevels identity (_fwd t)) $ \level ->
382 P.putStrLn $ " " <> show level
383 P.putStrLn ""
384 P.putStrLn "Forward:"
385 printTrie (_fwd nt)
386 P.putStrLn ""
387 P.putStrLn "Backward:"
388 printTrie (_bwd nt)
389 P.putStrLn ""
390 P.putStrLn "Splitting:"
391 P.putStrLn $ show res
392 forM_ checks checker
393 pure $ expected == res
394
395 where
396 out = T.words <$> output
397 expected = fmap (T.splitOn "-") <$> out
398 input = (T.splitOn "-" =<<) <$> out
399 inp = toToken <$> input
400 t = buildTrie toToken' n input
401 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
402 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
403 nt = normalizeEntropy identity set_autonomy t
404
405 check f msg ref my =
406 if f ref my
407 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
408 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
409
410 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
411 let ns = parseToken <$> T.words ngram
412 t' = findTrie ns nt
413 nsb = parseToken <$> (reverse $ T.words ngram)
414 tb' = findTrie nsb nt
415 -- TODO put this Variation Entropy at VETODO mark above
416 ev = (mean [(nodeEntropy info_entropy (_fwd t')), (nodeEntropy info_entropy (_bwd tb'))])
417
418 P.putStrLn $ " " <> T.unpack ngram <> ":"
419 check (==) "count" count (_node_count (_fwd t'))
420 check sim "entropy" entropy ev
421 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
422 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
423 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
424
425 printTrie =
426 P.putStrLn . Tree.drawTree
427 . fmap show
428 . toTree (NonTerminal "")
429
430 -- | TODO real data is a list of tokenized sentences
431 example0, example1, example2, example3, example4, example5, example6 :: [Text]
432 example0 = ["New-York is New-York and New-York"]
433 example1 = ["to-be or not to-be"]
434 example2 = ["to-be-or not to-be-or NOT to-be and"]
435 example3 = example0 <> example0
436 -- > TEST: Should not have York New in the trie
437 example4 = ["a-b-c-d e a-b-c-d f"]
438 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
439 example6 = ["le-petit chat"
440 ,"le-petit chien"
441 ,"le-petit rat"
442 ,"le gros rat"
443 ]
444
445 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
446
447 checks0 =
448 [("<start>", 1, nan, nan, nan, nan, 0.0)
449 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
450 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
451 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
452 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
453 ,("<stop>", 0, nan, nan, nan, 0.0, nan)
454
455 {-
456 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
457 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
458 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
459 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
460 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
461 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
462 ,("York <stop>", 1, nan, nan, nan, nan, nan)
463
464 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
465 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
466 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
467 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
468 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
469 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
470 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
471 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
472 --}
473 ]
474
475
476
477 checks2 =
478 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
479 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
480 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
481 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
482 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
483 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
484 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
485 ]
486
487
488 runTests :: IO ()
489 runTests =
490 forM_
491 [("example0", 2, example0, checks0)
492 ,("example1", 2, example1, [])
493 ,("example2", 3, example2, checks2)
494 ,("example3", 2, example3, [])
495 ,("example4", 4, example4, [])
496 ,("example5", 5, example5, [])
497 ]
498 (\(name, n, ex, checks) -> do
499 P.putStrLn $ name <> " " <> show n
500 b <- testEleve False n ex checks
501 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
502 )