]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
[TEST] uncommenting tests.
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ catMaybes
32 $ Gargantext.map _hyperdataDocument_abstract docs
33
34 -}
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
41
42 module Gargantext.Text.Eleve where
43
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
46
47 import Data.Functor.Reverse
48 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just, under, reversed)
49 import Control.Monad (forM_)
50 import Data.Ord (Ord)
51 import qualified Data.List as L
52 import Data.Monoid
53 import Data.Text (Text)
54 import qualified Data.Text as T
55 import Data.Map (Map)
56 import Data.Maybe (fromMaybe)
57 import qualified Data.Map as Map
58 import Gargantext.Prelude hiding (cs)
59 import qualified Data.Tree as Tree
60 import Data.Tree (Tree)
61 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
62
63 type Entropy e =
64 ( Fractional e
65 , Floating e
66 , P.RealFloat e
67 , Show e
68 -- ^ TODO: only used for debugging
69 )
70 ------------------------------------------------------------------------
71 -- | Example and tests for development
72 data I e = I
73 { _info_entropy :: e
74 , _info_autonomy :: e
75 }
76
77 instance Show e => Show (I e) where
78 show (I e n) = show (e, n)
79
80 makeLenses ''I
81
82 type ModEntropy i o e = (e -> e) -> i -> o
83
84 set_autonomy :: ModEntropy e (I e) e
85 set_autonomy f e = I e (f e)
86
87 data StartStop = Start | Stop
88 deriving (Ord, Eq, Show)
89
90 data Token = NonTerminal Text
91 | Terminal StartStop
92 deriving (Ord, Eq, Show)
93
94 isTerminal :: Token -> Bool
95 isTerminal (Terminal _) = True
96 isTerminal (NonTerminal _) = False
97
98 parseToken :: Text -> Token
99 parseToken "<start>" = Terminal Start
100 parseToken "<stop>" = Terminal Stop
101 parseToken t = NonTerminal t
102
103
104 toToken :: [Text] -> [Token]
105 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
106
107 printToken :: Token -> Text
108 printToken = f
109 where
110 f (NonTerminal x) = x
111 f (Terminal Start) = "<start>"
112 f (Terminal Stop) = "<stop>"
113
114 ------------------------------------------------------------------------
115
116 data Trie k e
117 = Node { _node_count :: Int
118 , _node_entropy :: e
119 , _node_children :: Map k (Trie k e)
120 }
121 | Leaf { _node_count :: Int }
122 deriving (Show)
123
124 makeLenses ''Trie
125
126 insertTries :: Ord k => [[k]] -> Trie k ()
127 insertTries = L.foldr insertTrie emptyTrie
128
129 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
130 insertTrie [] n = n { _node_count = _node_count n +1}
131 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
132 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
133 where
134 f = Just . insertTrie xs . fromMaybe emptyTrie
135
136 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
137 -- emptyTrie = Node 0 mempty mempty
138 emptyTrie :: Trie k e
139 emptyTrie = Leaf 0
140
141 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
142 mkTrie c children
143 | Map.null children = Leaf c
144 | otherwise = Node c mempty children
145
146 -----------------------------
147
148 -- | Trie to Tree since Tree as nice print function
149 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
150 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
151 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
152
153 ------------------------------------------------------------------------
154 ------------------------------------------------------------------------
155
156 nan :: Floating e => e
157 nan = 0 / 0
158
159 noNaNs :: P.RealFloat e => [e] -> [e]
160 noNaNs = filter (not . P.isNaN)
161
162 updateIfDefined :: P.RealFloat e => e -> e -> e
163 updateIfDefined e0 e | P.isNaN e = e0
164 | otherwise = e
165
166 subst :: Entropy e => (e, e) -> e -> e
167 subst (src, dst) x | sim src x = dst
168 | otherwise = x
169
170 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
171 entropyTrie _ (Leaf c) = Leaf c
172 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
173 where
174 e = sum $ map f $ Map.toList children
175 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
176 else - chc * P.logBase 2 chc
177 where
178 chc = fromIntegral (_node_count child) / fromIntegral c
179 ------------------------------------------------------------------------
180
181 normalizeLevel :: Entropy e => [e] -> e -> e
182 normalizeLevel = checkDiff (go . noNaNs)
183
184 where
185 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
186 checkDiff = identity
187 -- go [] = panic "normalizeLevel: impossible"
188 -- trace "normalizeLevel"
189 -- go [_] = identity
190 go es = \e -> (e - m) / v
191 {-
192 in if P.isNaN e'
193 then trace ("normalizeLevel " <> show (e,m,v,es))
194 e
195 else e'
196 -}
197 where
198 m = mean es
199 v = deviation es
200
201 {- Unused
202
203 nodeChildren :: Trie k e -> Map k (Trie k e)
204 nodeChildren (Node _ _ cs) = cs
205 nodeChildren (Leaf _) = Map.empty
206
207 -}
208
209
210 class IsTrie trie where
211 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
212 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
213 nodeChild :: Ord k => k -> trie k e -> trie k e
214 findTrie :: Ord k => [k] -> trie k e -> trie k e
215 normalizeEntropy :: Entropy e
216 => Getting e i e -> ModEntropy i o e
217 -> trie k i -> trie k o
218
219 -- UNUSED
220 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
221 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
222
223 instance IsTrie Trie where
224 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
225
226 nodeEntropy inE (Node _ e _) = e ^. inE
227 nodeEntropy _ (Leaf _) = nan
228
229 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
230 nodeChild _ (Leaf _) = emptyTrie
231
232 findTrie ks t = L.foldl (flip nodeChild) t ks
233
234 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
235 where
236 go _ [] _ = panic "normalizeEntropy' empty levels"
237 go _ _ (Leaf c) = Leaf c
238 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
239 go f (es : ess) (Node c i children)
240 -- | any (sim (i ^. inE)) es
241 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
242 -- | otherwise
243 -- = panic "NOT an elem"
244
245
246 {-
247 This is only normalizing a node with respect to its brothers (unlike all the
248 nodes of the same level).
249
250 normalizeEntropy inE modE = go $ modE identity
251 where
252 go _ (Leaf c) = Leaf c
253 go f (Node c i children)
254 | Map.null children =
255 panic "normalizeEntropy: impossible"
256 | otherwise =
257 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
258 where
259 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
260 -}
261 ------------------------------------------------------------------------
262
263 levels :: Trie k e -> [[Trie k e]]
264 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
265 where
266 subForest :: Trie k e -> [Trie k e]
267 subForest (Leaf _) = []
268 subForest (Node _ _ children) = Map.elems children
269
270 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
271 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
272
273 ------------------------------------------------------------------------
274
275 data Tries k e = Tries
276 { _fwd :: Trie k e
277 , _bwd :: Trie k e
278 }
279
280 instance IsTrie Tries where
281 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
282 , _bwd = buildTrie to n (map reverse $ tts)
283 }
284
285 nodeEntropy inE (Tries fwd bwd) =
286 -- VETODO reverse the query for bwd here
287 -- mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd . under reversed]
288 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
289
290 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
291 -- ^^
292 -- TODO: here this is tempting to reverse but this is not always what we
293 -- want. See also nodeAutonomy.
294 -- AD: I also tried to reverse here and I confirm getting unexpected results (whereas VETODO FIX below is ok)
295 -- since recursivity of the function makes the reverse multiple times (I guess)
296
297 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
298
299 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
300
301 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
302 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
303
304 ------------------------------------------------------------------------
305 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
306 split _ _ [] = []
307 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
308 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
309 where
310 consRev [] xss = xss
311 consRev xs xss = reverse xs : xss
312
313 go _ pref [] = [reverse pref]
314 go _ pref (Terminal Stop:_) = [reverse pref]
315 go t pref (Terminal Start:xs) = go t pref xs
316 go t pref (x:xs) =
317 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
318 if acc
319 then go xt (x:pref) xs
320 else consRev pref $ go xt0 [x] xs
321 where
322 xt = nodeChild x t
323 xt0 = nodeChild x t0
324 et = ne 0 t
325 -- ^ entropy of the current prefix
326 ext0 = ne 0 xt0
327 -- ^ entropy of [x]
328 ext = ne 0 xt
329 -- ^ entropy of the current prefix plus x
330 acc = ext > et + ext0
331 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
332
333 ne d t = if P.isNaN e then d else e
334 where e = nodeEntropy inE t
335
336 {-
337 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
338 split inE t0 ts =
339 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
340 -}
341
342 ------------------------------------------------------------------------
343 ------------------------------------------------------------------------
344
345 mainEleve :: Int -> [[Text]] -> [[[Text]]]
346 mainEleve _ _ = []
347 {-
348 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
349 where
350 inp = toToken <$> input
351 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
352 -}
353
354 sim :: Entropy e => e -> e -> Bool
355 sim x y = x == y || (P.isNaN x && P.isNaN y)
356
357 chunkAlongEleve :: Int -> [a] -> [[a]]
358 chunkAlongEleve n xs = L.take n <$> L.tails xs
359
360 toToken' :: Int -> [[Text]] -> [[Token]]
361 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
362
363 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
364 testEleve debug n output checks = do
365 let
366 {-
367 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
368 | ps <- L.nub $ [ c
369 | m <- [1..n]
370 , cs <- chunkAlong m 1 <$> inp
371 , c <- cs
372 ]
373 ]
374 -}
375 --res = map (map printToken) . split identity fwd <$> inp
376 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
377 res = map (map printToken) . split info_autonomy nt <$> inp
378 when debug $ do
379 P.putStrLn (show input)
380 -- forM_ pss (P.putStrLn . show)
381 P.putStrLn ""
382 P.putStrLn "Levels:"
383 forM_ (entropyLevels identity (_fwd t)) $ \level ->
384 P.putStrLn $ " " <> show level
385 P.putStrLn ""
386 P.putStrLn "Forward:"
387 printTrie (_fwd nt)
388 P.putStrLn ""
389 P.putStrLn "Backward:"
390 printTrie (_bwd nt)
391 P.putStrLn ""
392 P.putStrLn "Splitting:"
393 P.putStrLn $ show res
394 forM_ checks checker
395 pure $ expected == res
396
397 where
398 out = T.words <$> output
399 expected = fmap (T.splitOn "-") <$> out
400 input = (T.splitOn "-" =<<) <$> out
401 inp = toToken <$> input
402 t = buildTrie toToken' n input
403 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
404 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
405 nt = normalizeEntropy identity set_autonomy t
406
407 check f msg ref my =
408 if f ref my
409 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
410 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
411
412 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
413 let ns = parseToken <$> T.words ngram
414 nsb = parseToken <$> (reverse $ T.words ngram)
415
416 t' = findTrie ns nt
417 tb' = findTrie nsb nt
418 -- TODO put this Variation Entropy at VETODO mark above maybe in nodeEntropy ?
419 ev = (mean [(nodeEntropy info_entropy (_fwd t')), (nodeEntropy info_entropy (_bwd tb'))])
420
421 P.putStrLn $ " " <> T.unpack ngram <> ":"
422 check (==) "count" count (_node_count (_fwd t'))
423 check sim "entropy" entropy ev
424 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
425 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
426 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
427
428 printTrie =
429 P.putStrLn . Tree.drawTree
430 . fmap show
431 . toTree (NonTerminal "")
432
433 -- | TODO real data is a list of tokenized sentences
434 example0, example1, example2, example3, example4, example5, example6 :: [Text]
435 example0 = ["New-York is New-York and New-York"]
436 example1 = ["to-be or not to-be"]
437 example2 = ["to-be-or not to-be-or NOT to-be and"]
438 example3 = example0 <> example0
439 -- > TEST: Should not have York New in the trie
440 example4 = ["a-b-c-d e a-b-c-d f"]
441 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
442 example6 = ["le-petit chat"
443 ,"le-petit chien"
444 ,"le-petit rat"
445 ,"le gros rat"
446 ]
447
448 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
449
450 checks0 =
451 [("<start>", 1, nan, nan, nan, nan, 0.0)
452 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
453 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
454 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
455 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
456 ,("<stop>", 0, nan, nan, nan, 0.0, nan)
457
458 --{-
459 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
460 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
461 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
462 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
463 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
464 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
465 ,("York <stop>", 1, nan, nan, nan, nan, nan)
466
467 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
468 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
469 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
470 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
471 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
472 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
473 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
474 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
475 --}
476 ]
477
478
479
480 checks2 =
481 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
482 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
483 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
484 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
485 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
486 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
487 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
488 ]
489
490
491 runTests :: IO ()
492 runTests =
493 forM_
494 [("example0", 2, example0, checks0)
495 ,("example1", 2, example1, [])
496 ,("example2", 3, example2, checks2)
497 ,("example3", 2, example3, [])
498 ,("example4", 4, example4, [])
499 ,("example5", 5, example5, [])
500 ]
501 (\(name, n, ex, checks) -> do
502 P.putStrLn $ name <> " " <> show n
503 b <- testEleve False n ex checks
504 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
505 )