]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
Eleve...
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ catMaybes
32 $ Gargantext.map _hyperdataDocument_abstract docs
33
34 -}
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
41
42 module Gargantext.Text.Eleve where
43
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
46
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
48 import Control.Monad (forM_)
49 import Data.Ord (Ord)
50 import qualified Data.List as L
51 import Data.Monoid
52 import Data.Text (Text)
53 import qualified Data.Text as T
54 import Data.Map (Map)
55 import Data.Maybe (fromMaybe)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
61
62 type Entropy e =
63 ( Fractional e
64 , Floating e
65 , P.RealFloat e
66 , Show e
67 -- ^ TODO: only used for debugging
68 )
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
71 data I e = I
72 { _info_entropy :: e
73 , _info_autonomy :: e
74 }
75
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
78
79 makeLenses ''I
80
81 type ModEntropy i o e = (e -> e) -> i -> o
82
83 set_autonomy :: ModEntropy e (I e) e
84 set_autonomy f e = I e (f e)
85
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
88
89 data Token = NonTerminal Text
90 | Terminal StartStop
91 deriving (Ord, Eq, Show)
92
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
96
97 parseToken :: Text -> Token
98 parseToken "<start>" = Terminal Start
99 parseToken "<stop>" = Terminal Stop
100 parseToken t = NonTerminal t
101
102 toToken :: [Text] -> [Token]
103 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
104
105 printToken :: Token -> Text
106 printToken = f
107 where
108 f (NonTerminal x) = x
109 f (Terminal Start) = "<start>"
110 f (Terminal Stop) = "<stop>"
111
112 ------------------------------------------------------------------------
113
114 data Trie k e
115 = Node { _node_count :: Int
116 , _node_entropy :: e
117 , _node_children :: Map k (Trie k e)
118 }
119 | Leaf { _node_count :: Int }
120 deriving (Show)
121
122 makeLenses ''Trie
123
124 insertTries :: Ord k => [[k]] -> Trie k ()
125 insertTries = L.foldr insertTrie emptyTrie
126
127 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
128 insertTrie [] n = n { _node_count = _node_count n +1}
129 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
130 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
131 where
132 f = Just . insertTrie xs . fromMaybe emptyTrie
133
134 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
135 -- emptyTrie = Node 0 mempty mempty
136 emptyTrie :: Trie k e
137 emptyTrie = Leaf 0
138
139 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
140 mkTrie c children
141 | Map.null children = Leaf c
142 | otherwise = Node c mempty children
143
144 -----------------------------
145
146 -- | Trie to Tree since Tree as nice print function
147 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
148 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
149 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
150
151 ------------------------------------------------------------------------
152 ------------------------------------------------------------------------
153
154 nan :: Floating e => e
155 nan = 0 / 0
156
157 noNaNs :: P.RealFloat e => [e] -> [e]
158 noNaNs = filter (not . P.isNaN)
159
160 updateIfDefined :: P.RealFloat e => e -> e -> e
161 updateIfDefined e0 e | P.isNaN e = e0
162 | otherwise = e
163
164 subst :: Entropy e => (e, e) -> e -> e
165 subst (src, dst) x | sim src x = dst
166 | otherwise = x
167
168 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
169 entropyTrie _ (Leaf c) = Leaf c
170 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
171 where
172 e = sum $ map f $ Map.toList children
173 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
174 else - chc * P.logBase 2 chc
175 where
176 chc = fromIntegral (_node_count child) / fromIntegral c
177 ------------------------------------------------------------------------
178
179 normalizeLevel :: Entropy e => [e] -> e -> e
180 normalizeLevel = checkDiff (go . noNaNs)
181
182 where
183 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
184 checkDiff = identity
185 -- go [] = panic "normalizeLevel: impossible"
186 -- trace "normalizeLevel"
187 -- go [_] = identity
188 go es = \e -> (e - m) / v
189 {-
190 in if P.isNaN e'
191 then trace ("normalizeLevel " <> show (e,m,v,es))
192 e
193 else e'
194 -}
195 where
196 m = mean es
197 v = deviation es
198
199 {- Unused
200
201 nodeChildren :: Trie k e -> Map k (Trie k e)
202 nodeChildren (Node _ _ cs) = cs
203 nodeChildren (Leaf _) = Map.empty
204
205 -}
206
207 class IsTrie trie where
208 buildTrie :: Entropy e => [[Token]] -> trie Token e
209 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
210 nodeChild :: Ord k => k -> trie k e -> trie k e
211 findTrie :: Ord k => [k] -> trie k e -> trie k e
212 normalizeEntropy :: Entropy e
213 => Getting e i e -> ModEntropy i o e
214 -> trie k i -> trie k o
215
216 -- UNUSED
217 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
218 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
219
220 instance IsTrie Trie where
221 buildTrie = entropyTrie isTerminal . insertTries
222
223 nodeEntropy inE (Node _ e _) = e ^. inE
224 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
225 nan
226
227 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
228 nodeChild _ (Leaf _) = emptyTrie
229
230 findTrie ks t = L.foldl (flip nodeChild) t ks
231
232 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
233 where
234 go _ [] _ = panic "normalizeEntropy' empty levels"
235 go _ _ (Leaf c) = Leaf c
236 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
237 go f (es : ess) (Node c i children)
238 -- | any (sim (i ^. inE)) es
239 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
240 -- | otherwise
241 -- = panic "NOT an elem"
242
243
244 {-
245 This is only normalizing a node with respect to its brothers (unlike all the
246 nodes of the same level).
247
248 normalizeEntropy inE modE = go $ modE identity
249 where
250 go _ (Leaf c) = Leaf c
251 go f (Node c i children)
252 | Map.null children =
253 panic "normalizeEntropy: impossible"
254 | otherwise =
255 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
256 where
257 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
258 -}
259 ------------------------------------------------------------------------
260
261 levels :: Trie k e -> [[Trie k e]]
262 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
263 where
264 subForest :: Trie k e -> [Trie k e]
265 subForest (Leaf _) = []
266 subForest (Node _ _ children) = Map.elems children
267
268 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
269 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
270
271 ------------------------------------------------------------------------
272
273 data Tries k e = Tries
274 { _fwd :: Trie k e
275 , _bwd :: Trie k e
276 }
277
278 instance IsTrie Tries where
279 buildTrie tts = Tries { _fwd = buildTrie tts
280 , _bwd = buildTrie (reverse <$> tts)
281 }
282
283 nodeEntropy inE (Tries fwd bwd) =
284 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
285
286 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
287 -- ^^
288 -- TODO: here this is tempting to reverse but this is not always what we
289 -- want. See also nodeAutonomy.
290
291 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
292
293 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
294
295 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
296 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
297
298 ------------------------------------------------------------------------
299 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
300 split _ _ [] = []
301 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
302 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
303 where
304 consRev [] xss = xss
305 consRev xs xss = reverse xs : xss
306
307 go _ pref [] = [reverse pref]
308 go _ pref (Terminal Stop:_) = [reverse pref]
309 go t pref (Terminal Start:xs) = go t pref xs
310 go t pref (x:xs) =
311 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
312 if acc
313 then go xt (x:pref) xs
314 else consRev pref $ go xt0 [x] xs
315 where
316 xt = nodeChild x t
317 xt0 = nodeChild x t0
318 et = ne 0 t
319 -- ^ entropy of the current prefix
320 ext0 = ne 0 xt0
321 -- ^ entropy of [x]
322 ext = ne 0 xt
323 -- ^ entropy of the current prefix plus x
324 acc = ext > et + ext0
325 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
326
327 ne d t = if P.isNaN e then d else e
328 where e = nodeEntropy inE t
329
330 {-
331 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
332 split inE t0 ts =
333 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
334 -}
335
336 ------------------------------------------------------------------------
337 ------------------------------------------------------------------------
338
339 mainEleve :: Int -> [[Text]] -> [[[Text]]]
340 mainEleve _ _ = []
341 {-
342 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
343 where
344 inp = toToken <$> input
345 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
346 -}
347
348 sim :: Entropy e => e -> e -> Bool
349 sim x y = x == y || (P.isNaN x && P.isNaN y)
350
351 chunkAlongEleve :: Int -> [a] -> [[a]]
352 chunkAlongEleve n xs = L.take n <$> L.tails xs
353
354 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
355 testEleve debug n output checks = do
356 let
357 {-
358 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
359 | ps <- L.nub $ [ c
360 | m <- [1..n]
361 , cs <- chunkAlong m 1 <$> inp
362 , c <- cs
363 ]
364 ]
365 -}
366 --res = map (map printToken) . split identity fwd <$> inp
367 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
368 res = map (map printToken) . split info_autonomy nt <$> inp
369 when debug $ do
370 P.putStrLn (show input)
371 -- forM_ pss (P.putStrLn . show)
372 P.putStrLn ""
373 P.putStrLn "Levels:"
374 forM_ (entropyLevels identity (_fwd t)) $ \level ->
375 P.putStrLn $ " " <> show level
376 P.putStrLn ""
377 P.putStrLn "Forward:"
378 printTrie (_fwd nt)
379 P.putStrLn ""
380 P.putStrLn "Backward:"
381 printTrie (_bwd nt)
382 P.putStrLn ""
383 P.putStrLn "Splitting:"
384 P.putStrLn $ show res
385 forM_ checks checker
386 pure $ expected == res
387
388 where
389 out = T.words <$> output
390 expected = fmap (T.splitOn "-") <$> out
391 input = (T.splitOn "-" =<<) <$> out
392 inp = toToken <$> input
393 t = buildTrie $ L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> inp
394 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
395 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
396 nt = normalizeEntropy identity set_autonomy t
397
398 check f msg ref my =
399 if f ref my
400 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
401 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
402
403 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
404 let ns = parseToken <$> T.words ngram
405 t' = findTrie ns nt
406 P.putStrLn $ " " <> T.unpack ngram <> ":"
407 check (==) "count" count (_node_count (_fwd t'))
408 check sim "entropy" entropy (nodeEntropy info_entropy t')
409 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
410 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
411 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
412
413 printTrie =
414 P.putStrLn . Tree.drawTree
415 . fmap show
416 . toTree (NonTerminal "")
417
418 -- | TODO real data is a list of tokenized sentences
419 example0, example1, example2, example3, example4, example5, example6 :: [Text]
420 example0 = ["New-York is New-York and New-York"]
421 example1 = ["to-be or not to-be"]
422 example2 = ["to-be-or not to-be-or NOT to-be and"]
423 example3 = example0 <> example0
424 -- > TEST: Should not have York New in the trie
425 example4 = ["a-b-c-d e a-b-c-d f"]
426 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
427 example6 = ["le-petit chat"
428 ,"le-petit chien"
429 ,"le-petit rat"
430 ,"le gros rat"
431 ]
432
433 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
434
435 checks0 =
436 [("<start>", 1, nan, nan, nan, nan, 0.0)
437 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
438 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
439 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
440 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
441 ,("<stop>", 0, nan, nan, nan, 0.0, nan)
442
443 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
444 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
445 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
446 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
447 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
448 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
449 ,("York <stop>", 1, nan, nan, nan, nan, nan)
450
451 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
452 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
453 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
454 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
455 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
456 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
457 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
458 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
459 ]
460
461
462
463 checks2 =
464 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
465 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
466 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
467 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
468 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
469 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
470 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
471 ]
472
473
474 runTests :: IO ()
475 runTests =
476 forM_
477 [("example0", 2, example0, checks0)
478 ,("example1", 2, example1, [])
479 ,("example2", 3, example2, checks2)
480 ,("example3", 2, example3, [])
481 ,("example4", 4, example4, [])
482 ,("example5", 5, example5, [])
483 ]
484 (\(name, n, ex, checks) -> do
485 P.putStrLn $ name <> " " <> show n
486 b <- testEleve False n ex checks
487 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
488 )