]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
[FIX][Ngrams] order of tokens.
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ catMaybes
32 $ Gargantext.map _hyperdataDocument_abstract docs
33
34 -}
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
41
42 module Gargantext.Text.Eleve where
43
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
46
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
48 import Control.Monad (forM_)
49 import Data.Ord (Ord)
50 import qualified Data.List as L
51 import Data.Monoid
52 import Data.Text (Text)
53 import qualified Data.Text as T
54 import Data.Map (Map)
55 import Data.Maybe (fromMaybe)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
61
62 type Entropy e =
63 ( Fractional e
64 , Floating e
65 , P.RealFloat e
66 , Show e
67 -- ^ TODO: only used for debugging
68 )
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
71 data I e = I
72 { _info_entropy :: e
73 , _info_autonomy :: e
74 }
75
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
78
79 makeLenses ''I
80
81 type ModEntropy i o e = (e -> e) -> i -> o
82
83 set_autonomy :: ModEntropy e (I e) e
84 set_autonomy f e = I e (f e)
85
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
88
89 data Token = NonTerminal Text
90 | Terminal StartStop
91 deriving (Ord, Eq, Show)
92
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
96
97 parseToken :: Text -> Token
98 parseToken "<start>" = Terminal Start
99 parseToken "<stop>" = Terminal Stop
100 parseToken t = NonTerminal t
101
102 toToken :: [Text] -> [Token]
103 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
104
105 printToken :: Token -> Text
106 printToken = f
107 where
108 f (NonTerminal x) = x
109 f (Terminal Start) = "<start>"
110 f (Terminal Stop) = "<stop>"
111
112 ------------------------------------------------------------------------
113
114 data Trie k e
115 = Node { _node_count :: Int
116 , _node_entropy :: e
117 , _node_children :: Map k (Trie k e)
118 }
119 | Leaf { _node_count :: Int }
120 deriving (Show)
121
122 makeLenses ''Trie
123
124 insertTries :: Ord k => [[k]] -> Trie k ()
125 insertTries = L.foldr insertTrie emptyTrie
126
127 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
128 insertTrie [] n = n { _node_count = _node_count n +1}
129 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
130 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
131 where
132 f = Just . insertTrie xs . fromMaybe emptyTrie
133
134 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
135 -- emptyTrie = Node 0 mempty mempty
136 emptyTrie :: Trie k e
137 emptyTrie = Leaf 0
138
139 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
140 mkTrie c children
141 | Map.null children = Leaf c
142 | otherwise = Node c mempty children
143
144 -----------------------------
145
146 -- | Trie to Tree since Tree as nice print function
147 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
148 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
149 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
150
151 ------------------------------------------------------------------------
152 ------------------------------------------------------------------------
153
154 nan :: Floating e => e
155 nan = 0 / 0
156
157 noNaNs :: P.RealFloat e => [e] -> [e]
158 noNaNs = filter (not . P.isNaN)
159
160 updateIfDefined :: P.RealFloat e => e -> e -> e
161 updateIfDefined e0 e | P.isNaN e = e0
162 | otherwise = e
163
164 subst :: Entropy e => (e, e) -> e -> e
165 subst (src, dst) x | sim src x = dst
166 | otherwise = x
167
168 entropyTrie :: Entropy e => (k -> Bool) -> Trie k () -> Trie k e
169 entropyTrie _ (Leaf c) = Leaf c
170 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
171 where
172 e = sum $ map f $ Map.toList children
173 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
174 else - chc * P.logBase 2 chc
175 where
176 chc = fromIntegral (_node_count child) / fromIntegral c
177 ------------------------------------------------------------------------
178
179 normalizeLevel :: Entropy e => [e] -> e -> e
180 normalizeLevel = checkDiff (go . noNaNs)
181
182 where
183 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
184 checkDiff = identity
185 -- go [] = panic "normalizeLevel: impossible"
186 -- trace "normalizeLevel"
187 -- go [_] = identity
188 go es = \e -> (e - m) / v
189 {-
190 in if P.isNaN e'
191 then trace ("normalizeLevel " <> show (e,m,v,es))
192 e
193 else e'
194 -}
195 where
196 m = mean es
197 v = deviation es
198
199 {- Unused
200
201 nodeChildren :: Trie k e -> Map k (Trie k e)
202 nodeChildren (Node _ _ cs) = cs
203 nodeChildren (Leaf _) = Map.empty
204
205 -}
206
207 data Ward = ForWard | BackWard
208
209 class IsTrie trie where
210 buildTrie :: Entropy e => (Int -> [[Text]] -> [[Token]]) -> Int -> [[Text]] -> trie Token e
211 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
212 nodeChild :: Ord k => k -> trie k e -> trie k e
213 findTrie :: Ord k => [k] -> trie k e -> trie k e
214 normalizeEntropy :: Entropy e
215 => Getting e i e -> ModEntropy i o e
216 -> trie k i -> trie k o
217
218 -- UNUSED
219 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
220 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
221
222 instance IsTrie Trie where
223 buildTrie to n ts = entropyTrie isTerminal $ insertTries $ to n ts
224
225 nodeEntropy inE (Node _ e _) = e ^. inE
226 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
227 nan
228
229 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
230 nodeChild _ (Leaf _) = emptyTrie
231
232 findTrie ks t = L.foldl (flip nodeChild) t ks
233
234 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
235 where
236 go _ [] _ = panic "normalizeEntropy' empty levels"
237 go _ _ (Leaf c) = Leaf c
238 -- go _ ([] : _) _ = panic "normalizeEntropy': empty level"
239 go f (es : ess) (Node c i children)
240 -- | any (sim (i ^. inE)) es
241 = Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
242 -- | otherwise
243 -- = panic "NOT an elem"
244
245
246 {-
247 This is only normalizing a node with respect to its brothers (unlike all the
248 nodes of the same level).
249
250 normalizeEntropy inE modE = go $ modE identity
251 where
252 go _ (Leaf c) = Leaf c
253 go f (Node c i children)
254 | Map.null children =
255 panic "normalizeEntropy: impossible"
256 | otherwise =
257 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
258 where
259 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
260 -}
261 ------------------------------------------------------------------------
262
263 levels :: Trie k e -> [[Trie k e]]
264 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
265 where
266 subForest :: Trie k e -> [Trie k e]
267 subForest (Leaf _) = []
268 subForest (Node _ _ children) = Map.elems children
269
270 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
271 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
272
273 ------------------------------------------------------------------------
274
275 data Tries k e = Tries
276 { _fwd :: Trie k e
277 , _bwd :: Trie k e
278 }
279
280
281
282 toToken' :: Int -> [[Text]] -> [[Token]]
283 toToken' n input = L.concat $ (filter (/= [Terminal Stop]) . chunkAlongEleve (n + 2)) <$> toToken <$> input
284
285 instance IsTrie Tries where
286 buildTrie to n tts = Tries { _fwd = buildTrie to n tts
287 , _bwd = buildTrie to n (map reverse $ tts)
288 }
289
290 nodeEntropy inE (Tries fwd bwd) =
291 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
292
293 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
294 -- ^^
295 -- TODO: here this is tempting to reverse but this is not always what we
296 -- want. See also nodeAutonomy.
297
298 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
299
300 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
301
302 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
303 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
304
305 ------------------------------------------------------------------------
306 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
307 split _ _ [] = []
308 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
309 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
310 where
311 consRev [] xss = xss
312 consRev xs xss = reverse xs : xss
313
314 go _ pref [] = [reverse pref]
315 go _ pref (Terminal Stop:_) = [reverse pref]
316 go t pref (Terminal Start:xs) = go t pref xs
317 go t pref (x:xs) =
318 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
319 if acc
320 then go xt (x:pref) xs
321 else consRev pref $ go xt0 [x] xs
322 where
323 xt = nodeChild x t
324 xt0 = nodeChild x t0
325 et = ne 0 t
326 -- ^ entropy of the current prefix
327 ext0 = ne 0 xt0
328 -- ^ entropy of [x]
329 ext = ne 0 xt
330 -- ^ entropy of the current prefix plus x
331 acc = ext > et + ext0
332 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
333
334 ne d t = if P.isNaN e then d else e
335 where e = nodeEntropy inE t
336
337 {-
338 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
339 split inE t0 ts =
340 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
341 -}
342
343 ------------------------------------------------------------------------
344 ------------------------------------------------------------------------
345
346 mainEleve :: Int -> [[Text]] -> [[[Text]]]
347 mainEleve _ _ = []
348 {-
349 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
350 where
351 inp = toToken <$> input
352 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
353 -}
354
355 sim :: Entropy e => e -> e -> Bool
356 sim x y = x == y || (P.isNaN x && P.isNaN y)
357
358 chunkAlongEleve :: Int -> [a] -> [[a]]
359 chunkAlongEleve n xs = L.take n <$> L.tails xs
360
361 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
362 testEleve debug n output checks = do
363 let
364 {-
365 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
366 | ps <- L.nub $ [ c
367 | m <- [1..n]
368 , cs <- chunkAlong m 1 <$> inp
369 , c <- cs
370 ]
371 ]
372 -}
373 --res = map (map printToken) . split identity fwd <$> inp
374 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
375 res = map (map printToken) . split info_autonomy nt <$> inp
376 when debug $ do
377 P.putStrLn (show input)
378 -- forM_ pss (P.putStrLn . show)
379 P.putStrLn ""
380 P.putStrLn "Levels:"
381 forM_ (entropyLevels identity (_fwd t)) $ \level ->
382 P.putStrLn $ " " <> show level
383 P.putStrLn ""
384 P.putStrLn "Forward:"
385 printTrie (_fwd nt)
386 P.putStrLn ""
387 P.putStrLn "Backward:"
388 printTrie (_bwd nt)
389 P.putStrLn ""
390 P.putStrLn "Splitting:"
391 P.putStrLn $ show res
392 forM_ checks checker
393 pure $ expected == res
394
395 where
396 out = T.words <$> output
397 expected = fmap (T.splitOn "-") <$> out
398 input = (T.splitOn "-" =<<) <$> out
399 inp = toToken <$> input
400 t = buildTrie toToken' n input
401 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
402 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
403 nt = normalizeEntropy identity set_autonomy t
404
405 check f msg ref my =
406 if f ref my
407 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
408 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
409
410 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
411 let ns = parseToken <$> T.words ngram
412 t' = findTrie ns nt
413 P.putStrLn $ " " <> T.unpack ngram <> ":"
414 check (==) "count" count (_node_count (_fwd t'))
415 check sim "entropy" entropy (nodeEntropy info_entropy t')
416 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
417 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
418 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
419
420 printTrie =
421 P.putStrLn . Tree.drawTree
422 . fmap show
423 . toTree (NonTerminal "")
424
425 -- | TODO real data is a list of tokenized sentences
426 example0, example1, example2, example3, example4, example5, example6 :: [Text]
427 example0 = ["New-York is New-York and New-York"]
428 example1 = ["to-be or not to-be"]
429 example2 = ["to-be-or not to-be-or NOT to-be and"]
430 example3 = example0 <> example0
431 -- > TEST: Should not have York New in the trie
432 example4 = ["a-b-c-d e a-b-c-d f"]
433 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
434 example6 = ["le-petit chat"
435 ,"le-petit chien"
436 ,"le-petit rat"
437 ,"le gros rat"
438 ]
439
440 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
441
442 checks0 =
443 [("<start>", 1, nan, nan, nan, nan, 0.0)
444 ,("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
445 ,("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
446 ,("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
447 ,("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
448 ,("<stop>", 0, nan, nan, nan, 0.0, nan)
449
450 {-
451 ,("<start> New", 1, nan, nan, nan, nan, 0.0)
452 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
453 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
454 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
455 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
456 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
457 ,("York <stop>", 1, nan, nan, nan, nan, nan)
458
459 ,("<start> New York", 1, nan, nan, nan, nan, 0.0)
460 ,("New York is", 1, 0.0, nan, nan, nan, 0.0)
461 ,("York is New", 1, 0.0, nan, nan, nan, 0.0)
462 ,("is New York", 1, 0.0, nan, nan, nan, 0.0)
463 ,("New York and", 1, 0.0, nan, nan, nan, 0.0)
464 ,("York and New", 1, 0.0, nan, nan, nan, 0.0)
465 ,("and New York", 1, 0.0, nan, nan, nan, 0.0)
466 ,("New York <stop>", 1, nan, nan, nan, nan, nan)
467 -}
468 ]
469
470
471
472 checks2 =
473 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
474 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
475 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
476 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
477 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
478 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
479 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
480 ]
481
482
483 runTests :: IO ()
484 runTests =
485 forM_
486 [("example0", 2, example0, checks0)
487 ,("example1", 2, example1, [])
488 ,("example2", 3, example2, checks2)
489 ,("example3", 2, example3, [])
490 ,("example4", 4, example4, [])
491 ,("example5", 5, example5, [])
492 ]
493 (\(name, n, ex, checks) -> do
494 P.putStrLn $ name <> " " <> show n
495 b <- testEleve False n ex checks
496 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
497 )