]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
[ELEVE] Temoin data.
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ catMaybes
32 $ Gargantext.map _hyperdataDocument_abstract docs
33
34 -}
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
41
42 module Gargantext.Text.Eleve where
43
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
46
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
48 import Control.Monad (foldM, mapM_, forM_)
49 import Data.Ord (Ord)
50 import qualified Data.List as L
51 import Data.Monoid
52 import Data.Text (Text)
53 import qualified Data.Text as T
54 import Data.Map (Map)
55 import Data.Maybe (fromMaybe, catMaybes)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
61
62 type Entropy e =
63 ( Fractional e
64 , Floating e
65 , P.RealFloat e
66 , Show e
67 -- ^ TODO: only used for debugging
68 )
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
71 data I e = I
72 { _info_entropy :: e
73 , _info_autonomy :: e
74 }
75
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
78
79 makeLenses ''I
80
81 type ModEntropy i o e = (e -> e) -> i -> o
82
83 set_autonomy :: ModEntropy e (I e) e
84 set_autonomy f e = I e (f e)
85
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
88
89 data Token = NonTerminal Text
90 | Terminal StartStop
91 deriving (Ord, Eq, Show)
92
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
96
97 parseToken :: Text -> Token
98 parseToken "<start>" = Terminal Start
99 parseToken "<stop>" = Terminal Stop
100 parseToken t = NonTerminal t
101
102 toToken :: [Text] -> [Token]
103 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
104
105 printToken :: Token -> Text
106 printToken = f
107 where
108 f (NonTerminal x) = x
109 f (Terminal Start) = "<start>"
110 f (Terminal Stop) = "<stop>"
111
112 ------------------------------------------------------------------------
113
114 data Trie k e
115 = Node { _node_count :: Int
116 , _node_entropy :: e
117 , _node_children :: Map k (Trie k e)
118 }
119 | Leaf { _node_count :: Int }
120 deriving (Show)
121
122 makeLenses ''Trie
123
124 insertTries :: Ord k => [[k]] -> Trie k ()
125 insertTries = L.foldr insertTrie emptyTrie
126
127 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
128 insertTrie [] n = n { _node_count = _node_count n +1}
129 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
130 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
131 where
132 f = Just . insertTrie xs . fromMaybe emptyTrie
133
134 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
135 -- emptyTrie = Node 0 mempty mempty
136 emptyTrie :: Trie k e
137 emptyTrie = Leaf 0
138
139 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
140 mkTrie c children
141 | Map.null children = Leaf c
142 | otherwise = Node c mempty children
143
144 -----------------------------
145
146 -- | Trie to Tree since Tree as nice print function
147 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
148 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
149 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
150
151 ------------------------------------------------------------------------
152 ------------------------------------------------------------------------
153
154 nan :: Floating e => e
155 nan = 0 / 0
156
157 noNaNs :: P.RealFloat e => [e] -> [e]
158 noNaNs = filter (not . P.isNaN)
159
160 updateIfDefined :: P.RealFloat e => e -> e -> e
161 updateIfDefined e0 e | P.isNaN e = e0
162 | otherwise = e
163
164 entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
165 entropyTrie _ (Leaf c) = Leaf c
166 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
167 where
168 e = sum $ map f $ Map.toList children
169 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
170 else - chc * P.logBase 2 chc
171 where
172 chc = fromIntegral (_node_count child) / fromIntegral c
173 ------------------------------------------------------------------------
174
175 normalizeLevel :: Entropy e => [e] -> e -> e
176 normalizeLevel = checkDiff (go . noNaNs)
177
178 where
179 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
180 checkDiff = identity
181 go [] = panic "normalizeLevel: impossible"
182 -- trace "normalizeLevel"
183 -- go [_] = identity
184 go es = \e -> (e - m) / v
185 {-
186 in if P.isNaN e'
187 then trace ("normalizeLevel " <> show (e,m,v,es))
188 e
189 else e'
190 -}
191 where
192 m = mean es
193 v = deviation es
194
195 {- Unused
196
197 nodeChildren :: Trie k e -> Map k (Trie k e)
198 nodeChildren (Node _ _ cs) = cs
199 nodeChildren (Leaf _) = Map.empty
200
201 -}
202
203 class IsTrie trie where
204 buildTrie :: Floating e => [[Token]] -> trie Token e
205 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
206 nodeChild :: Ord k => k -> trie k e -> trie k e
207 findTrie :: Ord k => [k] -> trie k e -> trie k e
208 normalizeEntropy :: Entropy e
209 => Getting e i e -> ModEntropy i o e
210 -> trie k i -> trie k o
211
212 -- UNUSED
213 --nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
214 --nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
215
216 instance IsTrie Trie where
217 buildTrie = entropyTrie isTerminal . insertTries
218
219 nodeEntropy inE (Node _ e _) = e ^. inE
220 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
221 nan
222
223 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
224 nodeChild _ (Leaf _) = emptyTrie
225
226 findTrie ks t = L.foldl (flip nodeChild) t ks
227
228 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
229 where
230 go _ [] _ = panic "normalizeEntropy' empty levels"
231 go _ _ (Leaf c) = Leaf c
232 go _ ([] : _) _ = panic "normalizeEntropy': empty level"
233 go f (es : ess) (Node c i children) =
234 Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
235
236
237 {-
238 This is only normalizing a node with respect to its brothers (unlike all the
239 nodes of the same level).
240
241 normalizeEntropy inE modE = go $ modE identity
242 where
243 go _ (Leaf c) = Leaf c
244 go f (Node c i children)
245 | Map.null children =
246 panic "normalizeEntropy: impossible"
247 | otherwise =
248 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
249 where
250 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
251 -}
252 ------------------------------------------------------------------------
253
254 levels :: Trie k e -> [[Trie k e]]
255 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
256 where
257 subForest :: Trie k e -> [Trie k e]
258 subForest (Leaf _) = []
259 subForest (Node _ _ children) = Map.elems children
260
261 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
262 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels
263
264 ------------------------------------------------------------------------
265
266 data Tries k e = Tries
267 { _fwd :: Trie k e
268 , _bwd :: Trie k e
269 }
270
271 instance IsTrie Tries where
272 buildTrie tts = Tries { _fwd = buildTrie tts
273 , _bwd = buildTrie (reverse <$> tts)
274 }
275
276 nodeEntropy inE (Tries fwd bwd) =
277 mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]
278
279 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie ks bwd)
280 -- ^^
281 -- TODO: here this is tempting to reverse but this is not always what we
282 -- want. See also nodeAutonomy.
283
284 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
285
286 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
287
288 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
289 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
290
291 ------------------------------------------------------------------------
292 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
293 split _ _ [] = []
294 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
295 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
296 where
297 consRev [] xss = xss
298 consRev xs xss = reverse xs : xss
299
300 go _ pref [] = [reverse pref]
301 go _ pref (Terminal Stop:_) = [reverse pref]
302 go t pref (Terminal Start:xs) = go t pref xs
303 go t pref (x:xs) =
304 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
305 if acc
306 then go xt (x:pref) xs
307 else consRev pref $ go xt0 [x] xs
308 where
309 xt = nodeChild x t
310 xt0 = nodeChild x t0
311 et = ne 0 t
312 -- ^ entropy of the current prefix
313 ext0 = ne 0 xt0
314 -- ^ entropy of [x]
315 ext = ne 0 xt
316 -- ^ entropy of the current prefix plus x
317 acc = ext > et + ext0
318 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
319
320 ne d t = if P.isNaN e then d else e
321 where e = nodeEntropy inE t
322
323 {-
324 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
325 split inE t0 ts =
326 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
327 -}
328
329 ------------------------------------------------------------------------
330 ------------------------------------------------------------------------
331
332 mainEleve :: Int -> [[Text]] -> [[[Text]]]
333 mainEleve _ _ = []
334 {-
335 mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
336 where
337 inp = toToken <$> input
338 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
339 -}
340
341 sim :: Entropy e => e -> e -> Bool
342 sim x y = x == y || (P.isNaN x && P.isNaN y)
343
344 chunkAlongEleve :: Int -> [a] -> [[a]]
345 chunkAlongEleve n xs = L.take n <$> L.tails xs
346
347 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
348 testEleve debug n output checks = do
349 let
350 {-
351 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
352 | ps <- L.nub $ [ c
353 | m <- [1..n]
354 , cs <- chunkAlong m 1 <$> inp
355 , c <- cs
356 ]
357 ]
358 -}
359 --res = map (map printToken) . split identity fwd <$> inp
360 --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
361 res = map (map printToken) . split info_autonomy nt <$> inp
362 when debug $ do
363 P.putStrLn (show input)
364 -- mapM_ (P.putStrLn . show) pss
365 P.putStrLn ""
366 -- printTrie nt
367 printTrie (_fwd nt)
368 printTrie (_bwd nt)
369 P.putStrLn $ show res
370 forM_ checks checker
371 pure $ expected == res
372
373 where
374 out = T.words <$> output
375 expected = fmap (T.splitOn "-") <$> out
376 input = (T.splitOn "-" =<<) <$> out
377 inp = toToken <$> input
378 t = buildTrie $ L.concat $ chunkAlongEleve (n + 2) <$> inp
379 -- nt = normalizeEntropy identity set_autonomy (fwd :: Trie Token Double)
380 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
381 nt = normalizeEntropy identity set_autonomy t
382
383 check f msg ref my =
384 if f ref my
385 then P.putStrLn $ " PASS " <> msg <> " " <> show ref
386 else P.putStrLn $ " FAIL " <> msg <> " ref=" <> show ref <> " my=" <> show my
387
388 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
389 let ns = parseToken <$> T.words ngram
390 t' = findTrie ns nt
391 P.putStrLn $ " " <> T.unpack ngram <> ":"
392 check (==) "count" count (_node_count (_fwd t'))
393 check sim "entropy" entropy (nodeEntropy info_entropy t')
394 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
395 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
396 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd t'))
397
398 printTrie =
399 P.putStrLn . Tree.drawTree
400 . fmap show
401 . toTree (NonTerminal "")
402
403 -- | TODO real data is a list of tokenized sentences
404 example0, example1, example2, example3, example4, example5, example6 :: [Text]
405 example0 = ["New-York is New-York and New-York"]
406 example1 = ["to-be or not to-be"]
407 example2 = ["to-be-or not to-be-or NOT to-be and"]
408 example3 = example0 <> example0
409 -- > TEST: Should not have York New in the trie
410 example4 = ["a-b-c-d e a-b-c-d f"]
411 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
412 example6 = ["le-petit chat"
413 ,"le-petit chien"
414 ,"le-petit rat"
415 ,"le gros rat"
416 ]
417
418 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
419
420 checks0 = {-
421 [("<start> New", 1, nan, nan, nan, nan, 0.0)
422 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
423 ,("York is", 1, 0.0, nan, nan, nan, 0.0)
424 ,("is New", 1, 0.0, nan, nan, nan, 0.0)
425 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
426 ,("York and", 1, 0.0, nan, nan, nan, 0.0)
427 ,("and New", 1, 0.0, nan, nan, nan, 0.0)
428 ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
429 ,("York <stop>", 1, nan, nan, nan, nan, nan)
430 ]-}
431
432 [("<start>", 1, nan, nan, nan, nan, 0.0)
433 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
434 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
435 , ("is", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
436 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
437 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
438 , ("and", 1, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, 0.0)
439 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, 0.0)
440 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, 1.584962500721156)
441 , ("<stop>", 0, nan, nan, nan, 0.0, nan)]
442 <>
443
444 [("<start> New", 1, nan, nan, nan, nan, 0.0)
445 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
446 , ("York is", 1, 0.0, nan, nan, nan, 0.0)
447 , ("is New", 1, 0.0, nan, nan, nan, 0.0)
448 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
449 , ("York and", 1, 0.0, nan, nan, nan, 0.0)
450 , ("and New", 1, 0.0, nan, nan, nan, 0.0)
451 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.4142135623730951, nan, 1.584962500721156)
452 , ("York <stop>", 1, nan, nan, nan, nan, nan)]
453 <>
454 [("<start> New York", 1, nan, nan, nan, nan, 0.0)
455 , ("New York is", 1, 0.0, nan, nan, nan, 0.0)
456 , ("York is New", 1, 0.0, nan, nan, nan, 0.0)
457 , ("is New York", 1, 0.0, nan, nan, nan, 0.0)
458 , ("New York and", 1, 0.0, nan, nan, nan, 0.0)
459 , ("York and New", 1, 0.0, nan, nan, nan, 0.0)
460 , ("and New York", 1, 0.0, nan, nan, nan, 0.0)
461 , ("New York <stop>", 1, nan, nan, nan, nan, nan)
462 , ("York <stop>", 1, nan, nan, nan, nan, nan)
463 , ("<stop>", 0, nan, nan, nan, 0.0, nan)
464 , ("", 9, 2.113283334294875, nan, nan, 2.113283334294875, 2.113283334294875)]
465
466
467
468 checks2 =
469 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
470 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
471 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
472 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
473 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
474 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
475 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
476 ]
477
478
479 runTests :: IO ()
480 runTests =
481 forM_
482 [("example0", 2, example0, checks0)
483 ,("example1", 2, example1, [])
484 ,("example2", 3, example2, checks2)
485 ,("example3", 2, example3, [])
486 ,("example4", 4, example4, [])
487 ,("example5", 5, example5, [])
488 ]
489 (\(name, n, ex, checks) -> do
490 P.putStrLn $ name <> " " <> show n
491 b <- testEleve False n ex checks
492 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
493 )