{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervised word segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

# Implementation of Unsupervised Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Pierre Magistry, Benoît Sagot (Alpage, INRIA & Univ. Paris 7),
  "Unsupervized Word Segmentation: the case for Mandarin Chinese",
  Proceedings of the 50th Annual Meeting of the Association for
  Computational Linguistics, pages 383–387.
  [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for the current implementation:
- TODO extract longer ngrams (see the paper above; the Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: real ngrams extraction test
  import Gargantext.Text.Terms (extractTermsUnsupervised)
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ catMaybes
                             $ Gargantext.map _hyperdataDocument_abstract docs
-}
{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeFamilies      #-}

module Gargantext.Text.Eleve where

-- import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens hiding (levels, children)
import Control.Monad (forM_)
import Data.Ord (Ord)
import qualified Data.List as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)

nan :: Floating e => e
nan = 0 / 0

noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs = filter (not . P.isNaN)

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e

sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)

subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x | sim src x = dst
                   | otherwise = x
------------------------------------------------------------------------

type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy     :: e
  , _info_entropy_var :: e
  , _info_autonomy    :: e
  }
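
-- The three fields are filled in successively: 'entropyTrie' computes the raw
-- entropy, 'evTrie' (with 'set_entropy_var') records it together with its
-- variation, and 'normalizeEntropy' (with 'set_autonomy') fills in the
-- autonomy; see 'mainEleve' below for the full pipeline.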

instance Show e => Show (I e) where
  show (I e ev a) = show (e, ev, a)

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)

set_entropy_var :: Entropy e => Setter e (I e) e e
set_entropy_var f e = (\ev -> I e ev nan) <$> f e

data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal _)    = True
isTerminal (NonTerminal _) = False

parseToken :: Text -> Token
parseToken "<start>" = Terminal Start
parseToken "<stop>"  = Terminal Stop
parseToken t         = NonTerminal t

toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]

printToken :: Token -> Text
printToken = f
  where
    f (NonTerminal x)  = x
    f (Terminal Start) = "<start>"
    f (Terminal Stop)  = "<stop>"
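
-- For instance, a two-word sentence gets wrapped with the boundary markers:
--   toToken ["New", "York"]
--     == [Terminal Start, NonTerminal "New", NonTerminal "York", Terminal Stop]
-- and 'printToken' maps each token back to its textual form:
--   printToken <$> toToken ["New", "York"] == ["<start>", "New", "York", "<stop>"]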
------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count :: Int }
  deriving (Show)

makeLenses ''Trie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

-----------------------------
-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------
normalizeLevel :: Entropy e => e -> e -> e -> e
normalizeLevel m v e = (e - m) / v
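
-- A small worked example: if the entropies at a given depth have mean m = 1.0
-- and deviation v = 0.5, a node with entropy e = 2.0 is normalized to
--   normalizeLevel 1.0 0.5 2.0 == 2.0
-- i.e. two deviations above its level's mean. 'normalizeEntropy' below feeds
-- m and v from 'normalizationLevels'.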

{- Unused

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

-}

chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = L.take n <$> L.tails xs
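
-- For example:
--   chunkAlongEleve 3 [1, 2, 3, 4, 5]
--     == [[1,2,3], [2,3,4], [3,4,5], [4,5], [5], []]
-- i.e. every suffix of the input, truncated to at most n elements.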

data Direction = Backward | Forward

buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
buildTrie d n sentences
  = L.foldr insertTrie emptyTrie
  . L.concat
  $ ( filter (/= [Terminal (term d)])
    . chunkAlongEleve (n + 1)
    . order d
    )
    <$> sentences
  where
    order Forward  = identity
    order Backward = reverse
    term  Forward  = Stop
    term  Backward = Start
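
-- Sketch of what 'buildTrie' inserts for one sentence, say toToken ["New", "York"]
-- with n = 2 and the Forward direction:
--
--   chunkAlongEleve 3 [<start>, New, York, <stop>]
--     == [[<start>, New, York], [New, York, <stop>], [York, <stop>], [<stop>], []]
--
-- The filter then drops the chunk consisting solely of the closing terminal
-- ([<stop>] here, [<start>] for Backward, where the sentence is reversed first),
-- and each remaining chunk is inserted into the trie.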

class IsTrie trie where
  entropyTrie      :: Entropy e => (k -> Bool) -> trie k () -> trie k e
  nodeEntropy      :: Entropy e => Getting e i e -> trie k i -> e
  nodeChild        :: Ord k => k -> trie k e -> trie k e
  findTrie         :: Ord k => [k] -> trie k e -> trie k e
  printTrie        :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
  evTrie           :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

instance IsTrie Trie where

  entropyTrie _    (Leaf c)             = Leaf c
  entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
    where
      children' = Map.toList children
      sum_count = sum $ _node_count . snd <$> children'
      e | sum_count == 0 = nan
        | otherwise      = sum $ f <$> children'
      f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                               else - chc * P.logBase 2 chc
        where
          chc = fromIntegral (_node_count child) / fromIntegral c
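
  -- In other words, a node's entropy is the Shannon entropy of its observed
  -- continuations, - sum_k p(k) * log2 p(k) with p(k) = count(child k) / count(node),
  -- except that children selected by 'pred' (the terminal tokens here) are
  -- treated as if each of their occurrences were a distinct continuation, so
  -- they contribute p(k) * log2 (count node) instead.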

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  printTrie inE t = do
    P.putStrLn . Tree.drawTree
               . fmap show
               $ toTree (NonTerminal "") t
    P.putStrLn " Levels:"
    forM_ (normalizationLevels inE t) $ \level ->
      P.putStrLn $ " " <> show level

  evTrie inE setEV = go nan
    where
      go _  (Leaf c)            = Leaf c
      go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
        where e1 = i ^. inE

      ev 0  0  = nan
      ev i0 i1 = i1 - i0

  normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
    where
      go _ _                 (Leaf c) = Leaf c
      go _ []                _        = panic "normalizeEntropy' empty levels"
      go f ((m, v, _) : ess) (Node c i children)
        = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
------------------------------------------------------------------------

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
  where
    subForest :: Trie k e -> [Trie k e]
    subForest (Leaf _)            = []
    subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels

normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
normalizationLevels inE = fmap f . entropyLevels inE
  where
    f es = (mean es, deviation es, length es)
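
-- For each depth >= 1 of the trie, 'normalizationLevels' thus yields the mean,
-- the deviation and the number of (non-NaN) entropies found at that depth;
-- 'normalizeEntropy' z-scores each node against its own level, which with
-- 'info_entropy_var' and 'set_autonomy' (as in 'mainEleve' below) produces the
-- autonomy score.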

------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

makeLenses ''Tries

buildTries :: Int -> [[Token]] -> Tries Token ()
buildTries n sentences = Tries
  { _fwd = buildTrie Forward  n sentences
  , _bwd = buildTrie Backward n sentences
  }
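
-- 'buildTries' pairs a forward trie over the sentences as written with a
-- backward trie over the reversed sentences, so that branching entropy can be
-- measured in both reading directions; the 'IsTrie' instance below combines
-- them, e.g. 'nodeEntropy' averages the forward and backward values.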

instance IsTrie Tries where

  nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]

  findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)

  nodeChild = onTries . nodeChild

  entropyTrie = onTries . entropyTrie

  evTrie inE setEV = onTries $ evTrie inE setEV

  normalizeEntropy inE = onTries . normalizeEntropy inE

  printTrie inE (Tries f b) = do
    P.putStrLn "Forward:"
    printTrie inE f
    P.putStrLn ""
    P.putStrLn "Backward:"
    printTrie inE b

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries h (Tries f b) = Tries (h f) (h b)

------------------------------------------------------------------------
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _ []                  = []
split inE t (Terminal Start:xs) = split inE t xs
split inE t (x0:xs0)            = go [x0] xs0
  where
    mayCons [] xss = xss
    mayCons xs xss = xs : xss

    go pref []                 = [pref]
    go pref (Terminal Stop:_)  = [pref]
    go _    (Terminal Start:_) = panic "split impossible"
    go pref (x:xs) =
      -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
      if acc
        then go prefx xs
        else mayCons pref $ go [x] xs
      where
        prefx = pref <> [x]
        pt    = findTrie pref  t
        pxt   = findTrie prefx t
        xt    = findTrie [x]   t
        ept   = ne pt
        -- ^ entropy of the current prefix
        ext   = ne xt
        -- ^ entropy of [x]
        epxt  = ne pxt
        -- ^ entropy of the current prefix plus x
        acc   = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > ept + ext)

        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

        ne = nodeEntropy inE

{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = map (map printToken) . split info_autonomy (t :: Tries Token (I Double)) <$> inp
  where
    inp = toToken <$> input
    t   = normalizeEntropy info_entropy_var set_autonomy
        . evTrie           identity         set_entropy_var
        . entropyTrie      isTerminal
        $ buildTries n inp
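
-- A usage sketch (the expected segmentation is the one encoded by 'example0'
-- and 'checks0' below, not re-verified here): sentences come in already
-- tokenised, and multiword units come out grouped, e.g.
--
--   mainEleve 3 [["New", "York", "is", "New", "York", "and", "New", "York"]]
--   -- expected: [[["New","York"],["is"],["New","York"],["and"],["New","York"]]]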


---------------------------------------------

type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]

testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
testEleve debug n output checks = do
  let
    res = map (map printToken) . split info_autonomy nt <$> inp
  when debug $ do
    P.putStrLn $ show input
    P.putStrLn ""
    printTrie info_entropy nt
    P.putStrLn ""
    P.putStrLn "Splitting:"
    P.putStrLn $ show res
  forM_ checks checker
  pure $ expected == res

  where
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken <$> input

    nt :: Tries Token (I Double)
    nt = normalizeEntropy info_entropy_var set_autonomy
       . evTrie           identity         set_entropy_var
       . entropyTrie      isTerminal
       $ buildTries n inp

    check f msg ref my =
      if f ref my
        then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
        else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my

    checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
      let ns  = parseToken <$> T.words ngram
          nt' = findTrie ns nt

      P.putStrLn $ " " <> T.unpack ngram <> ":"
      check (==) "count" count (_node_count (_fwd nt'))

      check sim "entropy"  entropy  (nodeEntropy info_entropy     nt')
      check sim "ev"       ev       (nodeEntropy info_entropy_var nt')
      check sim "autonomy" autonomy (nodeEntropy info_autonomy    nt')

      check sim "fwd_entropy"  fwd_entropy  (nodeEntropy info_entropy     (_fwd nt'))
      check sim "fwd_ev"       fwd_ev       (nodeEntropy info_entropy_var (_fwd nt'))
      check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy    (_fwd nt'))

      check sim "bwd_entropy"  bwd_entropy  (nodeEntropy info_entropy     (_bwd nt'))
      check sim "bwd_ev"       bwd_ev       (nodeEntropy info_entropy_var (_bwd nt'))
      check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy    (_bwd nt'))

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- > TEST: Should not have York New in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
           ,"le-petit chien"
           ,"le-petit rat"
           ,"le gros rat"
           ]

checks0, checks2 :: Checks Double

checks0 =
-- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
  [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
  , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
  , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
  , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
  , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
  , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
  , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
  , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
  , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
  , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
  , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
  , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
  , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
  , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
  , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
  , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
  , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
  ]

checks2 = []
{-
  [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
  ,("be or", 2, 0.5, nan, nan, nan, 1.0)
  ,("or not", 1, 0.0, nan, nan, nan, 0.0)
  ,("not to", 1, 0.0, nan, nan, nan, 0.0)
  ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
  ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
  ,("be and", 1, 0.0, nan, nan, nan, 0.0)
  ]
-}

runTests :: IO ()
runTests =
  forM_
    [("example0", 3, example0, checks0)
    ,("example0", 2, example0, [])
    ,("example1", 2, example1, [])
    ,("example2", 3, example2, checks2)
    ,("example3", 2, example3, [])
    ,("example4", 4, example4, [])
    ,("example5", 5, example5, [])
    ,("example6", 2, example6, [])
    ]
    (\(name, n, ex, checks) -> do
      P.putStrLn $ name <> " " <> show n
      b <- testEleve False n ex checks
      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
    )
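
-- To exercise the module from GHCi, 'runTests' runs every example above and
-- prints one PASS/FAIL line per ngram check plus one for the splitting; for a
-- verbose trace of a single example, something like
-- 'testEleve True 3 example0 checks0' also prints the tries, the normalization
-- levels and the resulting segmentation before the checks.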