]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Eleve.hs
[ELEVE] Specifications to test to fix concurrent multi-terms.
[gargantext.git] / src / Gargantext / Text / Terms / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Terms.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation: the case for Mandarin Chinese. Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics,
20 pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
25
26 - AD: Real ngrams extraction test
27 from Gargantext.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
30 $ catMaybes
31 $ Gargantext.map _hyperdataDocument_abstract docs
32
33 -}
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE NoImplicitPrelude #-}
36 {-# LANGUAGE OverloadedStrings #-}
37 {-# LANGUAGE RankNTypes #-}
38 {-# LANGUAGE TemplateHaskell #-}
39 {-# LANGUAGE TypeFamilies #-}
40
41 module Gargantext.Text.Terms.Eleve where
42
43 -- import Debug.Trace (trace)
44 -- import Debug.SimpleReflect
45
46 import Control.Lens hiding (levels, children)
47 import Control.Monad (forM_)
48 import Data.Ord (Ord)
49 import qualified Data.List as L
50 import Data.Monoid
51 import Data.Text (Text)
52 import qualified Data.Text as T
53 import Data.Map (Map)
54 import Data.Maybe (fromMaybe)
55 import qualified Data.Map as Map
56 import Gargantext.Prelude hiding (cs)
57 import qualified Data.Tree as Tree
58 import Data.Tree (Tree)
59 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
60
-- | A not-a-number value, obtained by dividing zero by zero.
-- This module uses it as the marker for an undefined entropy.
nan :: Floating e => e
nan = 0 / 0
63
-- | Keep only the elements that are not NaN, preserving their order.
noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs values = [v | v <- values, not (P.isNaN v)]
66
-- | Return the second argument unless it is NaN, in which case the first
-- argument is returned instead.
updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined fallback candidate =
  if P.isNaN candidate then fallback else candidate
70
-- | Equality that additionally treats two NaN values as equal.
sim :: Entropy e => e -> e -> Bool
sim x y
  | P.isNaN x = P.isNaN y -- NaN is only similar to another NaN
  | otherwise = x == y
73
-- | @subst (src, dst) x@ yields @dst@ when @x@ is similar to @src@
-- (in the sense of 'sim', so NaN matches NaN), and @x@ otherwise.
subst :: Entropy e => (e, e) -> e -> e
subst (src, dst) x = if sim src x then dst else x
77 ------------------------------------------------------------------------
78
-- | Constraints required from any numeric type used as an entropy value.
type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  -- TODO: 'Show' is only used for debugging
  , Show e
  )
86 ------------------------------------------------------------------------
-- | Information attached to a trie node: an entropy, an entropy variation
-- and an autonomy, all of the same numeric type.
data I e = I
  { _info_entropy :: e
    -- ^ entropy of the node
  , _info_entropy_var :: e
    -- ^ entropy variation of the node
  , _info_autonomy :: e
    -- ^ autonomy of the node
  }

-- | Shown as the triple @(entropy, entropy variation, autonomy)@.
instance Show e => Show (I e) where
  show (I entropy variation autonomy) = show (entropy, variation, autonomy)

-- Generates the lenses 'info_entropy', 'info_entropy_var' and 'info_autonomy'.
makeLenses ''I
98
-- | A node-information transformer parameterised by a function on entropy
-- values: given @e -> e@, it turns an @i@ into an @o@.
type ModEntropy i o e = (e -> e) -> i -> o
100
-- | Overwrite the autonomy of an 'I' with the given function applied to its
-- entropy variation; the other fields are left untouched.
set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
set_autonomy fe i = set info_autonomy (fe (view info_entropy_var i)) i
103
-- | Setter turning a bare entropy into an 'I': the source entropy is kept as
-- the entropy field, the value produced by the update becomes the entropy
-- variation, and the autonomy starts out as 'nan'.
set_entropy_var :: Entropy e => Setter e (I e) e e
set_entropy_var f e = fmap withVariation (f e)
  where
    withVariation ev = I e ev nan
106
-- | Kind of sentence boundary marker.
data StartStop
  = Start
  | Stop
  deriving (Ord, Eq, Show)
109
-- | A token of a sentence: either a textual token or a boundary marker.
data Token
  = NonTerminal Text
  | Terminal StartStop
  deriving (Ord, Eq, Show)
113
-- | 'True' exactly for boundary markers ('Terminal').
isTerminal :: Token -> Bool
isTerminal token = case token of
  Terminal _ -> True
  NonTerminal _ -> False
117
-- | Extract the texts of the 'NonTerminal' tokens, in order, dropping every
-- 'Terminal'.
nonTerminals :: [Token] -> [Text]
nonTerminals = L.concatMap textOf
  where
    textOf (NonTerminal nt) = [nt]
    textOf (Terminal _) = []
120
-- | Read a token from its textual form: @"\<start\>"@ and @"\<stop\>"@ are
-- the boundary markers, anything else is a 'NonTerminal'.
parseToken :: Text -> Token
parseToken txt = case txt of
  "<start>" -> Terminal Start
  "<stop>" -> Terminal Stop
  _ -> NonTerminal txt
125
-- | Wrap a sentence into tokens: a 'Start' marker, every word as a
-- 'NonTerminal', then a 'Stop' marker.
toToken :: [Text] -> [Token]
toToken xs = [Terminal Start] <> fmap NonTerminal xs <> [Terminal Stop]
128
-- | Textual form of a token; boundary markers are rendered as
-- @"\<start\>"@ and @"\<stop\>"@.
printToken :: Token -> Text
printToken (NonTerminal x) = x
printToken (Terminal Start) = "<start>"
printToken (Terminal Stop) = "<stop>"
135 ------------------------------------------------------------------------
136
-- | A counting trie keyed by @k@ whose inner nodes carry an annotation @e@.
data Trie k e
  = -- | Inner node: a count, an annotation and the children indexed by key.
    Node
      { _node_count :: Int
      , _node_entropy :: e
      , _node_children :: Map k (Trie k e)
      }
  | -- | Node without children: only a count.
    Leaf
      { _node_count :: Int
      }
  deriving (Show)

-- Generates the optics 'node_count', 'node_entropy' and 'node_children'.
makeLenses ''Trie
146
-- | Insert a key path into the trie, incrementing the count of every node
-- on the path (the node where the path ends included).
insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie path trie = case (path, trie) of
  -- End of the path: only bump the count of the current node.
  ([], _) -> trie { _node_count = _node_count trie + 1 }
  -- A leaf gets its first child.
  (k : ks, Leaf c) ->
    mkTrie (c + 1) (Map.singleton k (insertTrie ks emptyTrie))
  -- An inner node: descend into the matching child, creating it if absent.
  (k : ks, Node c _ kids) ->
    mkTrie (c + 1) (Map.alter (Just . insertTrie ks . fromMaybe emptyTrie) k kids)
153
154 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
155 -- emptyTrie = Node 0 mempty mempty
-- | The empty trie: a leaf with a count of zero.
emptyTrie :: Trie k e
emptyTrie = Leaf 0
158
-- | Build a trie node from a count and its children: a 'Leaf' when there are
-- no children, otherwise a 'Node' annotated with 'mempty'.
mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children =
  if Map.null children
    then Leaf c
    else Node c mempty children
163
164 -----------------------------
-- | Convert a 'Trie' into a "Data.Tree" rose tree, which comes with a
-- drawing function. Each node is labelled with its key, its count and its
-- annotation ('Nothing' for leaves); the first argument labels the root.
toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree k trie = case trie of
  Leaf c -> Tree.Node (k, c, Nothing) []
  Node c e cs -> Tree.Node (k, c, Just e) [toTree k' sub | (k', sub) <- Map.toList cs]
169
170 ------------------------------------------------------------------------
171 ------------------------------------------------------------------------
-- | @normalizeLevel m v e@ centres @e@ on @m@ and scales it by @v@, i.e.
-- @(e - m) / v@. Callers in this module pass a level's mean and deviation.
-- No guard is applied against @v@ being zero.
normalizeLevel :: Entropy e => e -> e -> e -> e
normalizeLevel center scale x = (x - center) / scale
174
175 {- Unused
176
177 nodeChildren :: Trie k e -> Map k (Trie k e)
178 nodeChildren (Node _ _ cs) = cs
179 nodeChildren (Leaf _) = Map.empty
180
181 -}
182
-- | All windows of at most @n@ elements starting at each position of the
-- list; the trailing (shorter, and finally empty) windows are kept.
chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n = fmap (L.take n) . L.tails
185
-- | Reading direction used when building a trie (see 'buildTrie').
data Direction
  = Backward
  | Forward
187
-- | Build the counting trie of all token windows of length at most @n + 1@
-- found in the sentences, read in the given direction ('Backward' reverses
-- each sentence first). The window made of the sole closing marker of the
-- direction ('Stop' when reading forward, 'Start' when reading backward)
-- is not inserted.
buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
buildTrie d n sentences =
  L.foldr insertTrie emptyTrie (L.concatMap windows sentences)
  where
    -- Windows of one sentence, minus the lone closing marker.
    windows = filter (/= [Terminal closing]) . chunkAlongEleve (n + 1) . oriented

    oriented = case d of
      Forward -> identity
      Backward -> reverse

    closing = case d of
      Forward -> Stop
      Backward -> Start
202
-- | Operations shared by the trie-like structures of this module.
class IsTrie trie where
  -- | Annotate every node with an entropy; the predicate selects the keys
  -- that receive the special (terminal) treatment.
  entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e

  -- | Read an entropy-like value from the annotation of the root node.
  nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e

  -- | Sub-trie reached by following one key.
  nodeChild :: Ord k => k -> trie k e -> trie k e

  -- | Sub-trie reached by following a path of keys.
  findTrie :: Ord k => [k] -> trie k e -> trie k e

  -- | Print the structure on standard output.
  printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()

  -- | Rewrite every annotation through the setter, using a value computed
  -- from the entropies read with the getter.
  evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o

  -- | Rewrite every annotation with the given 'ModEntropy', fed with a
  -- normalization function.
  normalizeEntropy
    :: Entropy e
    => Getting e i e
    -> ModEntropy i o e
    -> trie k i
    -> trie k o
213
-- | 'IsTrie' operations on a single 'Trie'.
instance IsTrie Trie where

  -- Annotate each inner node with the entropy of its children distribution.
  -- With chc = child count / node count, a child whose key satisfies the
  -- predicate contributes chc * log2 (node count); any other child
  -- contributes -chc * log2 chc. The entropy is NaN when the children
  -- counts sum to zero. Leaves are left as they are.
  entropyTrie _ (Leaf c) = Leaf c
  entropyTrie isTerm (Node c () children) = Node c e (map (entropyTrie isTerm) children)
    where
      children' = Map.toList children
      sum_count = sum $ _node_count . snd <$> children'
      e | sum_count == 0 = nan
        | otherwise = sum $ f <$> children'
      f (k, child) = if isTerm k then chc * P.logBase 2 (fromIntegral c)
                                 else - chc * P.logBase 2 chc
        where
          chc = fromIntegral (_node_count child) / fromIntegral c

  -- Entropy read from the annotation of an inner node; NaN for a leaf.
  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _ (Leaf _) = nan

  -- Child stored under the key; the empty trie when the key is absent or
  -- when the node is a leaf.
  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _) = emptyTrie

  -- Follow the keys one after the other, starting from the given trie.
  -- A strict left fold is used so that no chain of thunks is built.
  findTrie ks t = L.foldl' (flip nodeChild) t ks

  -- Draw the trie as a tree (the root is labelled with an empty
  -- non-terminal), then print the normalization levels, one per line.
  printTrie inE t = do
    P.putStrLn . Tree.drawTree
               . fmap show
               $ toTree (NonTerminal "") t
    P.putStrLn " Levels:"
    forM_ (normalizationLevels inE t) $ \level ->
      P.putStrLn $ " " <> show level

  -- Store in each inner node, through the setter, the difference between
  -- its own entropy and the entropy of its parent. The root is compared
  -- against NaN, and the difference of two zero entropies is NaN as well.
  evTrie inE setEV = go nan
    where
      go _ (Leaf c) = Leaf c
      go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
        where
          e1 = i ^. inE

      ev 0 0 = nan
      ev i0 i1 = i1 - i0

  -- Apply the 'ModEntropy' to every inner node: the root is given the
  -- identity function, and the nodes found one level deeper are given
  -- 'normalizeLevel' instantiated with the mean and deviation of that level
  -- (as computed by 'normalizationLevels'), and so on downwards.
  normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
    where
      go _ _ (Leaf c) = Leaf c
      -- An inner node always has a level below it, so this is unexpected.
      go _ [] _ = panic "normalizeEntropy' empty levels"
      go f ((m, v, _) : ess) (Node c i children)
        = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
259 ------------------------------------------------------------------------
260
-- | The nodes of a trie grouped by depth: first the root alone, then its
-- children, then their children, and so on until a depth has no node.
levels :: Trie k e -> [[Trie k e]]
levels root = L.takeWhile (not . L.null) (L.iterate (L.concatMap kids) [root])
  where
    kids :: Trie k e -> [Trie k e]
    kids trie = case trie of
      Leaf _ -> []
      Node _ _ cs -> Map.elems cs
267
-- | For every depth below the root, the entropies of the nodes found at
-- that depth, with the NaN values removed.
entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE t =
  -- 'levels' always starts with the root level, which is skipped here.
  [ noNaNs (nodeEntropy inE <$> lvl) | lvl <- L.drop 1 (levels t) ]
270
-- | For every depth below the root: the mean, the deviation and the number
-- of the (non-NaN) entropies found at that depth.
normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
normalizationLevels inE t =
  [ (mean es, deviation es, length es) | es <- entropyLevels inE t ]
275
276 ------------------------------------------------------------------------
277
-- | A pair of tries over the same keys and annotations: one for the forward
-- reading and one for the backward reading.
data Tries k e = Tries
  { _fwd :: Trie k e
    -- ^ trie of the forward reading
  , _bwd :: Trie k e
    -- ^ trie of the backward reading
  }

-- Generates the lenses 'fwd' and 'bwd'.
makeLenses ''Tries
284
-- | Build the forward and the backward trie of the sentences with the same
-- window parameter @n@ (see 'buildTrie').
buildTries :: Int -> [[Token]] -> Tries Token ()
buildTries n sentences = Tries (build Forward) (build Backward)
  where
    build d = buildTrie d n sentences
290
-- | 'IsTrie' operations on a forward/backward pair of tries.
instance IsTrie Tries where

  -- Mean of the forward and backward entropies.
  nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]

  -- The backward trie is searched with the reversed path.
  findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)

  -- The remaining operations act on both tries independently.
  nodeChild k = onTries (nodeChild k)

  entropyTrie isTerm = onTries (entropyTrie isTerm)

  evTrie inE setEV = onTries (evTrie inE setEV)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

  -- Print the forward trie, an empty line, then the backward trie, each
  -- one under its own title.
  printTrie inE (Tries f b) = do
    titled "Forward:" f
    P.putStrLn ""
    titled "Backward:" b
    where
      titled title trie = do
        P.putStrLn title
        printTrie inE trie
311
-- | Apply the same trie transformation to both the forward and the backward
-- trie.
onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries h tries = case tries of
  Tries forward backward -> Tries (h forward) (h backward)
314
315 ------------------------------------------------------------------------
-- | Prepend a list to a list of lists, unless the list to prepend is empty.
mayCons :: [a] -> [[a]] -> [[a]]
mayCons xs xss = case xs of
  [] -> xss
  _ -> xs : xss
319
320 {-
321 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
322 split _ _ [] = []
323 split inE t (Terminal Start:xs) = split inE t xs
324 split inE t (x0:xs0) = go [x0] xs0
325 where
326 go pref [] = [pref]
327 go pref (Terminal Stop:_) = [pref]
328 go _ (Terminal Start:_) = panic "split impossible"
329 go pref (x:xs) =
330 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
331 if acc
332 then go prefx xs
333 else mayCons pref $ go [x] xs
334 where
335 prefx = pref <> [x]
336 pt = findTrie pref t
337 pxt = findTrie prefx t
338 xt = findTrie [x] t
339 ept = ne pt
340 -- ^ entropy of the current prefix
341 ext = ne xt
342 -- ^ entropy of [x]
343 epxt = ne pxt
344 -- ^ entropy of the current prefix plus x
345 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
346
347 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
348
349 ne = nodeEntropy inE
350 -}
351
-- | Greedily segment a token sequence into groups of words.
--
-- At each step the candidate prefixes are the non-empty prefixes of the
-- next @n@ tokens; the one selected by 'maximumWith' on the value read
-- through the lens from the node found in the tries becomes a segment
-- (boundary markers are dropped from it, and a segment left empty is
-- skipped), and the segmentation resumes right after it.
--
-- * @n@: maximal number of tokens in a segment. Values below 1 are treated
--   as 1: with no candidate prefix there would be nothing to select and no
--   progress could be made.
-- * the lens: which value of the node annotation drives the choice.
-- * the tries: where the candidate prefixes are looked up.
-- * the tokens: the sequence to segment.
split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
split _ _ _ [] = []
split _ _ _ [t] = pure <$> nonTerminals [t]
split n inE t ts = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
  where
    -- Window size clamped to at least one token (spelled with a
    -- "Data.List" function, which this module imports qualified).
    width = L.maximum [1, n]

    -- 'L.inits' starts with the empty prefix, which is dropped; the
    -- remaining candidates are never empty here since @ts@ has at least
    -- two tokens and @width >= 1@.
    pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
                       (L.drop 1 . L.inits . take width $ ts)
359
360
361 {-
362 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
363 split inE t0 ts =
364 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
365 -}
366
367 ------------------------------------------------------------------------
368
-- | Segment a corpus using the tries built from that same corpus
-- (see @mainEleve'@).
mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n corpus = mainEleve' n corpus corpus
371
-- | @mainEleve' n x y@ builds the tries from the sentences of @x@ and uses
-- them to segment the sentences of @y@ (see 'mainEleveWith').
mainEleve' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
mainEleve' n x y = mainEleveWith (buildTries n (toToken <$> x)) n y
376
-- | Not implemented yet. This function should take the longest possible
-- chain of:
--
-- > mainEleve'' n x y = maxChainWith [mainEleve' n x y, mainEleve' n x x]
--
-- Until then, evaluating it fails with a message naming the function.
mainEleve'' :: Int -> [[Text]] -> [[Text]] -> [[[Text]]]
mainEleve'' = panic "Gargantext.Text.Terms.Eleve.mainEleve'': not implemented"
381
-- | Segment sentences with already built tries.
--
-- The tries are first annotated: entropies ('entropyTrie', boundary markers
-- being the terminal keys), then entropy variations ('evTrie'), then
-- autonomies ('normalizeEntropy'). Each sentence is then wrapped with
-- 'toToken' and cut by 'split' on the autonomy, with @n@ as window size.
mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
mainEleveWith m n i = split n info_autonomy t . toToken <$> i
  where
    t :: Tries Token (I Double)
    t = normalizeEntropy info_entropy_var set_autonomy
      . evTrie identity set_entropy_var
      . entropyTrie isTerminal
      $ m
389
390 ------------------------------------------------------------------------
391
-- | Reference values for 'testEleve', one row per n-gram:
-- @(ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy,
-- bwd_entropy, bwd_ev, bwd_autonomy)@.
type Checks e =
  [(Text, Int, e, e, e, e, e, e, e, e, e)]
393
-- | Run one segmentation test.
--
-- Each text of @output@ is an expected segmentation: segments are separated
-- by whitespace and the words of a segment are joined by @-@. The input
-- sentences are obtained by forgetting the grouping. The tries are built
-- from these sentences with window parameter @n@, every reference row of
-- @checks@ is compared with the tries (printing a PASS or FAIL line per
-- value), and the result tells whether the computed segmentation equals the
-- expected one. With @debug@ set, the input, the tries and the computed
-- segmentation are printed first.
testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
testEleve debug n output checks = do
  let segmented = split n info_autonomy nt <$> inp
  when debug $ do
    P.putStrLn $ show input
    P.putStrLn ""
    printTrie info_entropy nt
    P.putStrLn ""
    P.putStrLn "Splitting:"
    P.putStrLn $ show segmented
  forM_ checks checker
  pure $ expected == segmented
  where
    -- Expected segments, still in their dash-joined form.
    grouped = T.words <$> output
    expected = fmap (T.splitOn "-") <$> grouped
    -- Plain sentences: all the words, grouping forgotten.
    input = L.concatMap (T.splitOn "-") <$> grouped
    inp = toToken <$> input

    -- Tries annotated with entropy, entropy variation and autonomy.
    nt :: Tries Token (I Double)
    nt = normalizeEntropy info_entropy_var set_autonomy
       . evTrie identity set_entropy_var
       . entropyTrie isTerminal
       $ buildTries n inp

    -- Print a coloured PASS/FAIL line for one reference value.
    check agree msg ref my
      | agree ref my = P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
      | otherwise    = P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my

    -- Compare one reference row with the values found in the tries.
    checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
      let found    = findTrie (parseToken <$> T.words ngram) nt
          forward  = _fwd found
          backward = _bwd found

      P.putStrLn $ " " <> T.unpack ngram <> ":"
      check (==) "count" count (_node_count forward)

      forM_
        [ ("entropy"     , entropy     , nodeEntropy info_entropy     found)
        , ("ev"          , ev          , nodeEntropy info_entropy_var found)
        , ("autonomy"    , autonomy    , nodeEntropy info_autonomy    found)
        , ("fwd_entropy" , fwd_entropy , nodeEntropy info_entropy     forward)
        , ("fwd_ev"      , fwd_ev      , nodeEntropy info_entropy_var forward)
        , ("fwd_autonomy", fwd_autonomy, nodeEntropy info_autonomy    forward)
        , ("bwd_entropy" , bwd_entropy , nodeEntropy info_entropy     backward)
        , ("bwd_ev"      , bwd_ev      , nodeEntropy info_entropy_var backward)
        , ("bwd_autonomy", bwd_autonomy, nodeEntropy info_autonomy    backward)
        ]
        (\(label, ref, got) -> check sim label ref got)
443
-- | Example corpora for 'testEleve'. Each text is one sentence written as
-- its expected segmentation: segments are separated by spaces and the words
-- of a segment are joined by @-@.
--
-- TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]

example0 = ["New-York is New-York and New-York"]

example1 = ["to-be or not to-be"]

example2 = ["to-be-or not to-be-or NOT to-be and"]

-- > TEST: Should not have York New in the trie
example3 = example0 <> example0

example4 = ["a-b-c-d e a-b-c-d f"]

example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]

example6 =
  [ "le-petit chat"
  , "le-petit chien"
  , "le-petit rat"
  , "le gros rat"
  ]
458
checks0, checks2 :: Checks Double

-- Reference values for 'example0' (used with n = 3 by 'runTestsEleve').
-- Columns:
-- (token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)
checks0 =
  -- single tokens
  [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
  , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
  , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
  , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
  -- bigrams
  , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
  , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
  , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
  , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
  , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
  , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
  , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
  -- trigrams
  , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
  , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
  , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
  , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
  , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
  , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
  , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
  , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
  ]
485
-- Reference values for 'example2': none are currently enabled.
checks2 = []
487 {-
488 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
489 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
490 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
491 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
492 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
493 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
494 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
495 ]
496 -}
497
-- | Run 'testEleve' (with debugging output disabled) on every example
-- corpus, printing the name and window parameter of each case followed by
-- the PASS/FAIL status of its segmentation.
runTestsEleve :: IO ()
runTestsEleve = forM_ cases run
  where
    -- (name, n, corpus, reference values)
    cases =
      [ ("example0", 3, example0, checks0)
      , ("example0", 2, example0, [])
      , ("example1", 2, example1, [])
      , ("example2", 3, example2, checks2)
      , ("example3", 2, example3, [])
      , ("example4", 4, example4, [])
      , ("example5", 5, example5, [])
      , ("example6", 2, example6, [])
      ]

    run (name, n, ex, checks) = do
      P.putStrLn $ name <> " " <> show n
      b <- testEleve False n ex checks
      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"