]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/Eleve.hs
Merge remote-tracking branch 'home/dev' into dev-merge
[gargantext.git] / src / Gargantext / Text / Terms / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Terms.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
24 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
25
26 - AD: Real ngrams extraction test
27 from Gargantext.Text.Terms import extractTermsUnsupervised
28 docs <- runCmdRepl $ selectDocs 1004
29 extractTermsUnsupervised 3 $ DT.intercalate " "
30 $ catMaybes
31 $ Gargantext.map _hyperdataDocument_abstract docs
32
33 -}
34 {-# LANGUAGE ConstraintKinds #-}
35 {-# LANGUAGE NoImplicitPrelude #-}
36 {-# LANGUAGE OverloadedStrings #-}
37 {-# LANGUAGE RankNTypes #-}
38 {-# LANGUAGE TemplateHaskell #-}
39 {-# LANGUAGE TypeFamilies #-}
40
41 module Gargantext.Text.Terms.Eleve where
42
43 -- import Debug.Trace (trace)
44 -- import Debug.SimpleReflect
45
46 import Control.Lens hiding (levels, children)
47 import Control.Monad (forM_)
48 import Data.Ord (Ord)
49 import qualified Data.List as L
50 import Data.Monoid
51 import Data.Text (Text)
52 import qualified Data.Text as T
53 import Data.Map (Map)
54 import Data.Maybe (fromMaybe)
55 import qualified Data.Map as Map
56 import Gargantext.Prelude hiding (cs)
57 import qualified Data.Tree as Tree
58 import Data.Tree (Tree)
59 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
60
61 nan :: Floating e => e
62 nan = 0 / 0
63
64 noNaNs :: P.RealFloat e => [e] -> [e]
65 noNaNs = filter (not . P.isNaN)
66
67 updateIfDefined :: P.RealFloat e => e -> e -> e
68 updateIfDefined e0 e | P.isNaN e = e0
69 | otherwise = e
70
71 sim :: Entropy e => e -> e -> Bool
72 sim x y = x == y || (P.isNaN x && P.isNaN y)
73
74 subst :: Entropy e => (e, e) -> e -> e
75 subst (src, dst) x | sim src x = dst
76 | otherwise = x
77 ------------------------------------------------------------------------
78
79 type Entropy e =
80 ( Fractional e
81 , Floating e
82 , P.RealFloat e
83 , Show e
84 -- ^ TODO: only used for debugging
85 )
86 ------------------------------------------------------------------------
87 -- | Example and tests for development
88 data I e = I
89 { _info_entropy :: e
90 , _info_entropy_var :: e
91 , _info_autonomy :: e
92 }
93
94 instance Show e => Show (I e) where
95 show (I e ev a) = show (e, ev, a)
96
97 makeLenses ''I
98
99 type ModEntropy i o e = (e -> e) -> i -> o
100
101 set_autonomy :: Entropy e => ModEntropy (I e) (I e) e
102 set_autonomy fe i = i & info_autonomy .~ fe (i ^. info_entropy_var)
103
104 set_entropy_var :: Entropy e => Setter e (I e) e e
105 set_entropy_var f e = (\ev -> I e ev nan) <$> f e
106
107 data StartStop = Start | Stop
108 deriving (Ord, Eq, Show)
109
110 data Token = NonTerminal Text
111 | Terminal StartStop
112 deriving (Ord, Eq, Show)
113
114 isTerminal :: Token -> Bool
115 isTerminal (Terminal _) = True
116 isTerminal (NonTerminal _) = False
117
118 nonTerminals :: [Token] -> [Text]
119 nonTerminals ts = [nt | NonTerminal nt <- ts]
120
121 parseToken :: Text -> Token
122 parseToken "<start>" = Terminal Start
123 parseToken "<stop>" = Terminal Stop
124 parseToken t = NonTerminal t
125
126 toToken :: [Text] -> [Token]
127 toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
128
129 printToken :: Token -> Text
130 printToken = f
131 where
132 f (NonTerminal x) = x
133 f (Terminal Start) = "<start>"
134 f (Terminal Stop) = "<stop>"
135 ------------------------------------------------------------------------
136
137 data Trie k e
138 = Node { _node_count :: Int
139 , _node_entropy :: e
140 , _node_children :: Map k (Trie k e)
141 }
142 | Leaf { _node_count :: Int }
143 deriving (Show)
144
145 makeLenses ''Trie
146
147 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
148 insertTrie [] n = n { _node_count = _node_count n +1}
149 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
150 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
151 where
152 f = Just . insertTrie xs . fromMaybe emptyTrie
153
154 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
155 -- emptyTrie = Node 0 mempty mempty
156 emptyTrie :: Trie k e
157 emptyTrie = Leaf 0
158
159 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
160 mkTrie c children
161 | Map.null children = Leaf c
162 | otherwise = Node c mempty children
163
164 -----------------------------
165 -- | Trie to Tree since Tree as nice print function
166 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
167 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
168 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
169
170 ------------------------------------------------------------------------
171 ------------------------------------------------------------------------
172 normalizeLevel :: Entropy e => e -> e -> e -> e
173 normalizeLevel m v e = (e - m) / v
174
175 {- Unused
176
177 nodeChildren :: Trie k e -> Map k (Trie k e)
178 nodeChildren (Node _ _ cs) = cs
179 nodeChildren (Leaf _) = Map.empty
180
181 -}
182
183 chunkAlongEleve :: Int -> [a] -> [[a]]
184 chunkAlongEleve n xs = L.take n <$> L.tails xs
185
186 data Direction = Backward | Forward
187
188 buildTrie :: Direction -> Int -> [[Token]] -> Trie Token ()
189 buildTrie d n sentences
190 = L.foldr insertTrie emptyTrie
191 . L.concat
192 $ ( filter (/= [Terminal (term d)])
193 . chunkAlongEleve (n + 1)
194 . order d
195 )
196 <$> sentences
197 where
198 order Forward = identity
199 order Backward = reverse
200 term Forward = Stop
201 term Backward = Start
202
203 class IsTrie trie where
204 entropyTrie :: Entropy e => (k -> Bool) -> trie k () -> trie k e
205 nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
206 nodeChild :: Ord k => k -> trie k e -> trie k e
207 findTrie :: Ord k => [k] -> trie k e -> trie k e
208 printTrie :: (Show i, Entropy e) => Getting e i e -> trie Token i -> IO ()
209 evTrie :: Entropy e => Getting e i e -> Setter i o e e -> trie k i -> trie k o
210 normalizeEntropy :: Entropy e
211 => Getting e i e -> ModEntropy i o e
212 -> trie k i -> trie k o
213
214 instance IsTrie Trie where
215
216 entropyTrie _ (Leaf c) = Leaf c
217 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
218 where
219 children' = Map.toList children
220 sum_count = sum $ _node_count . snd <$> children'
221 e | sum_count == 0 = nan
222 | otherwise = sum $ f <$> children'
223 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
224 else - chc * P.logBase 2 chc
225 where
226 chc = fromIntegral (_node_count child) / fromIntegral c
227
228 nodeEntropy inE (Node _ e _) = e ^. inE
229 nodeEntropy _ (Leaf _) = nan
230
231 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
232 nodeChild _ (Leaf _) = emptyTrie
233
234 findTrie ks t = L.foldl (flip nodeChild) t ks
235
236 printTrie inE t = do
237 P.putStrLn . Tree.drawTree
238 . fmap show
239 $ toTree (NonTerminal "") t
240 P.putStrLn " Levels:"
241 forM_ (normalizationLevels inE t) $ \level ->
242 P.putStrLn $ " " <> show level
243
244 evTrie inE setEV = go nan
245 where
246 go _ (Leaf c) = Leaf c
247 go e0 (Node c i children) = Node c (i & setEV .~ ev e0 e1) $ go e1 <$> children
248 where e1 = i ^. inE
249
250 ev 0 0 = nan
251 ev i0 i1 = i1 - i0
252
253 normalizeEntropy inE modE t = go (modE identity) (normalizationLevels inE t) t
254 where
255 go _ _ (Leaf c) = Leaf c
256 go _ [] _ = panic "normalizeEntropy' empty levels"
257 go f ((m, v, _) : ess) (Node c i children)
258 = Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
259 ------------------------------------------------------------------------
260
261 levels :: Trie k e -> [[Trie k e]]
262 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
263 where
264 subForest :: Trie k e -> [Trie k e]
265 subForest (Leaf _) = []
266 subForest (Node _ _ children) = Map.elems children
267
268 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
269 entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . L.tail . levels
270
271 normalizationLevels :: Entropy e => Getting e i e -> Trie k i -> [(e, e, Int)]
272 normalizationLevels inE = fmap f . entropyLevels inE
273 where
274 f es = (mean es, deviation es, length es)
275
276 ------------------------------------------------------------------------
277
278 data Tries k e = Tries
279 { _fwd :: Trie k e
280 , _bwd :: Trie k e
281 }
282
283 makeLenses ''Tries
284
285 buildTries :: Int -> [[Token]] -> Tries Token ()
286 buildTries n sentences = Tries
287 { _fwd = buildTrie Forward n sentences
288 , _bwd = buildTrie Backward n sentences
289 }
290
291 instance IsTrie Tries where
292
293 nodeEntropy inE (Tries f b) = mean [nodeEntropy inE f, nodeEntropy inE b]
294
295 findTrie ks (Tries f b) = Tries (findTrie ks f) (findTrie (reverse ks) b)
296
297 nodeChild = onTries . nodeChild
298
299 entropyTrie = onTries . entropyTrie
300
301 evTrie inE setEV = onTries $ evTrie inE setEV
302
303 normalizeEntropy inE = onTries . normalizeEntropy inE
304
305 printTrie inE (Tries f b) = do
306 P.putStrLn "Forward:"
307 printTrie inE f
308 P.putStrLn ""
309 P.putStrLn "Backward:"
310 printTrie inE b
311
312 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
313 onTries h (Tries f b) = Tries (h f) (h b)
314
315 ------------------------------------------------------------------------
316 mayCons :: [a] -> [[a]] -> [[a]]
317 mayCons [] xss = xss
318 mayCons xs xss = xs : xss
319
320 {-
321 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
322 split _ _ [] = []
323 split inE t (Terminal Start:xs) = split inE t xs
324 split inE t (x0:xs0) = go [x0] xs0
325 where
326 go pref [] = [pref]
327 go pref (Terminal Stop:_) = [pref]
328 go _ (Terminal Start:_) = panic "split impossible"
329 go pref (x:xs) =
330 -- trace (show (if acc then "ACC" else "CUT", (prefx, epxt), if acc then ">" else "<=", ((pref, ept), "+", ([x], ext)))) $
331 if acc
332 then go prefx xs
333 else mayCons pref $ go [x] xs
334 where
335 prefx = pref <> [x]
336 pt = findTrie pref t
337 pxt = findTrie prefx t
338 xt = findTrie [x] t
339 ept = ne pt
340 -- ^ entropy of the current prefix
341 ext = ne xt
342 -- ^ entropy of [x]
343 epxt = ne pxt
344 -- ^ entropy of the current prefix plus x
345 acc = P.isNaN ept || P.isNaN ext || not (P.isNaN epxt) -- && (epxt > mean [ept, ext])
346
347 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
348
349 ne = nodeEntropy inE
350 -}
351
352 split :: Entropy e => Int -> Lens' i e -> Tries Token i -> [Token] -> [[Text]]
353 split _ _ _ [] = []
354 split _ _ _ [t] = pure <$> nonTerminals [t]
355 split n inE t ts = nonTerminals pref `mayCons` split n inE t (drop (length pref) ts)
356 where
357 pref = maximumWith (\ks -> nodeEntropy inE $ findTrie ks t)
358 (L.tail . L.inits . take n $ ts)
359
360
361 {-
362 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
363 split inE t0 ts =
364 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
365 -}
366
367 ------------------------------------------------------------------------
368
369 mainEleve :: Int -> [[Text]] -> [[[Text]]]
370 mainEleve n i = mainEleveWith m n i
371 where
372 m = buildTries n (fmap toToken i)
373
374 mainEleveWith :: Tries Token () -> Int -> [[Text]] -> [[[Text]]]
375 mainEleveWith m n i = fmap (split n info_autonomy t) (fmap toToken i)
376 where
377 t :: Tries Token (I Double)
378 t = normalizeEntropy info_entropy_var set_autonomy
379 $ evTrie identity set_entropy_var
380 $ entropyTrie isTerminal m
381
382 ------------------------------------------------------------------------
383
384 type Checks e = [(Text, Int, e, e, e, e, e, e, e, e, e)]
385
386 testEleve :: e ~ Double => Bool -> Int -> [Text] -> Checks e -> IO Bool
387 testEleve debug n output checks = do
388 let
389 res = split n info_autonomy nt <$> inp
390 when debug $ do
391 P.putStrLn $ show input
392 P.putStrLn ""
393 printTrie info_entropy nt
394 P.putStrLn ""
395 P.putStrLn "Splitting:"
396 P.putStrLn $ show res
397 forM_ checks checker
398 pure $ expected == res
399
400 where
401 out = T.words <$> output
402 expected = fmap (T.splitOn "-") <$> out
403 input = (T.splitOn "-" =<<) <$> out
404 inp = toToken <$> input
405
406 nt :: Tries Token (I Double)
407 nt = normalizeEntropy info_entropy_var set_autonomy
408 . evTrie identity set_entropy_var
409 . entropyTrie isTerminal
410 $ buildTries n inp
411
412 check f msg ref my =
413 if f ref my
414 then P.putStrLn $ " \ESC[32mPASS\ESC[m " <> msg <> " " <> show ref
415 else P.putStrLn $ " \ESC[31mFAIL\ESC[m " <> msg <> " ref=" <> show ref <> " my=" <> show my
416
417 checker (ngram, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy) = do
418 let ns = parseToken <$> T.words ngram
419 nt' = findTrie ns nt
420
421 P.putStrLn $ " " <> T.unpack ngram <> ":"
422 check (==) "count" count (_node_count (_fwd nt'))
423
424 check sim "entropy" entropy (nodeEntropy info_entropy nt' )
425 check sim "ev" ev (nodeEntropy info_entropy_var nt' )
426 check sim "autonomy" autonomy (nodeEntropy info_autonomy nt' )
427
428 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd nt'))
429 check sim "fwd_ev" fwd_ev (nodeEntropy info_entropy_var (_fwd nt'))
430 check sim "fwd_autonomy" fwd_autonomy (nodeEntropy info_autonomy (_fwd nt'))
431
432 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (_bwd nt'))
433 check sim "bwd_ev" bwd_ev (nodeEntropy info_entropy_var (_bwd nt'))
434 check sim "bwd_autonomy" bwd_autonomy (nodeEntropy info_autonomy (_bwd nt'))
435
436 -- | TODO real data is a list of tokenized sentences
437 example0, example1, example2, example3, example4, example5, example6 :: [Text]
438 example0 = ["New-York is New-York and New-York"]
439 example1 = ["to-be or not to-be"]
440 example2 = ["to-be-or not to-be-or NOT to-be and"]
441 example3 = example0 <> example0
442 -- > TEST: Should not have York New in the trie
443 example4 = ["a-b-c-d e a-b-c-d f"]
444 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
445 example6 = ["le-petit chat"
446 ,"le-petit chien"
447 ,"le-petit rat"
448 ,"le gros rat"
449 ]
450
451 checks0, checks2 :: Checks Double
452
453 checks0 =
454 -- [(token, count, entropy, ev, autonomy, fwd_entropy, fwd_ev, fwd_autonomy, bwd_entropy, bwd_ev, bwd_autonomy)]
455 [ ("<start>", 1, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002, nan, nan, nan)
456 , ("New", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 0.0, -2.113283334294875, -0.5000000000000002, 1.584962500721156, -0.5283208335737188, 2.0)
457 , ("York", 3, 0.792481250360578, -1.3208020839342969, 0.7499999999999999, 1.584962500721156, -0.5283208335737188, 2.0, 0.0, -2.113283334294875, -0.5000000000000002)
458 , ("is", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
459 , ("and", 1, 0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002, 0.0, -2.113283334294875, -0.5000000000000002)
460 , ("<stop>", 0, nan, nan, nan, nan, nan, nan, 0.0, -2.113283334294875, -0.5000000000000002)
461 , ("<start> New", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
462 , ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, 1.584962500721156, 1.584962500721156, 1.4142135623730947, 1.584962500721156, 1.584962500721156, 1.4142135623730951)
463 , ("York is", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
464 , ("is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
465 , ("York and", 1, 0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865476, 0.0, nan, nan)
466 , ("and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, -0.7071067811865474)
467 , ("York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
468 , ("<start> New York", 1, nan, nan, nan, 0.0, nan, nan, nan, nan, nan)
469 , ("New York is", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
470 , ("York is New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
471 , ("is New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
472 , ("New York and", 1, 0, nan, nan, 0.0, -1.584962500721156, nan, 0.0, nan, nan)
473 , ("York and New", 1, 0, nan, nan, 0.0, nan, nan, 0.0, nan, nan)
474 , ("and New York", 1, 0, nan, nan, 0.0, nan, nan, 0.0, -1.584962500721156, nan)
475 , ("New York <stop>", 1, nan, nan, nan, nan, nan, nan, 0.0, nan, nan)
476 ]
477
478 checks2 = []
479 {-
480 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
481 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
482 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
483 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
484 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
485 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
486 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
487 ]
488 -}
489
490 runTestsEleve :: IO ()
491 runTestsEleve =
492 forM_
493 [("example0", 3, example0, checks0)
494 ,("example0", 2, example0, [])
495 ,("example1", 2, example1, [])
496 ,("example2", 3, example2, checks2)
497 ,("example3", 2, example3, [])
498 ,("example4", 4, example4, [])
499 ,("example5", 5, example5, [])
500 ,("example6", 2, example6, [])
501 ]
502 (\(name, n, ex, checks) -> do
503 P.putStrLn $ name <> " " <> show n
504 b <- testEleve False n ex checks
505 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
506 )