{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervised Word Segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

# Implementation of Unsupervised Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese.
  Pierre Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7.
  Proceedings of the 50th Annual Meeting of the Association for
  Computational Linguistics, pages 383–387.
  [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for current implementation:
- TODO fix normalization
- TODO extract longer ngrams (see paper above; the Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  import Gargantext.Text.Terms (extractTermsUnsupervised)
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ catMaybes
                             $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeFamilies      #-}

module Gargantext.Text.Eleve where

import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_)
import Data.Ord (Ord)
import qualified Data.List as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy  :: e
  , _info_autonomy :: e
  }

instance Show e => Show (I e) where
  show (I e n) = show (e, n)

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

set_autonomy :: ModEntropy e (I e) e
set_autonomy f e = I e (f e)

data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

parseToken :: Text -> Token
parseToken "<start>" = Terminal Start
parseToken "<stop>"  = Terminal Stop
parseToken t         = NonTerminal t

toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]

printToken :: Token -> Text
printToken = f
  where
    f (NonTerminal x)  = x
    f (Terminal Start) = "<start>"
    f (Terminal Stop)  = "<stop>"

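-- For example:
--
--   toToken ["New", "York"] == [ Terminal Start
--                              , NonTerminal "New"
--                              , NonTerminal "York"
--                              , Terminal Stop ]
--
--   printToken <$> toToken ["New", "York"] == ["<start>", "New", "York", "<stop>"]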
------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count    :: Int }
  deriving (Show)

makeLenses ''Trie

insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c + 1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

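-- For example, inserting two sequences that share their first element,
--
--   insertTries [["a","b"], ["a","c"]]
--
-- builds the trie
--
--   Node 2 () (Map.fromList
--     [("a", Node 2 () (Map.fromList [("b", Leaf 1), ("c", Leaf 1)]))])
--
-- where each node count records how many inserted sequences pass through it.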
-----------------------------

-- | Trie to Tree since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------

nan :: Floating e => e
nan = 0 / 0

noNaNs :: P.RealFloat e => [e] -> [e]
noNaNs = filter (not . P.isNaN)

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e

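-- | Entropy of the child distribution of each node.  At a node of count
--   @c@, a non-terminal child of count @n@ contributes
--   @- (n/c) * logBase 2 (n/c)@, while a terminal child (@<start>@ or
--   @<stop>@) contributes @(n/c) * logBase 2 c@.  For instance, a node of
--   count 4 whose two non-terminal children each have count 2 gets
--   @- (1/2) * logBase 2 (1/2) * 2 = 1@ bit.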
entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
------------------------------------------------------------------------

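-- | Centre and rescale an entropy against its trie level: given the
--   non-NaN entropies @es@ of one level, an entropy @e@ becomes
--   @(e - mean es) / deviation es@, i.e. a z-score within that level.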
normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . noNaNs)

  where
    -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
    checkDiff = identity
    go []  = panic "normalizeLevel: impossible"
             -- trace "normalizeLevel"
    -- go [_] = identity
    go es  = \e -> (e - m) / v
{-
    in if P.isNaN e'
      then trace ("normalizeLevel " <> show (e,m,v,es))
           e
      else e'
-}
      where
        m = mean      es
        v = deviation es

{- Unused

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

-}

class IsTrie trie where
  buildTrie   :: Floating e => [[Token]] -> trie Token e
  nodeEntropy :: Entropy e => Getting e i e -> trie k i -> e
  nodeChild   :: Ord k => k -> trie k e -> trie k e
  findTrie    :: Ord k => [k] -> trie k e -> trie k e
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

  nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
  nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t

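-- For instance, with a normalized double trie @nt@ as built in 'testEleve'
-- below, the autonomy of the bigram "New York" can be read off as:
--
--   nodeAutonomy info_autonomy nt (parseToken <$> T.words "New York")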
instance IsTrie Trie where
  buildTrie = entropyTrie isTerminal . insertTries

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
                                 nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
    where
      go _ []         _                   = panic "normalizeEntropy: empty levels"
      go _ _          (Leaf c)            = Leaf c
      go _ ([] : _)   _                   = panic "normalizeEntropy: empty level"
      go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children


{-
This variant only normalizes a node with respect to its siblings (rather
than against all the nodes of the same level).

normalizeEntropy inE modE = go $ modE identity
  where
    go _ (Leaf c) = Leaf c
    go f (Node c i children)
      | Map.null children =
          panic "normalizeEntropy: impossible"
      | otherwise         =
          Node c (f i) $ go (modE $ normalizeLevel es) <$> children
      where
        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-}
------------------------------------------------------------------------

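-- | Breadth-first view of a trie: the root, then its children, then their
--   children, and so on.  'entropyLevels' keeps only the defined (non-NaN)
--   entropies of each level; these are the populations that
--   'normalizeLevel' centres and rescales against.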
levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
  where
    subForest :: Trie k e -> [Trie k e]
    subForest (Leaf _)            = []
    subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (noNaNs . map (nodeEntropy inE)) . levels

------------------------------------------------------------------------

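-- | A pair of tries over the same corpus: '_fwd' is trained on the token
--   sequences as given and '_bwd' on their reverses, so that 'nodeEntropy'
--   can report the mean of the defined (non-NaN) forward and backward
--   entropies of a node.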
data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

instance IsTrie Tries where
  buildTrie tts = Tries { _fwd = buildTrie tts
                        , _bwd = buildTrie (reverse <$> tts)
                        }

  nodeEntropy inE (Tries fwd bwd) =
    mean $ noNaNs [nodeEntropy inE fwd, nodeEntropy inE bwd]

  findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)

  nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)

------------------------------------------------------------------------
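-- | Segment a token stream using autonomy accretion: the current prefix is
--   extended with the next token @x@ only when the value of @prefix ++ [x]@
--   exceeds the sum of the values of @prefix@ and @[x]@ (the @acc@ test
--   below); otherwise the prefix is emitted as a segment and a new one
--   starts at @x@.  On 'example0' the expected segmentation keeps
--   "New York" together while "is" and "and" stand alone
--   (see 'testEleve' and 'checks0').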
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _  []                   = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0)             = go (nodeChild x0 t0) [x0] xs0
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []                  = [reverse pref]
    go _ pref (Terminal Stop:_)   = [reverse pref]
    go t pref (Terminal Start:xs) = go t pref xs
    go t pref (x:xs) =
      -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
      if acc
        then go xt (x:pref) xs
        else consRev pref $ go xt0 [x] xs
      where
        xt   = nodeChild x t
        xt0  = nodeChild x t0
        et   = ne 0 t
        -- ^ entropy of the current prefix
        ext0 = ne 0 xt0
        -- ^ entropy of [x]
        ext  = ne 0 xt
        -- ^ entropy of the current prefix plus x
        acc  = ext > et + ext0
        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

    ne d t = if P.isNaN e then d else e
      where e = nodeEntropy inE t

{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------
------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve _ _ = []
{-
mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
-}

sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)

chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = L.take n <$> L.tails xs
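-- For example:
--
--   chunkAlongEleve 3 "abcde" == ["abc", "bcd", "cde", "de", "e", ""]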

testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
testEleve debug n output checks = do
  let
    {-
    pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
          | ps <- L.nub $ [ c
                          | m <- [1..n]
                          , cs <- chunkAlong m 1 <$> inp
                          , c <- cs
                          ]
          ]
    -}
    --res = map (map printToken) . split identity fwd <$> inp
    --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
    res = map (map printToken) . split info_autonomy nt <$> inp
  when debug $ do
    P.putStrLn (show input)
    -- mapM_ (P.putStrLn . show) pss
    P.putStrLn ""
    -- printTrie nt
    printTrie (_fwd nt)
    printTrie (_bwd nt)
    P.putStrLn $ show res
  forM_ checks checker
  pure $ expected == res

  where
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken <$> input
    t        = buildTrie $ L.concat $ chunkAlongEleve (n + 2) <$> inp
    -- nt = normalizeEntropy  identity set_autonomy (fwd :: Trie Token Double)
    -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    nt       = normalizeEntropy identity set_autonomy t

    check f msg x y =
      if f x y
        then P.putStrLn $ " PASS " <> msg <> " " <> show x <> " ~= " <> show y
        else P.putStrLn $ " FAIL " <> msg <> " " <> show x <> " /= " <> show y

    checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
      let ns = parseToken <$> T.words ngram
          t' = findTrie ns nt
      P.putStrLn $ " " <> T.unpack ngram <> ":"
      check (==) "count"       count       (_node_count (_fwd t'))
      check sim  "entropy"     entropy     (nodeEntropy info_entropy  t')
      check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
      check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
      check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (findTrie ns (_bwd nt)))

    printTrie =
      P.putStrLn . Tree.drawTree
                 . fmap show
                 . toTree (NonTerminal "")

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- > TEST: Should not have York New in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
           ,"le-petit chien"
           ,"le-petit rat"
           ,"le gros rat"
           ]

checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]

checks0 =
  [("<start> New", 1, nan, nan, nan, nan, 0.0)
  ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
  ,("York is", 1, 0.0, nan, nan, nan, 0.0)
  ,("is New", 1, 0.0, nan, nan, nan, 0.0)
  ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
  ,("York and", 1, 0.0, nan, nan, nan, 0.0)
  ,("and New", 1, 0.0, nan, nan, nan, 0.0)
  ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
  ,("York <stop>", 1, nan, nan, nan, nan, nan)
  ]

checks2 =
  [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
  ,("be or", 2, 0.5, nan, nan, nan, 1.0)
  ,("or not", 1, 0.0, nan, nan, nan, 0.0)
  ,("not to", 1, 0.0, nan, nan, nan, 0.0)
  ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
  ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
  ,("be and", 1, 0.0, nan, nan, nan, 0.0)
  ]


runTests :: IO ()
runTests =
  forM_
    [("example0", 2, example0, checks0)
    ,("example1", 2, example1, [])
    ,("example2", 3, example2, checks2)
    ,("example3", 2, example3, [])
    ,("example4", 4, example4, [])
    ,("example5", 5, example5, [])
    ]
    (\(name, n, ex, checks) -> do
      P.putStrLn $ name <> " " <> show n
      b <- testEleve False n ex checks
      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
    )
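
-- A minimal end-to-end sketch on 'example0', mirroring the 'testEleve'
-- pipeline with n = 2.  The name 'exampleSplit0' is illustrative and not
-- part of the original test suite: it builds the forward/backward tries,
-- normalizes their entropies into autonomies, and prints the segmentation.
exampleSplit0 :: IO ()
exampleSplit0 = do
  let n     = 2
      input = (T.splitOn "-" =<<) . T.words <$> example0
      inp   = toToken <$> input
      t     = buildTrie (L.concat $ chunkAlongEleve (n + 2) <$> inp) :: Tries Token Double
      nt    = normalizeEntropy identity set_autonomy t
      res   = map (map printToken) . split info_autonomy nt <$> inp
  mapM_ (P.putStrLn . show) res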