{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervised Word Segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

# Implementation of Unsupervised Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese, Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics,
  pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for current implementation:
- TODO fix normalization
- TODO extract longer ngrams (see paper above, the Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  import Gargantext.Text.Terms (extractTermsUnsupervised)
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ catMaybes
                             $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-# LANGUAGE TypeFamilies      #-}

module Gargantext.Text.Eleve where

import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_)
import Data.Ord (Ord)
import qualified Data.List as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy  :: e
  , _info_autonomy :: e
  }

instance Show e => Show (I e) where
  show (I e n) = show (e, n)

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

set_autonomy :: ModEntropy e (I e) e
set_autonomy f e = I e (f e)
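-- A minimal usage sketch (illustration only, not part of the original tests):
-- 'set_autonomy' plugs a function computing autonomy from an entropy value
-- into the 'ModEntropy' shape. The halving function below is arbitrary;
-- in GHCi:
--   set_autonomy (/ 2) 1.0 :: I Double
-- yields (1.0,0.5): the entropy is kept and the autonomy is derived from it.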

data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

parseToken :: Text -> Token
parseToken "<start>" = Terminal Start
parseToken "<stop>"  = Terminal Stop
parseToken t         = NonTerminal t

toToken :: [Text] -> [Token]
toToken xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]

printToken :: Token -> Text
printToken = f
  where
    f (NonTerminal x)  = x
    f (Terminal Start) = "<start>"
    f (Terminal Stop)  = "<stop>"
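-- Round-trip sketch (illustration only): 'toToken' wraps a sentence with the
-- sentinel tokens and 'printToken' maps tokens back to text, e.g. in GHCi:
--   toToken ["new", "york"]
--     == [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop]
--   printToken <$> toToken ["new", "york"]
--     == ["<start>", "new", "york", "<stop>"]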

------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count    :: Int }
  deriving (Show)

makeLenses ''Trie

insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children
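-- A small worked example (illustration only): inserting two sequences that
-- share their first element yields a root counting both insertions, one
-- inner node, and one leaf per distinct suffix. In GHCi:
--   insertTries [["a","b"], ["a","c"]] :: Trie Text ()
-- is a Node with _node_count 2 whose single child at "a" is a Node with
-- _node_count 2, itself holding the leaves Leaf 1 at "b" and Leaf 1 at "c".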

-----------------------------

-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------

nan :: Floating e => e
nan = 0 / 0

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e

entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
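-- Branching entropy, spelled out (this restates the code above): for a node
-- seen c times, each non-terminal child k with count c_k contributes
-- - (c_k / c) * logBase 2 (c_k / c), while a terminal child (<start>/<stop>)
-- contributes (c_k / c) * logBase 2 c, i.e. each terminal occurrence counts
-- as a distinct continuation. For instance, in example0 the forward trie
-- sees "New York" 3 times, followed once each by "is", "and" and <stop>,
-- so its entropy is 3 * (1/3) * logBase 2 3 ~= 1.585, the fwd_entropy value
-- recorded for "New York" in checks0 below.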
------------------------------------------------------------------------

normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . filter (not . P.isNaN))

  where
    -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
    checkDiff = identity
    go [] = panic "normalizeLevel: impossible"
            -- trace "normalizeLevel"
    -- go [_] = identity
    go es = \e -> (e - m) / v
      {-
      in if P.isNaN e'
           then trace ("normalizeLevel " <> show (e,m,v,es))
                e
           else e'
      -}
      where
        m = mean      es
        v = deviation es
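-- In other words, 'normalizeLevel es e' is a z-score: the entropy e is
-- centred and scaled by the mean and deviation of the (non-NaN) entropies es
-- of the same trie level, so values become comparable across levels.
-- Sketch (illustration only, the exact number depends on the deviation
-- convention in Gargantext.Prelude):
--   normalizeLevel [1, 2, 3] 3  ==  (3 - mean [1,2,3]) / deviation [1,2,3]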

{- Unused

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

-}

class IsTrie trie where
  buildTrie   :: Floating e => [[Token]] -> trie Token e
  nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
  nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
  findTrie    :: Ord k => [k] -> trie k e -> trie k e
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

  nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
  nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
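-- Usage sketch (illustration only): given a normalized trie such as the 'nt'
-- built inside 'testEleve' below, the autonomy of an ngram is read off with
--   nodeAutonomy info_autonomy nt (parseToken <$> T.words "New York")
-- which descends the trie along the ngram and returns its autonomy field.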

instance IsTrie Trie where
  buildTrie = entropyTrie isTerminal . insertTries

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
                                 nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
    where
      go _ []         _                   = panic "normalizeEntropy' empty levels"
      go _ _          (Leaf c)            = Leaf c
      go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
      go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children


{-
This normalizes a node only with respect to its siblings (rather than all
the nodes of the same level).

normalizeEntropy inE modE = go $ modE identity
  where
    go _ (Leaf c) = Leaf c
    go f (Node c i children)
      | Map.null children =
          panic "normalizeEntropy: impossible"
      | otherwise =
          Node c (f i) $ go (modE $ normalizeLevel es) <$> children
        where
          es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-}
------------------------------------------------------------------------

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
  where
    subForest :: Trie k e -> [Trie k e]
    subForest (Leaf _)            = []
    subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
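-- Sketch (illustration only): 'levels' lists the trie's nodes breadth first,
-- one list per depth, and 'entropyLevels' extracts their defined entropies.
-- For the two-sequence trie from the 'insertTries' example above, 'levels'
-- has three elements: [root], [inner node at "a"], and the two leaves.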

------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

instance IsTrie Tries where
  buildTrie tts = Tries { _fwd = buildTrie tts
                        , _bwd = buildTrie (reverse <$> tts)
                        }

  nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]

  findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)

  nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
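-- Sketch (illustration only): a 'Tries' pairs a forward trie with a trie of
-- the reversed sequences, so an ngram lookup combines forward and backward
-- branching entropies (averaged by 'nodeEntropy' above). For instance
--   buildTrie (toToken <$> [["new","york"]]) :: Tries Token Double
-- indexes ["new","york"] in '_fwd' and the fully reversed token list
-- (sentinels included) in '_bwd', which is why 'findTrie' reverses its keys
-- on the backward side.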

------------------------------------------------------------------------
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _  []                   = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0)             = go (nodeChild x0 t0) [x0] xs0
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []                  = [reverse pref]
    go _ pref (Terminal Stop:_)   = [reverse pref]
    go t pref (Terminal Start:xs) = go t pref xs
    go t pref (x:xs) =
        -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
        if acc
          then go xt (x:pref) xs
          else consRev pref $ go xt0 [x] xs
      where
        xt   = nodeChild x t
        xt0  = nodeChild x t0
        et   = ne 0 t
        -- ^ entropy of the current prefix
        ext0 = ne 0 xt0
        -- ^ entropy of [x]
        ext  = ne 0 xt
        -- ^ entropy of the current prefix plus x
        acc  = ext > et + ext0
        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

    ne d t = if P.isNaN e then d else e
      where e = nodeEntropy inE t
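-- Behaviour sketch (this is what 'testEleve' checks for example0 below):
-- with the normalized Tries 'nt' built there,
--   map printToken <$> split info_autonomy nt (toToken (T.words "New York is New York and New York"))
-- is expected to give the segmentation
--   [["New","York"],["is"],["New","York"],["and"],["New","York"]]
-- because the autonomy of "New York" exceeds the sum of the autonomies of
-- its parts (the 'acc' test above).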

{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------
------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve _ _ = []
{-
mainEleve n input = map (map printToken) . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
-}

sim :: Entropy e => e -> e -> Bool
sim x y = x == y || (P.isNaN x && P.isNaN y)

chunkAlongEleve :: Int -> [a] -> [[a]]
chunkAlongEleve n xs = L.take n <$> L.tails xs
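-- Sketch (illustration only): every suffix is truncated to length n, so the
-- trailing chunks are shorter and the empty suffix is kept, e.g.
--   chunkAlongEleve 3 "abcde" == ["abc","bcd","cde","de","e",""]
-- 'testEleve' below feeds chunks of length n + 2 (presumably the ngram length
-- plus the two sentinel tokens) into 'buildTrie'.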

testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
testEleve debug n output checks = do
  let
    {-
    pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
          | ps <- L.nub $ [ c
                          | m <- [1..n]
                          , cs <- chunkAlong m 1 <$> inp
                          , c <- cs
                          ]
          ]
    -}
    --res = map (map printToken) . split identity fwd <$> inp
    --res = map (map printToken) . split info_norm_entropy' nt' <$> inp
    res = map (map printToken) . split info_autonomy nt <$> inp
  when debug $ do
    P.putStrLn (show input)
    -- mapM_ (P.putStrLn . show) pss
    P.putStrLn ""
    -- printTrie nt
    printTrie (_fwd nt)
    printTrie (_bwd nt)
    P.putStrLn $ show res
  forM_ checks checker
  pure $ expected == res

  where
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken <$> input
    t        = buildTrie $ L.concat $ chunkAlongEleve (n + 2) <$> inp
    -- nt = normalizeEntropy  identity     set_autonomy (fwd :: Trie Token Double)
    -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    nt       = normalizeEntropy identity set_autonomy t

    check f msg x y =
      if f x y
        then P.putStrLn $ " PASS " <> msg <> " " <> show x <> " ~= " <> show y
        else P.putStrLn $ " FAIL " <> msg <> " " <> show x <> " /= " <> show y

    checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
      let ns = parseToken <$> T.words ngram
          t' = findTrie ns nt
      P.putStrLn $ " " <> T.unpack ngram <> ":"
      check (==) "count"       count       (_node_count (_fwd t'))
      check sim  "entropy"     entropy     (nodeEntropy info_entropy  t')
      check sim  "autonomy"    autonomy    (nodeEntropy info_autonomy t')
      check sim  "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
      check sim  "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (findTrie ns (_bwd nt)))

    printTrie =
      P.putStrLn . Tree.drawTree
                 . fmap show
                 . toTree (NonTerminal "")
-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- > TEST: Should not have York New in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
           ,"le-petit chien"
           ,"le-petit rat"
           ,"le gros rat"
           ]

checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]

checks0 =
  [("<start> New", 1, nan, nan, nan, nan, 0.0)
  ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
  ,("York is", 1, 0.0, nan, nan, nan, 0.0)
  ,("is New", 1, 0.0, nan, nan, nan, 0.0)
  ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
  ,("York and", 1, 0.0, nan, nan, nan, 0.0)
  ,("and New", 1, 0.0, nan, nan, nan, 0.0)
  ,("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156)
  ,("York <stop>", 1, nan, nan, nan, nan, nan)
  ]

checks2 =
  [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
  ,("be or", 2, 0.5, nan, nan, nan, 1.0)
  ,("or not", 1, 0.0, nan, nan, nan, 0.0)
  ,("not to", 1, 0.0, nan, nan, nan, 0.0)
  ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
  ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
  ,("be and", 1, 0.0, nan, nan, nan, 0.0)
  ]


runTests :: IO ()
runTests =
  forM_
    [("example0", 2, example0, checks0)
    ,("example1", 2, example1, [])
    ,("example2", 3, example2, checks2)
    ,("example3", 2, example3, [])
    ,("example4", 4, example4, [])
    ,("example5", 5, example5, [])
    ]
    (\(name, n, ex, checks) -> do
      P.putStrLn $ name <> " " <> show n
      b <- testEleve False n ex checks
      P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
    )
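-- Usage sketch (GHCi, illustration only):
--   λ> runTests
-- runs every example above and reports PASS/FAIL per check and per split;
--   λ> testEleve True 2 example0 checks0
-- does the same for a single example with debug output (the input tokens,
-- the forward and backward tries, and the computed segmentation) printed.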