]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
[DEBUG] Temoin example (merge).
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ catMaybes
32 $ Gargantext.map _hyperdataDocument_abstract docs
33
34 -}
35 {-# LANGUAGE ConstraintKinds #-}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40 {-# LANGUAGE TypeFamilies #-}
41
42 module Gargantext.Text.Eleve where
43
44 import Debug.Trace (trace)
45 -- import Debug.SimpleReflect
46
47 import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
48 import Control.Monad (foldM, mapM_, forM_)
49 import Data.Ord (Ord)
50 import qualified Data.List as L
51 import Data.Monoid
52 import Data.Text (Text)
53 import qualified Data.Text as T
54 import Data.Map (Map)
55 import Data.Maybe (fromMaybe, catMaybes)
56 import qualified Data.Map as Map
57 import Gargantext.Prelude hiding (cs)
58 import qualified Data.Tree as Tree
59 import Data.Tree (Tree)
60 import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)
61
62 type Entropy e =
63 ( Fractional e
64 , Floating e
65 , P.RealFloat e
66 , Show e
67 -- ^ TODO: only used for debugging
68 )
69 ------------------------------------------------------------------------
70 -- | Example and tests for development
71 data I e = I
72 { _info_entropy :: e
73 , _info_autonomy :: e
74 }
75
76 instance Show e => Show (I e) where
77 show (I e n) = show (e, n)
78
79 makeLenses ''I
80
81 type ModEntropy i o e = (e -> e) -> i -> o
82
83 setNormEntropy :: ModEntropy e (I e) e
84 setNormEntropy f e = I e (f e)
85
86 data StartStop = Start | Stop
87 deriving (Ord, Eq, Show)
88
89 data Token = NonTerminal Text
90 | Terminal StartStop
91 deriving (Ord, Eq, Show)
92
93 isTerminal :: Token -> Bool
94 isTerminal (Terminal _) = True
95 isTerminal (NonTerminal _) = False
96
97 toToken :: Int -> [Text] -> [Token]
98 toToken n xs = Terminal Start : (NonTerminal <$> xs) <> [Terminal Stop]
99
100 unToken :: [Token] -> [Text]
101 unToken = map f
102 where
103 f (NonTerminal x) = x
104 f (Terminal _) = ""
105
106 ------------------------------------------------------------------------
107
108 data Trie k e
109 = Node { _node_count :: Int
110 , _node_entropy :: e
111 , _node_children :: Map k (Trie k e)
112 }
113 | Leaf { _node_count :: Int }
114 deriving (Show)
115
116 makeLenses ''Trie
117
118 insertTries :: Ord k => [[k]] -> Trie k ()
119 insertTries = L.foldr insertTrie emptyTrie
120
121 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
122 insertTrie [] n = n { _node_count = _node_count n +1}
123 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
124 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
125 where
126 f = Just . insertTrie xs . fromMaybe emptyTrie
127
128 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
129 -- emptyTrie = Node 0 mempty mempty
130 emptyTrie :: Trie k e
131 emptyTrie = Leaf 0
132
133 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
134 mkTrie c children
135 | Map.null children = Leaf c
136 | otherwise = Node c mempty children
137
138 -----------------------------
139
140 -- | Trie to Tree since Tree as nice print function
141 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
142 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
143 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
144
145 ------------------------------------------------------------------------
146 ------------------------------------------------------------------------
147
148 nan :: Floating e => e
149 nan = 0 / 0
150
151 updateIfDefined :: P.RealFloat e => e -> e -> e
152 updateIfDefined e0 e | P.isNaN e = e0
153 | otherwise = e
154
155 entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
156 entropyTrie _ (Leaf c) = Leaf c
157 entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
158 where
159 e = sum $ map f $ Map.toList children
160 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
161 else - chc * P.logBase 2 chc
162 where
163 chc = fromIntegral (_node_count child) / fromIntegral c
164 ------------------------------------------------------------------------
165
166 normalizeLevel :: Entropy e => [e] -> e -> e
167 normalizeLevel = checkDiff (go . filter (not . P.isNaN))
168
169 where
170 -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
171 checkDiff = identity
172 go [] = panic "normalizeLevel: impossible"
173 -- trace "normalizeLevel"
174 -- go [_] = identity
175 go es = \e -> (e - m) / v
176 {-
177 in if P.isNaN e'
178 then trace ("normalizeLevel " <> show (e,m,v,es))
179 e
180 else e'
181 -}
182 where
183 m = mean es
184 v = deviation es
185
186 {- Unused
187
188 nodeChildren :: Trie k e -> Map k (Trie k e)
189 nodeChildren (Node _ _ cs) = cs
190 nodeChildren (Leaf _) = Map.empty
191
192 -}
193
194 class IsTrie trie where
195 buildTrie :: Floating e => [[Token]] -> trie Token e
196 nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
197 nodeChild :: Ord k => k -> trie k e -> trie k e
198 findTrie :: Ord k => [k] -> trie k e -> trie k e
199 normalizeEntropy :: Entropy e
200 => Getting e i e -> ModEntropy i o e
201 -> trie k i -> trie k o
202
203 nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
204 nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
205
206 instance IsTrie Trie where
207 buildTrie = entropyTrie isTerminal . insertTries
208
209 nodeEntropy inE (Node _ e _) = e ^. inE
210 nodeEntropy _ (Leaf _) = -- trace "nodeEntropy of Leaf" $
211 nan
212
213 nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
214 nodeChild _ (Leaf _) = emptyTrie
215
216 findTrie ks t = L.foldl (flip nodeChild) t ks
217
218 normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
219 where
220 go _ [] _ = panic "normalizeEntropy' empty levels"
221 go _ _ (Leaf c) = Leaf c
222 go _ ([] : _) _ = panic "normalizeEntropy': empty level"
223 go f (es : ess) (Node c i children) =
224 Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
225
226
227 {-
228 This is only normalizing a node with respect to its brothers (unlike all the
229 nodes of the same level).
230
231 normalizeEntropy inE modE = go $ modE identity
232 where
233 go _ (Leaf c) = Leaf c
234 go f (Node c i children)
235 | Map.null children =
236 panic "normalizeEntropy: impossible"
237 | otherwise =
238 Node c (f i) $ go (modE $ normalizeLevel es) <$> children
239 where
240 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
241 -}
242 ------------------------------------------------------------------------
243
244 levels :: Trie k e -> [[Trie k e]]
245 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
246 where
247 subForest :: Trie k e -> [Trie k e]
248 subForest (Leaf _) = []
249 subForest (Node _ _ children) = Map.elems children
250
251 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
252 entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
253
254 ------------------------------------------------------------------------
255
256 data Tries k e = Tries
257 { _fwd :: Trie k e
258 , _bwd :: Trie k e
259 }
260
261 instance IsTrie Tries where
262 buildTrie tts = Tries { _fwd = buildTrie tts
263 , _bwd = buildTrie (reverse <$> tts)
264 }
265
266 nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]
267
268 findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)
269
270 nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)
271
272 normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)
273
274 onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
275 onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
276
277 ------------------------------------------------------------------------
278 split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
279 split _ _ [] = []
280 split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
281 split inE t0 (x0:xs0) = go (nodeChild x0 t0) [x0] xs0
282 where
283 consRev [] xss = xss
284 consRev xs xss = reverse xs : xss
285
286 go _ pref [] = [reverse pref]
287 go _ pref (Terminal Stop:_) = [reverse pref]
288 go t pref (Terminal Start:xs) = go t pref xs
289 go t pref (x:xs) =
290 -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
291 if acc
292 then go xt (x:pref) xs
293 else consRev pref $ go xt0 [x] xs
294 where
295 xt = nodeChild x t
296 xt0 = nodeChild x t0
297 et = ne 0 t
298 -- ^ entropy of the current prefix
299 ext0 = ne 0 xt0
300 -- ^ entropy of [x]
301 ext = ne 0 xt
302 -- ^ entropy of the current prefix plus x
303 acc = ext > et + ext0
304 -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])
305
306 ne d t = if P.isNaN e then d else e
307 where e = nodeEntropy inE t
308
309 {-
310 split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
311 split inE t0 ts =
312 maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
313 -}
314
315 ------------------------------------------------------------------------
316 ------------------------------------------------------------------------
317
318 mainEleve :: Int -> [[Text]] -> [[[Text]]]
319 mainEleve _ _ = []
320 {-
321 mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
322 where
323 inp = toToken (n - 1) <$> input
324 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
325 -}
326
327 sim :: Entropy e => e -> e -> Bool
328 sim x y = x == y || (P.isNaN x && P.isNaN y)
329
330 testEleve :: e ~ Double => Bool -> Int -> [Text] -> [(Text, Int, e, e, e, e, e)] -> IO Bool
331 testEleve debug n output checks = do
332 let
333 {-
334 pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
335 | ps <- L.nub $ [ c
336 | m <- [1..n]
337 , cs <- chunkAlong m 1 <$> inp
338 , c <- cs
339 ]
340 ]
341 -}
342 --res = map unToken . split identity fwd <$> inp
343 --res = map unToken . split info_norm_entropy' nt' <$> inp
344 res = map unToken . split info_autonomy nt <$> inp
345 when debug $ do
346 P.putStrLn (show input)
347 -- mapM_ (P.putStrLn . show) pss
348 P.putStrLn ""
349 -- printTrie nt
350 printTrie (_fwd nt)
351 printTrie (_bwd nt)
352 P.putStrLn $ show res
353 forM_ checks checker
354 pure $ expected == res
355
356 where
357 out = T.words <$> output
358 expected = fmap (T.splitOn "-") <$> out
359 input = (T.splitOn "-" =<<) <$> out
360 inp = toToken (n - 1) <$> input
361 t = buildTrie $ L.concat $ chunkAlong (n + 1) 1 <$> inp
362 -- nt = normalizeEntropy identity setNormEntropy (fwd :: Trie Token Double)
363 -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
364 nt = normalizeEntropy identity setNormEntropy t
365
366 check f msg x y =
367 if f x y
368 then P.putStrLn $ " PASS " <> msg <> " " <> show x <> " ~= " <> show y
369 else P.putStrLn $ " FAIL " <> msg <> " " <> show x <> " /= " <> show y
370
371 checker (ngram, count, entropy, _ev, autonomy, bwd_entropy, fwd_entropy) = do
372 let ns = NonTerminal <$> T.words ngram
373 t' = findTrie ns nt
374 P.putStrLn $ " " <> T.unpack ngram <> ":"
375 check (==) "count" count (_node_count (_fwd t'))
376 check sim "entropy" entropy (nodeEntropy info_entropy t')
377 check sim "autonomy" autonomy (nodeEntropy info_autonomy t')
378 check sim "fwd_entropy" fwd_entropy (nodeEntropy info_entropy (_fwd t'))
379 check sim "bwd_entropy" bwd_entropy (nodeEntropy info_entropy (findTrie ns (_bwd nt)))
380
381 printTrie =
382 P.putStrLn . Tree.drawTree
383 . fmap show
384 . toTree (NonTerminal "")
385
386 -- | TODO real data is a list of tokenized sentences
387 example0, example1, example2, example3, example4, example5, example6 :: [Text]
388 example0 = ["New-York is New-York and New-York"]
389 example1 = ["to-be or not to-be"]
390 example2 = ["to-be-or not to-be-or NOT to-be and"]
391 example3 = example0 <> example0
392 -- > TEST: Should not have York New in the trie
393 example4 = ["a-b-c-d e a-b-c-d f"]
394 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
395 example6 = ["le-petit chat"
396 ,"le-petit chien"
397 ,"le-petit rat"
398 ,"le gros rat"
399 ]
400
401 checks0, checks2 :: [(Text, Int, Double, Double, Double, Double, Double)]
402
403 checks0 = [("\ue02b New", 1, nan, nan, nan, nan, 0.0), ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156), ("York is", 1, 0.0, nan, nan, nan, 0.0), ("is New", 1, 0.0, nan, nan, nan, 0.0), ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156), ("York and", 1, 0.0, nan, nan, nan, 0.0), ("and New", 1, 0.0, nan, nan, nan, 0.0), ("New York", 3, 1.584962500721156, 1.584962500721156, 1.414213562373095, nan, 1.584962500721156), ("York \ue02d", 1, nan, nan, nan, nan, nan)]
404
405 checks2 =
406 [("to be", 3, 1.2516291673878228, 1.2516291673878228, 1.5535694744293167, nan, 0.9182958340544896)
407 ,("be or", 2, 0.5, nan, nan, nan, 1.0)
408 ,("or not", 1, 0.0, nan, nan, nan, 0.0)
409 ,("not to", 1, 0.0, nan, nan, nan, 0.0)
410 ,("or NOT", 1, 0.0, nan, nan, nan, 0.0)
411 ,("NOT to", 1, 0.0, nan, nan, nan, 0.0)
412 ,("be and", 1, 0.0, nan, nan, nan, 0.0)
413 ]
414
415
416 runTests :: IO ()
417 runTests =
418 forM_
419 [("example0", 2, example0, checks0)
420 ,("example1", 2, example1, [])
421 ,("example2", 3, example2, checks2)
422 ,("example3", 2, example3, [])
423 ,("example4", 4, example4, [])
424 ,("example5", 5, example5, [])
425 ]
426 (\(name, n, ex, checks) -> do
427 P.putStrLn $ name <> " " <> show n
428 b <- testEleve False n ex checks
429 P.putStrLn $ " splitting: " <> if b then "PASS" else "FAIL"
430 )