{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervized Word Segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental
Portability : POSIX

# Implementation of Unsupervized Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese.
  Pierre Magistry, Benoît Sagot (Alpage, INRIA & Univ. Paris 7),
  Proceedings of the 50th Annual Meeting of the Association for
  Computational Linguistics, pages 383–387.
  [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for current implementation:
- TODO fix normalization
- TODO extract longer ngrams (see paper above, a Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  import Gargantext.Text.Terms (extractTermsUnsupervised)
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ catMaybes
                             $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE TemplateHaskell #-}

module Gargantext.Text.Eleve where

import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_)
import Data.Ord (Ord)
import qualified Data.List as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)

type Entropy e =
  ( Fractional e
  , Floating e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy      :: e
  , _info_norm_entropy :: e
  }

instance Show e => Show (I e) where
  show (I e n) = show (e, n)

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e)

data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal    StartStop
  deriving (Ord, Eq, Show)

isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

toToken :: Int -> [Text] -> [Token]
toToken n xs = Terminal Start : (NonTerminal <$> xs) <> L.take n (repeat $ Terminal Stop)

unToken :: [Token] -> [Text]
unToken = map f
  where
    f (NonTerminal x) = x
    f (Terminal    _) = ""

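-- A small usage sketch added for illustration (not part of the original
-- module): `toToken n` frames a sentence with one Start marker and n Stop
-- markers, and `unToken` maps the markers back to empty strings.
_exToToken :: Bool
_exToToken =
     toToken 2 ["new", "york"]
  == [Terminal Start, NonTerminal "new", NonTerminal "york", Terminal Stop, Terminal Stop]
  && unToken (toToken 2 ["new", "york"]) == ["", "new", "york", "", ""]
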
------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count    :: Int }
  deriving (Show)

makeLenses ''Trie

insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

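-- A tiny sketch added for illustration (not part of the original module): two
-- sequences sharing the prefix 'a' end up under the same child node, and each
-- node's count records how many inserted sequences pass through it.  Here the
-- root and its 'a' child both carry count 2, while the 'b' and 'c' leaves
-- carry count 1.
_exTrie :: Trie Char ()
_exTrie = insertTries [['a','b'], ['a','c']]
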
-----------------------------

-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------

nan :: Floating e => e
nan = 0 / 0

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e

entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
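
-- A worked sketch added for clarity (not part of the original file): for each
-- non-terminal child the node gains -p * logBase 2 p with p = childCount /
-- nodeCount, while terminal children contribute p * logBase 2 nodeCount
-- instead.  A node of count 4 with two non-terminal children of count 2 each
-- therefore ends up with 1 bit of branching entropy:
_exEntropy :: Double
_exEntropy = _node_entropy $ nodeChild 'a' t  -- == 1.0
  where
    t :: Trie Char Double
    t = entropyTrie (const False) $ insertTries [['a','b'], ['a','c'], ['a','b'], ['a','c']]
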
------------------------------------------------------------------------

normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . filter (not . P.isNaN))

  where
    -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
    checkDiff = identity
    go []     = panic "normalizeLevel: impossible"
              -- trace "normalizeLevel"
    -- go [_] = identity
    go es     = \e -> (e - m) / v
{-
              in if P.isNaN e'
                 then trace ("normalizeLevel " <> show (e,m,v,es))
                      e
                 else e'
-}
      where
        m = mean es
        v = deviation es

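-- Added note (not in the original file): `normalizeLevel es e` standardizes
-- `e` against the other entropies `es` gathered at one trie depth, i.e.
-- (e - mean es) / deviation es after dropping NaN values; this is presumably
-- the step that the header's "TODO fix normalization" refers to.
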
{- Unused

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

-}

class IsTrie trie where
  buildTrie        :: Floating e => [[Token]] -> trie Token e
  nodeEntropy      :: Floating e => Getting e i e -> trie k i -> e
  nodeChild        :: Ord k => k -> trie k e -> trie k e
  findTrie         :: Ord k => [k] -> trie k e -> trie k e
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

  nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
  nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t

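-- Usage sketch added for illustration (not part of the original module): the
-- autonomy of a sequence is the entropy read (through `inE`) at the node
-- reached by walking the trie along that sequence.  With the toy corpus below
-- the node after "new york" has two equally frequent continuations ("is" and
-- the Stop marker), so the raw value should come out as 1 bit.
_exAutonomy :: Double
_exAutonomy = nodeAutonomy identity t [NonTerminal "new", NonTerminal "york"]
  where
    t :: Trie Token Double
    t = buildTrie $ L.concat $ chunkAlong 3 1 <$> [toToken 2 (T.words "new york is new york")]
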
instance IsTrie Trie where
  buildTrie = entropyTrie isTerminal . insertTries

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
                                 nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
    where
      go _ []         _                   = panic "normalizeEntropy: empty levels"
      go _ _          (Leaf c)            = Leaf c
      go _ ([] : _)   _                   = panic "normalizeEntropy: empty level"
      go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children


{-
This is only normalizing a node with respect to its siblings (rather than
against all the nodes of the same level).

normalizeEntropy inE modE = go $ modE identity
  where
    go _ (Leaf c) = Leaf c
    go f (Node c i children)
      | Map.null children =
          panic "normalizeEntropy: impossible"
      | otherwise =
          Node c (f i) $ go (modE $ normalizeLevel es) <$> children
      where
        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-}
------------------------------------------------------------------------

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
  where
    subForest :: Trie k e -> [Trie k e]
    subForest (Leaf _)            = []
    subForest (Node _ _ children) = Map.elems children

entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
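
-- Added note: `levels` slices the trie breadth-first (the root, then its
-- children, then grandchildren, ...), and `entropyLevels` keeps, for each
-- depth, the entropies of the nodes at that depth (NaN leaves filtered out).
-- These per-level lists are what `normalizeLevel` standardizes against.  Note
-- that `normalizeEntropy` above consumes them shifted by one: nodes at depth
-- d+1 are normalized against the entropies of depth d, which may be the
-- off-by-one behind the "TODO fix normalization" in the module header.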

------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

instance IsTrie Tries where
  buildTrie tts = Tries { _fwd = buildTrie tts
                        , _bwd = buildTrie (reverse <$> tts)
                        }

  nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]

  findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)

  nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
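
-- Added note: `Tries` pairs a forward trie with a trie built over the reversed
-- sequences, so on a `Tries` the entropy of a node is the mean of its forward
-- (right) and backward (left) branching entropies, and `findTrie` walks the
-- backward trie with the reversed key.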

------------------------------------------------------------------------
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _  []                   = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0)             = go (nodeChild x0 t0) [x0] xs0
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []                  = [reverse pref]
    go _ pref (Terminal Stop:_)   = [reverse pref]
    go t pref (Terminal Start:xs) = go t pref xs
    go t pref (x:xs) =
      -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
      if acc
        then go xt (x:pref) xs
        else consRev pref $ go xt0 [x] xs
      where
        xt   = nodeChild x t
        xt0  = nodeChild x t0
        et   = ne 0 t
        -- ^ entropy of the current prefix
        ext0 = ne 0 xt0
        -- ^ entropy of [x]
        ext  = ne 0 xt
        -- ^ entropy of the current prefix plus x
        acc  = ext > et + ext0
        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

    ne d t = if P.isNaN e then d else e
      where e = nodeEntropy inE t

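-- Added note on the accumulation rule above: scanning left to right, the
-- current prefix is extended with the next token x whenever the entropy (read
-- through inE) of prefix ++ [x] exceeds the entropy of the prefix plus the
-- entropy of [x] alone; otherwise the prefix is emitted as a segment and a
-- new one starts at x.  NaN entropies fall back to the default 0 via `ne`.
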
{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------
------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve _ _ = []
{-
mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken (n - 1) <$> input
    t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    -- NP: here we use the entropy to split
    -- instead we should use either:
    --   info_norm_entropy or info_norm_entropy'
    -- However they should first be fixed.
-}

testEleve :: Bool -> Int -> [Text] -> IO Bool
testEleve debug n output = do
  let
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken (n - 1) <$> input
    t        = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    -- nt = normalizeEntropy identity setNormEntropy (fwd :: Trie Token Double)
    -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    nt       = normalizeEntropy identity setNormEntropy
                                (t :: Trie Token Double)
    {-
    pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
          | ps <- L.nub $ [ c
                          | m <- [1..n]
                          , cs <- chunkAlong m 1 <$> inp
                          , c <- cs
                          ]
          ]
    -}
    --res = map unToken . split identity fwd <$> inp
    --res = map unToken . split info_norm_entropy' nt' <$> inp
    res = map unToken . split info_norm_entropy nt <$> inp
  when debug $ do
    P.putStrLn (show input)
    -- mapM_ (P.putStrLn . show) pss
    P.putStrLn ""
    printTrie nt
    {-
    printTrie (_fwd nt)
    printTrie (_bwd nt)
    -}
    P.putStrLn $ show res
  pure $ expected == res

  where
    printTrie =
      P.putStrLn . Tree.drawTree
                 . fmap show
                 . toTree (NonTerminal "")

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- > TEST: Should not have York New in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
           ,"le-petit chien"
           ,"le-petit rat"
           ,"le gros rat"
           ]

runTests :: IO ()
runTests =
  forM_
    [("example0", 2, example0)
    ,("example1", 2, example1)
    ,("example2", 3, example2)
    ,("example3", 2, example3)
    ,("example4", 4, example4)
    ,("example5", 5, example5)
    ]
    (\(name, n, ex) -> do
      b <- testEleve False n ex
      P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
    )
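
-- Usage sketch (not in the original file): from GHCi one can run the whole
-- suite, or a single example with debug output enabled to print the
-- normalized trie and the resulting segmentations:
--
-- > runTests
-- > testEleve True 2 example0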