{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervized Word segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

# Implementation of Unsupervized Word Segmentation

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese, Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics,
  pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
Notes for current implementation:

- TODO fix normalization
- TODO extract longer ngrams (see paper above, viterbi algo can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test
  from Gargantext.Text.Terms import extractTermsUnsupervised
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                        $ catMaybes
                        $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE ConstraintKinds   #-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}

module Gargantext.Text.Eleve where
import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_)
import qualified Data.List as L
import Data.Map (Map)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, isNaN, RealFloat)

type Entropy e =
  ( Fractional e
  , Floating   e
  , P.RealFloat e
  , Show e
  -- ^ TODO: only used for debugging
  )
------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy      :: e
  , _info_norm_entropy :: e
  }

instance Show e => Show (I e) where
  show (I e n) = show (e, n)

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e)
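-- A hedged usage sketch: 'setNormEntropy' is the 'ModEntropy' passed to
-- 'normalizeEntropy' below; it pairs a raw entropy with its normalized image.
--
-- >>> setNormEntropy (/ 2) (3.0 :: Double)
-- (3.0,1.5)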
data StartStop = Start | Stop
  deriving (Ord, Eq, Show)

data Token = NonTerminal Text
           | Terminal StartStop
  deriving (Ord, Eq, Show)
isTerminal :: Token -> Bool
isTerminal (Terminal    _) = True
isTerminal (NonTerminal _) = False

toToken :: Int -> [Text] -> [Token]
toToken n xs = Terminal Start : (NonTerminal <$> xs) <> L.take n (repeat $ Terminal Stop)

unToken :: [Token] -> [Text]
unToken = map f
  where
    f (NonTerminal x) = x
    f (Terminal    _) = ""
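-- Hedged examples of the token round-trip: 'toToken' pads with n Stop
-- sentinels so n-gram windows can see the sentence end, and 'unToken'
-- erases sentinels to empty strings.
--
-- >>> toToken 2 ["new", "york"]
-- [Terminal Start,NonTerminal "new",NonTerminal "york",Terminal Stop,Terminal Stop]
--
-- >>> unToken (toToken 2 ["new", "york"])
-- ["","new","york","",""]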
------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count    :: Int }
  deriving (Show)

makeLenses ''Trie
insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c + 1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie
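-- A small worked example (illustrative): inserting two 2-grams that share
-- their first element yields a root of count 2 with one shared branch:
--
--   insertTries [["a","b"], ["a","c"]]
--     ~ Node 2 () (Map.fromList
--         [("a", Node 2 () (Map.fromList [("b", Leaf 1), ("c", Leaf 1)]))])
--
-- The count of a node records how many inserted sequences pass through it.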
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children
-----------------------------

-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
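-- Usage sketch: a trie can be pretty-printed by converting it to a rose
-- tree, e.g.
--   P.putStrLn . Tree.drawTree . fmap show . toTree (NonTerminal "")
-- (this is how 'testEleve' below renders the normalized trie).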
------------------------------------------------------------------------
------------------------------------------------------------------------

nan :: Floating e => e
nan = 0 / 0

updateIfDefined :: P.RealFloat e => e -> e -> e
updateIfDefined e0 e | P.isNaN e = e0
                     | otherwise = e
entropyTrie :: Floating e => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c () children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
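-- Worked example (illustrative): at a node of count 4 whose non-terminal
-- children have counts 2, 1 and 1, the branching entropy is
--   - (2/4) * logBase 2 (2/4) - (1/4) * logBase 2 (1/4) - (1/4) * logBase 2 (1/4)
--   = 0.5 + 0.5 + 0.5 = 1.5 bits.
-- A Terminal child contributes `chc * logBase 2 c` (positively) instead,
-- rewarding positions frequently followed by a sentence boundary.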
------------------------------------------------------------------------
normalizeLevel :: Entropy e => [e] -> e -> e
normalizeLevel = checkDiff (go . filter (not . P.isNaN))
  where
    -- checkDiff f es e = let e' = f es e in if e == e' then e' else trace ("normalizeLevel: diff " <> show e <> " " <> show e') e'
    checkDiff = identity
    -- trace "normalizeLevel"
    go [] = panic "normalizeLevel: impossible"
    go es = \e -> (e - m) / v
    {- debugging variant kept for reference:
         let e' = (e - m) / v
         in if P.isNaN e'
              then trace ("normalizeLevel " <> show (e,m,v,es)) e
              else e'
    -}
      where
        m = mean      es
        v = deviation es
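-- Worked example (illustrative, assuming `mean` and `deviation` from
-- Gargantext.Prelude are the arithmetic mean and standard deviation): for
-- sibling entropies [1.0, 2.0, 3.0], m = 2.0 and v ≈ 0.816, so 3.0 is
-- mapped to (3.0 - 2.0) / 0.816 ≈ 1.22, i.e. a z-score within its level.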
nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty
------------------------------------------------------------------------

class IsTrie trie where
  buildTrie   :: Floating e => [[Token]] -> trie Token e
  nodeEntropy :: Floating e => Getting e i e -> trie k i -> e
  nodeChild   :: Ord k =>  k  -> trie k e -> trie k e
  findTrie    :: Ord k => [k] -> trie k e -> trie k e
  normalizeEntropy :: Entropy e
                   => Getting e i e -> ModEntropy i o e
                   -> trie k i -> trie k o

  nodeAutonomy :: (Ord k, Entropy e) => Getting e i e -> trie k i -> [k] -> e
  nodeAutonomy inE t ks = nodeEntropy inE $ findTrie ks t
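-- Usage sketch: with a trie `t` built from a corpus,
--   nodeAutonomy identity t (NonTerminal <$> ["new", "york"])
-- walks the path new → york and reads that node's entropy through the
-- supplied getter (here `identity`, i.e. the raw entropy).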
instance IsTrie Trie where
  buildTrie = entropyTrie isTerminal . insertTries

  nodeEntropy inE (Node _ e _) = e ^. inE
  nodeEntropy _   (Leaf _)     = -- trace "nodeEntropy of Leaf" $
                                 nan

  nodeChild k (Node _ _ cs) = fromMaybe emptyTrie (Map.lookup k cs)
  nodeChild _ (Leaf _)      = emptyTrie

  findTrie ks t = L.foldl (flip nodeChild) t ks

  normalizeEntropy inE modE t = go (modE identity) (entropyLevels inE t) t
    where
      go _ []         _        = panic "normalizeEntropy' empty levels"
      go _ _          (Leaf c) = Leaf c
      go _ ([] : _)   _        = panic "normalizeEntropy': empty level"
      go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel es) ess <$> children
{-
  This only normalizes a node with respect to its siblings (rather than all
  the nodes of the same level).

  normalizeEntropy inE modE = go $ modE identity
    where
      go _ (Leaf c) = Leaf c
      go f (Node c i children)
        | Map.null children =
            panic "normalizeEntropy: impossible"
        | otherwise =
            Node c (f i) $ go (modE $ normalizeLevel es) <$> children
        where
          es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
-}
241 ------------------------------------------------------------------------
243 levels :: Trie k e -> [[Trie k e]]
244 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
246 subForest :: Trie k e -> [Trie k e]
247 subForest (Leaf _) = []
248 subForest (Node _ _ children) = Map.elems children
250 entropyLevels :: Entropy e => Getting e i e -> Trie k i -> [[e]]
251 entropyLevels inE = fmap (filter (not . P.isNaN) . map (nodeEntropy inE)) . levels
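-- For instance (sketch): on the small trie from the 'insertTries' example
-- above, 'levels' yields [[root], [node "a"], [leaf "b", leaf "c"]], and
-- 'entropyLevels identity' keeps, per depth, only the defined (non-NaN)
-- entropies — exactly what the level-wise normalization above consumes.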
------------------------------------------------------------------------

data Tries k e = Tries
  { _fwd :: Trie k e
  , _bwd :: Trie k e
  }

instance IsTrie Tries where
  buildTrie tts = Tries { _fwd = buildTrie tts
                        , _bwd = buildTrie (reverse <$> tts)
                        }

  nodeEntropy inE (Tries fwd bwd) = mean [nodeEntropy inE fwd, nodeEntropy inE bwd]

  findTrie ks (Tries fwd bwd) = Tries (findTrie ks fwd) (findTrie (reverse ks) bwd)

  nodeChild k (Tries fwd bwd) = Tries (nodeChild k fwd) (nodeChild k bwd)

  normalizeEntropy inE modE = onTries (normalizeEntropy inE modE)

onTries :: (Trie k i -> Trie k o) -> Tries k i -> Tries k o
onTries f (Tries fwd bwd) = Tries (f fwd) (f bwd)
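-- Design note: the backward trie indexes every sequence reversed, so it
-- measures the branching entropy of left contexts as well; 'nodeEntropy' on
-- 'Tries' then averages the forward and backward values, in the spirit of
-- the EleVe reference implementation linked in the header.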
------------------------------------------------------------------------
split :: (IsTrie trie, Entropy e) => Lens' i e -> trie Token i -> [Token] -> [[Token]]
split _   _  []                   = []
split inE t0 (Terminal Start:xs0) = split inE (nodeChild (Terminal Start) t0) xs0
split inE t0 (x0:xs0)             = go (nodeChild x0 t0) [x0] xs0
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []                  = [reverse pref]
    go _ pref (Terminal Stop:_)   = [reverse pref]
    go t pref (Terminal Start:xs) = go t pref xs
    go t pref (x:xs) =
        -- trace (show (if acc then "ACC" else "CUT", (reverse (x : pref), ext), if acc then ">" else "<=", ((reverse pref, et), "+", ([x], ext0)))) $
        if acc
          then go xt (x:pref) xs
          else consRev pref $ go xt0 [x] xs
      where
        xt   = nodeChild x t
        xt0  = nodeChild x t0
        et   = ne 0 t
        -- ^ entropy of the current prefix
        ext0 = ne 0 xt0
        -- ^ entropy of [x]
        ext  = ne 0 xt
        -- ^ entropy of the current prefix plus x
        acc  = ext > et + ext0
        -- aut(["in","this","paper"]) > aut(["in","this"]) + aut(["paper"])

        ne d t = if P.isNaN e then d else e
          where e = nodeEntropy inE t
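-- A walk-through of the accumulation test (illustrative): scanning tokens
-- left to right, `x` is appended to the current prefix only when
-- aut(prefix ++ [x]) > aut(prefix) + aut([x]); otherwise the prefix is
-- emitted and the scan restarts at `x` from the root `t0`. On example0
-- below, "New" should absorb "York" (they always co-occur), while "York"
-- should not absorb "is".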
{-
split :: Entropy e => Lens' i e -> Tries Token i -> [Token] -> [[Token]]
split inE t0 ts =
  maximumWith (sum . map $ nodeAutonomy inE t0) (all the splits of ts)
-}

------------------------------------------------------------------------
------------------------------------------------------------------------
mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken (n - 1) <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    -- NP: here we use the raw entropy to split;
    -- instead we should use either
    --   info_norm_entropy or info_norm_entropy'.
    -- However they should first be fixed.
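-- Usage sketch (hedged): on the tokenized sentence of example0 one would
-- expect
--   mainEleve 2 [["New", "York", "is", "New", "York", "and", "New", "York"]]
-- to group "New York" together, roughly
--   [[["New","York"],["is"],["New","York"],["and"],["New","York"]]]
-- though the exact output depends on the normalization marked TODO above.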
testEleve :: Bool -> Int -> [Text] -> IO Bool
testEleve debug n output = do
  let
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken (n - 1) <$> input
    t        = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    -- nt = normalizeEntropy  identity setNormEntropy (fwd :: Trie Token Double)
    -- nt = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    nt = normalizeEntropy identity setNormEntropy
                          (t :: Trie Token Double)
{-
    pss = [ (ps, findTrie ps fwd ^? _Just . node_entropy) -- . info_entropy)
          | ps <- L.nub $ [ c
                          | m  <- [1 .. n]
                          , cs <- chunkAlong m 1 <$> inp
                          , c  <- cs
                          ]
          ]
-}
    --res = map unToken . split identity fwd <$> inp
    --res = map unToken . split info_norm_entropy' nt' <$> inp
    res = map unToken . split info_norm_entropy nt <$> inp
  when debug $ do
    P.putStrLn (show input)
    -- mapM_ (P.putStrLn . show) pss
    printTrie nt
    P.putStrLn $ show res
  pure $ expected == res
  where
    printTrie :: Show e => Trie Token e -> IO ()
    printTrie =
      P.putStrLn . Tree.drawTree
                 . fmap show
                 . toTree (NonTerminal "")
-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5, example6 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
       -- > TEST: should not have "York New" in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
example6 = ["le-petit chat"
           ,"le-petit chien"
           ,"le-petit rat"
           ,"le gros rat"
           ]
runTests :: IO ()
runTests =
  forM_
    [("example0", 2, example0)
    ,("example1", 2, example1)
    ,("example2", 3, example2)
    ,("example3", 2, example3)
    ,("example4", 4, example4)
    ,("example5", 5, example5)
    ]
    (\(name, n, ex) -> do
      b <- testEleve False n ex
      P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
    )