{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervised Word segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

# Implementation of Unsupervised Word Segmentation

References:

- EleVe Python implementation and discussions with Korantin August and Bruno Gaume
  [git repo](https://github.com/kodexlab/eleve.git)

- Pierre Magistry, Benoît Sagot (Alpage, INRIA & Univ. Paris 7).
  Unsupervized Word Segmentation: the case for Mandarin Chinese.
  Proceedings of the 50th Annual Meeting of the Association for
  Computational Linguistics, pages 383–387.
  [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for the current implementation:
- The node count is correct; TODO (AD) add tests to keep track of it
- NP: fix normalization
- NP: extract longer ngrams (see the paper above; the Viterbi algorithm can be used)
- TODO (AD) TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test:
  import Gargantext.Text.Terms (extractTermsUnsupervised)
  docs <- runCmdRepl $ selectDocs 1004
  extractTermsUnsupervised 3 $ DT.intercalate " "
                             $ catMaybes
                             $ Gargantext.map _hyperdataDocument_abstract docs
-}

{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}

module Gargantext.Text.Eleve where

-- import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), (%~), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_)
import qualified Data.List as L
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase)

------------------------------------------------------------------------
-- | Example and tests for development
data I e = I
  { _info_entropy       :: e
  , _info_norm_entropy  :: e
  , _info_norm_entropy' :: e
  }

instance Show e => Show (I e) where
  show (I e n n') = show (e, n, n')

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
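
-- `setNormEntropy` seeds an `I` record from the raw entropy `e`: the
-- normalized field gets `f e`, while `_info_norm_entropy'` keeps the raw `e`
-- as a placeholder to be overwritten later by normalizeEntropy' (see
-- testEleve below).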

data Token = NonTerminal Text
           | Terminal
  deriving (Ord, Eq, Show)

toToken :: Int -> [Text] -> [Token]
toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
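
-- e.g. toToken 2 ["new", "york"]
--      == [NonTerminal "new", NonTerminal "york", Terminal, Terminal]
-- The trailing Terminal markers encode the sentence end, which entropyTrie
-- later scores as an end-of-sequence event.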

unToken :: [Token] -> [Text]
unToken = map f
  where
    f (NonTerminal x) = x
    f Terminal        = ""

------------------------------------------------------------------------

data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count    :: Int }
  deriving (Show)

makeLenses ''Trie

insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie
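
-- A minimal sketch, related to the count-invariant TODO in the module header
-- (hypothetical helper, not part of the original API). The note's literal
-- property is `prop (Node c _e f) = c == Map.size f`; what `entropyTrie`
-- actually relies on is the weaker mass invariant checked below: an inner
-- node's count equals the sum of its children's counts, so that the child
-- ratios `chc` sum to 1.
propNodeCount :: Trie k e -> Bool
propNodeCount (Leaf _)      = True
propNodeCount (Node c _e f) =
  c == sum (_node_count <$> Map.elems f) && all propNodeCount (Map.elems f)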

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

-----------------------------

-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------

entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
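
-- For an inner node with count c, the sum above is the branching entropy
--   H = - sum_k p_k * log2 p_k,   with p_k = _node_count child_k / c,
-- except that a Terminal child contributes `chc * log2 c`: each of its
-- occurrences is treated as a distinct continuation of probability 1/c
-- (and -(1/c) * log2 (1/c) = (1/c) * log2 c), presumably following the
-- EleVe reference implementation's handling of sentence ends.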

normalizeEntropy :: (Fractional e, Floating e, Show e)
                 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy inE modE = go $ modE identity
  where
    go _ (Leaf c) = Leaf c
    go f (Node c i children)
      | Map.null children =
          panic "normalizeEntropy: impossible"
      | otherwise         =
          -- trace (show $ L.length es) $
          Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
      where
        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
        m  = mean      es
        v  = deviation es

------------------------------------------------------------------------

normalizeLevel :: (Fractional e, Floating e, Show e)
               => e -> e -> e -> e
normalizeLevel m v e = (e - m) / v
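
-- Plain z-score standardization: an entropy `e` is re-expressed relative to
-- a group mean `m` and standard deviation `v` (`mean`/`deviation` from
-- Gargantext.Prelude). Callers pass either a node's children
-- (normalizeEntropy) or a whole trie level (normalizeEntropy'), so that
-- entropies at different depths become comparable.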

buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie = entropyTrie (== Terminal) . insertTries

nodeEntropy :: Trie k e -> Maybe e
nodeEntropy (Node _ e _) = Just e
nodeEntropy (Leaf _)     = Nothing

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
nodeChild k (Node _ _ cs) = Map.lookup k cs
nodeChild _ (Leaf _)      = Nothing

findTrie :: Ord k => [k] -> Trie k e -> Maybe (Trie k e)
findTrie ks t = foldM (flip nodeChild) t ks

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
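
-- `levels` enumerates the trie breadth-first ([[root], depth-1 nodes, ...]),
-- stopping at the first empty level; normalizeEntropy' uses it (via
-- entropyLevels) to compute per-depth statistics.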

subForest :: Trie k e -> [Trie k e]
subForest (Leaf _)            = []
subForest (Node _ _ children) = Map.elems children

entropyLevels :: Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels

-- fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
-- fwd inE outE s = s & outE .~ (s ^. inE)

normalizeEntropy' :: (Fractional e, Floating e, Show e)
                  => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
  where
    go _ []         _        = panic "normalizeEntropy': empty levels"
    go _ _          (Leaf c) = Leaf c
    go _ ([] : _)   _        = panic "normalizeEntropy': empty level"
    go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
      where
        m  = mean      es
        v  = deviation es

------------------------------------------------------------------------
------------------------------------------------------------------------
split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
split inE t0 = go t0 []
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []           = [reverse pref]
    go _ pref (Terminal:_) = [reverse pref]
    go t pref (x:xs) = case nodeChild x t of
      Nothing -> consRev pref $ go t0 [x] xs
      Just xt -> case nodeChild x t0 of
        Nothing  -> panic "TODO"
        Just xt0 ->
          let et   = ne (panic "t")   t
                     -- ^ entropy of the current prefix
              ext0 = ne (panic "xt0") xt0
                     -- ^ entropy of [x]
              ext  = ne 0 xt
                     -- ^ entropy of the current prefix plus x
          in
          -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
          if ext + ext0 > et
            then go xt (x:pref) xs
            else consRev pref $ go xt0 [x] xs

    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
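
-- The test above is the autonomy criterion used for segmentation: keep
-- extending the current segment with `x` only while the entropy of [x] alone
-- (ext0) plus the entropy of the prefix extended by x (ext) exceeds the
-- entropy of the prefix (et); otherwise a boundary is emitted before `x` and
-- a new segment starts at [x].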

------------------------------------------------------------------------
------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken (n - 1) <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    -- NP: here we use the raw entropy to split;
    -- we should instead use info_norm_entropy or info_norm_entropy',
    -- but both normalizations should first be fixed.
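
-- Example use from GHCi (expected output assuming the example0 test below
-- passes, since testEleve builds and splits the trie the same way):
--   mainEleve 2 [T.words "New York is New York and New York"]
--   == [[["New","York"],["is"],["New","York"],["and"],["New","York"]]]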

testEleve :: Bool -> Int -> [Text] -> IO Bool
testEleve debug n output = do
  let
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken (n - 1) <$> input
    t        = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    nt       = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
    nt'      = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    pss      = [ (ps, findTrie ps t ^? _Just . node_entropy) -- . info_entropy)
               | ps <- L.nub [ c
                             | m  <- [1 .. n]
                             , cs <- chunkAlong m 1 <$> inp
                             , c  <- cs
                             ]
               ]
    res = map unToken . split identity t <$> inp
  when debug $ do
    P.putStrLn (show input)
    mapM_ (P.putStrLn . show) pss
    P.putStrLn $ Tree.drawTree
               $ fmap show
               $ toTree (NonTerminal "") nt'
    P.putStrLn $ show res
  pure $ expected == res

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0
-- TEST: should not have "York New" in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]

-- | Run the examples above as a tiny test suite.
runTests :: IO ()
runTests =
  forM_
    [ ("example0", 2, example0)
    , ("example1", 2, example1)
    , ("example2", 3, example2)
    , ("example3", 2, example3)
    , ("example4", 4, example4)
    , ("example5", 5, example5)
    ]
    (\(name, n, ex) -> do
      b <- testEleve False n ex
      P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL")