{-|
Module      : Gargantext.Text.Eleve
Description : Unsupervised Word Segmentation
Copyright   : (c) CNRS, 2019-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

# Implementation of Unsupervised Word Segmentation

References:

- Python implementation (Korantin August, Emmanuel Navarro):
  [EleVe](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese. Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics,
  pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for current implementation:

- TODO fix normalization
- TODO extract longer ngrams (see paper above; the Viterbi algorithm can be used)
- TODO AD TEST: prop (Node c _e f) = c == Map.size f

- AD: Real ngrams extraction test:

    import Gargantext.Text.Terms (extractTermsUnsupervised)
    docs <- runCmdRepl $ selectDocs 1004
    extractTermsUnsupervised 3 $ DT.intercalate " "
                               $ catMaybes
                               $ Gargantext.map _hyperdataDocument_abstract docs

-}
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}

module Gargantext.Text.Eleve where

-- import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens', Getting, (^.), (^?), (%~), view, makeLenses, _Just)
import Control.Monad (foldM, mapM_, forM_)
import qualified Data.List as L
import Data.Map (Map)
import qualified Data.Map as Map
import Data.Maybe (fromMaybe, catMaybes)
import Data.Text (Text)
import qualified Data.Text as T
import Data.Tree (Tree)
import qualified Data.Tree as Tree
import Gargantext.Prelude hiding (cs)
import qualified Prelude as P (putStrLn, logBase)

------------------------------------------------------------------------
-- | Example and tests for development
data I e = I { _info_entropy       :: e
             , _info_norm_entropy  :: e
             , _info_norm_entropy' :: e
             }

instance Show e => Show (I e) where
  show (I e n n') = show (e, n, n')

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
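
-- An I e packs the raw entropy together with its two normalized variants;
-- a ModEntropy i o e tells how to rebuild an annotation o from i once a
-- transformation on the entropy (e -> e) is chosen. setNormEntropy seeds
-- the normalized fields from the raw entropy before normalization runs.
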
data Token = NonTerminal Text
           | Terminal
  deriving (Ord, Eq, Show)

toToken :: Int -> [Text] -> [Token]
toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)

unToken :: [Token] -> [Text]
unToken = map f
  where f (NonTerminal x) = x
        f Terminal        = ""
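
-- For instance: toToken 1 ["new", "york"] is
-- [NonTerminal "new", NonTerminal "york", Terminal]. The trailing Terminal
-- markers let the entropy computation treat "end of sequence" as a
-- continuation; unToken maps tokens back to their text (Terminal becomes
-- the empty string).
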
------------------------------------------------------------------------
data Trie k e = Node { _node_count    :: Int
                     , _node_entropy  :: e
                     , _node_children :: Map k (Trie k e)
                     }
              | Leaf { _node_count :: Int }
  deriving (Show)

makeLenses ''Trie

insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie
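
-- Illustration: insertTries [["a","b"], ["a","c"]] yields a root Node with
-- count 2, one child "a" (count 2), and grandchildren "b" and "c" (count 1
-- each); _node_count is thus the number of inserted sequences whose prefix
-- reaches that node.
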
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children

-----------------------------

-- | Trie to Tree, since Tree has a nice print function
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)

------------------------------------------------------------------------
------------------------------------------------------------------------

entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
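
-- Reading of the formula above: with p = chc a child's relative frequency,
-- each regular child contributes -p * logBase 2 p, so a node's annotation is
-- the branching entropy H = - sum_x p(x) * log2 p(x) of its continuations;
-- children matching pred (the Terminal marker) get a count-based term instead.
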
normalizeEntropy :: (Fractional e, Floating e, Show e)
                 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy inE modE = go $ modE identity
  where
    go _ (Leaf c) = Leaf c
    go f (Node c i children)
      | Map.null children =
          panic "normalizeEntropy: impossible"
      | otherwise         =
          -- trace (show $ L.length es) $
          Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
      where
        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
        m  = mean      es
        v  = deviation es

------------------------------------------------------------------------

normalizeLevel :: (Fractional e, Floating e, Show e)
               => e -> e -> e -> e
normalizeLevel m v e = (e - m) / v
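
-- In other words a z-score: each entropy e is centered on its level's mean m
-- and scaled by the deviation v (mean and deviation come from
-- Gargantext.Prelude).
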
buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie = entropyTrie (== Terminal) . insertTries

nodeEntropy :: Trie k e -> Maybe e
nodeEntropy (Node _ e _) = Just e
nodeEntropy (Leaf _)     = Nothing

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
nodeChild k (Node _ _ cs) = Map.lookup k cs
nodeChild _ (Leaf _)      = Nothing

findTrie :: Ord k => [k] -> Trie k e -> Maybe (Trie k e)
findTrie ks t = foldM (flip nodeChild) t ks
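
-- findTrie walks one key at a time, e.g. findTrie [NonTerminal "New",
-- NonTerminal "York"] t is the subtrie reached by that prefix (Nothing if
-- any step is missing), foldM short-circuiting in Maybe.
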
levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure

subForest :: Trie k e -> [Trie k e]
subForest (Leaf _)            = []
subForest (Node _ _ children) = Map.elems children

entropyLevels :: Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
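
-- levels groups nodes breadth-first (one list per depth), so entropyLevels
-- yields the entropies of each trie level; normalizeEntropy' below uses the
-- per-level mean and deviation computed from these lists.
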
--fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
--fwd inE outE s = s & outE .~ (s ^. inE)

normalizeEntropy' :: (Fractional e, Floating e, Show e)
                  => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
  where
    go _ []         _                   = panic "normalizeEntropy': empty levels"
    go _ _          (Leaf c)            = Leaf c
    go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
    go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
      where
        m = mean      es
        v = deviation es

------------------------------------------------------------------------
------------------------------------------------------------------------
split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
split inE t0 = go t0 []
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []           = [reverse pref]
    go _ pref (Terminal:_) = [reverse pref]
    go t pref (x:xs) = case nodeChild x t of
      Nothing -> consRev pref $ go t0 [x] xs
      Just xt -> case nodeChild x t0 of
        Nothing  -> panic "TODO"
        Just xt0 ->
          let et   = ne (panic "t")   t
              -- ^ entropy of the current prefix
              ext0 = ne (panic "xt0") xt0
              -- ^ entropy of [x]
              ext  = ne 0 xt
              -- ^ entropy of the current prefix extended by x
          in
          -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
          if ext + ext0 > et + et
            then go xt  (x:pref) xs
            else consRev pref $ go xt0 [x] xs

    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
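
-- Boundary criterion: x is appended to the current segment only when the
-- entropy of the extended prefix (ext) plus the entropy of x alone (ext0)
-- exceeds twice the entropy of the current prefix (et + et); otherwise the
-- segment is closed and a fresh one starts at x.
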
------------------------------------------------------------------------
------------------------------------------------------------------------

mainEleve :: Int -> [[Text]] -> [[[Text]]]
mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
  where
    inp = toToken (n - 1) <$> input
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    -- NP: here we use the entropy to split.
    -- Instead we should use either info_norm_entropy or info_norm_entropy'.
    -- However they should first be fixed.
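
-- Usage sketch (GHCi, mirroring example0 below):
--   mainEleve 2 [T.words "New York is New York and New York"]
-- is expected to regroup the recurring bigram, i.e.
--   [[["New","York"],["is"],["New","York"],["and"],["New","York"]]]
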
testEleve :: Bool -> Int -> [Text] -> IO Bool
testEleve debug n output = do
  let
    out      = T.words <$> output
    expected = fmap (T.splitOn "-") <$> out
    input    = (T.splitOn "-" =<<) <$> out
    inp      = toToken (n - 1) <$> input
    t        = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
    nt       = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
    nt'      = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
    pss      = [ (ps, findTrie ps t ^? _Just . node_entropy) -- . info_entropy)
               | ps <- L.nub [ c | m  <- [1 .. n]
                                 , cs <- chunkAlong m 1 <$> inp
                                 , c  <- cs
                                 ]
               ]
    res = map unToken . split identity t <$> inp
  when debug $ do
    P.putStrLn (show input)
    mapM_ (P.putStrLn . show) pss
    P.putStrLn $ Tree.drawTree
               $ fmap show
               $ toTree (NonTerminal "") nt'
    P.putStrLn $ show res
  pure $ expected == res

-- | TODO real data is a list of tokenized sentences
example0, example1, example2, example3, example4, example5 :: [Text]
example0 = ["New-York is New-York and New-York"]
example1 = ["to-be or not to-be"]
example2 = ["to-be-or not to-be-or NOT to-be and"]
example3 = example0 <> example0 -- TEST: should not have "York New" in the trie
example4 = ["a-b-c-d e a-b-c-d f"]
example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
290 [("example0", 2, example0)
291 ,("example1", 2, example1)
292 ,("example2", 3, example2)
293 ,("example3", 2, example3)
294 ,("example4", 4, example4)
295 ,("example5", 5, example5)
297 (\(name, n, ex) -> do
298 b <- testEleve False n ex
299 P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
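
-- Each example pairs an ngram order n with dash-joined expected segments;
-- runTests prints "<name> <n> PASS" when split recovers exactly the
-- dash-marked segmentation, "FAIL" otherwise.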