{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-|
# Implementation of Unsupervized Word Segmentation

References:

- EleVe Python implementation and discussions with Korantin August and Bruno Gaume
  [git repo](https://github.com/kodexlab/eleve.git)

- Unsupervized Word Segmentation: the case for Mandarin Chinese. Pierre
  Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7. Proceedings of
  the 50th Annual Meeting of the Association for Computational Linguistics,
  pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)

Notes for the current implementation:
- The node count is correct; TODO: add tests to keep track of it.
- NP: fix the normalization.
- NP: extract longer ngrams (see the paper above; the Viterbi algorithm can be used).
- TODO TEST: prop (Node c _e f) = c == Map.size f

- AD: real ngrams extraction test:
    from Gargantext.Text.Terms import extractTermsUnsupervised
    docs <- runCmdRepl $ selectDocs 1004
    extractTermsUnsupervised 3 $ DT.intercalate " "
                               $ Gargantext.map _hyperdataDocument_abstract docs
-}
module Gargantext.Text.Eleve where

import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens, Lens', ASetter, Getting, (^.), (^?), (&), (.~), (%~), view, makeLenses, _Just)
import Control.Monad (foldM)

import qualified Data.List as L

import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, String)
------------------------------------------------------------------------
-- | Entropy information attached to a trie node: the raw entropy plus two
-- normalized versions of it.
data I e = I { _info_entropy       :: e
             , _info_norm_entropy  :: e
             , _info_norm_entropy' :: e
             }

instance Show e => Show (I e) where
  show (I e n n') = show (e, n, n')

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
testEleve n example = do
  let
    ex  = toToken n <$> example
    t   = buildTrie $ L.concat $ chunkAlong n 1 <$> ex
    nt  = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
    nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
  P.putStrLn $ Tree.drawTree
             $ fmap show
             $ toTree (NonTerminal "") nt'
  pure $ map unToken $ split info_entropy nt' $ L.concat ex
  -- NP: here we use the raw entropy to split;
  -- instead we should use either info_norm_entropy or info_norm_entropy',
  -- but those first have to be fixed.
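-- Illustrative REPL check of the pipeline above (exact output not shown here):
--   λ> testEleve 2 example1
-- prints the annotated trie via 'Tree.drawTree' and returns the candidate
-- segmentation of "to be or not to be".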
-- | TODO real data is a list of tokenized sentences
example0 = [T.words "New York is New York and New York"]
example1 = [T.words "to be or not to be"]
example2 = [T.words "to be or not to be or"]
example3 = example0 <> example0 -- TEST: should not have "York New" in the trie
example4 = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
------------------------------------------------------------------------
------------------------------------------------------------------------
data Token = NonTerminal Text
           | Terminal
  deriving (Ord, Eq, Show)

toToken :: Int -> [Text] -> [Token]
toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
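-- For instance, 'toToken' pads each sentence with n Terminal markers:
--   toToken 2 ["to","be"] == [NonTerminal "to", NonTerminal "be", Terminal, Terminal]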
unToken :: [Token] -> [Text]
unToken = map f
  where
    f (NonTerminal x) = x
    f Terminal        = ""
------------------------------------------------------------------------
data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count :: Int }
insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie
-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children
-----------------------------

-- | Convert a Trie to a Tree, since Tree has a nice drawing function.
toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e)  (map (uncurry toTree) $ Map.toList cs)
------------------------------------------------------------------------
------------------------------------------------------------------------
-- | Annotate each inner node with the branching entropy of its children.
-- The predicate identifies terminal tokens, which are handled specially.
entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
                             else - chc * P.logBase 2 chc
      where
        chc = fromIntegral (_node_count child) / fromIntegral c
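-- The sum above is the branching entropy of a node: with
-- p_c = count(child c) / count(node), each non-terminal child contributes
-- -p_c * log2 p_c, while Terminal children are special-cased by 'pred'.
-- A small standalone sketch of the plain entropy part (hypothetical helper,
-- not used elsewhere in this module):
childrenEntropy :: [Int] -> Double
childrenEntropy counts = negate $ sum [ p * P.logBase 2 p | k <- counts, let p = fromIntegral k / total, p > 0 ]
  where
    total = fromIntegral (sum counts) :: Double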
normalizeEntropy :: (Fractional e, Floating e, Show e)
                 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy inE modE = go $ modE identity
  where
    go _ (Leaf c) = Leaf c
    go f (Node c i children) | not (Map.null children) =
        -- trace (show $ L.length es) $
        Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
      where
        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
        m  = mean      es
        v  = deviation es
------------------------------------------------------------------------
-- | Normalize an entropy against the mean and standard deviation of its level.
normalizeLevel :: (Fractional e, Floating e, Show e)
               => e -> e -> e -> e
normalizeLevel m v e = (e - m) / v
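-- For instance, normalizeLevel 1.5 0.5 2.0 == 1.0: an entropy one standard
-- deviation above its level's mean is normalized to 1. A hypothetical helper
-- (assuming 'mean' and 'deviation' from Gargantext.Prelude, as used above)
-- making this explicit for a whole level of entropies:
zScoreLevel :: [Double] -> [Double]
zScoreLevel es = normalizeLevel (mean es) (deviation es) <$> es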
buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie = entropyTrie (== Terminal) . insertTries
nodeEntropy :: Trie k e -> Maybe e
nodeEntropy (Node _ e _) = Just e
nodeEntropy (Leaf _)     = Nothing

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
nodeChild k (Node _ _ cs) = Map.lookup k cs
nodeChild _ (Leaf _)      = Nothing

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure

subForest :: Trie k e -> [Trie k e]
subForest (Leaf _)            = []
subForest (Node _ _ children) = Map.elems children
entropyLevels :: Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
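-- 'levels' groups the trie nodes by depth ([[root], nodes at depth 1, ...]),
-- and 'entropyLevels' keeps only the entropies of the inner nodes at each
-- depth; 'normalizeEntropy'' below consumes these level by level.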
-- fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
-- fwd inE outE s = s & outE .~ (s ^. inE)
normalizeEntropy' :: (Fractional e, Floating e, Show e)
                  => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
  where
    go _ []         _        = panic "normalizeEntropy' empty levels"
    go _ _          (Leaf c) = Leaf c
    go _ ([] : _)   _        = panic "normalizeEntropy': empty level"
    go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
      where
        m = mean      es
        v = deviation es
------------------------------------------------------------------------
------------------------------------------------------------------------
split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
split inE t0 = go t0 []
  where
    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []     = [reverse pref]
    go t pref (x:xs) = case nodeChild x t of
      Nothing -> consRev pref $ go t0 [x] xs
      Just xt -> case nodeChild x t0 of
        Nothing  -> panic "TODO"
        Just xt0 ->
          let et   = ne (panic "t")   t
              ext0 = ne (panic "xt0") xt0
              ext  = ne 0 xt
          in
          -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
          case et {-+ ext0-} < ext of
            -- NP: here we should take ext0 into account as well; however, it
            -- is currently not used.
            -- For instance, this currently works well for 2-grams but not for
            -- longer ngrams.
            -- PASS: test 4 example1
            -- FAIL: test 4 example2
            True  -> go xt (x:pref) xs
            False -> consRev pref $ go xt0 [x] xs

    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
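-- A minimal end-to-end sketch mirroring 'testEleve' without the printing
-- ('mainEleve' is an illustrative name, not part of the original module):
-- build the trie, annotate it with entropies, and split a token stream back
-- into candidate words using the raw entropy lens. E.g. mainEleve 2 example1.
mainEleve :: Int -> [[Text]] -> [[Text]]
mainEleve n input = map unToken $ split info_entropy nt $ L.concat ex
  where
    ex = toToken n <$> input
    t  = buildTrie $ L.concat $ chunkAlong n 1 <$> ex
    nt = normalizeEntropy identity setNormEntropy (t :: Trie Token Double)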