]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
[DOC] references.
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13
14 - Python implementation (Korantin August, Emmanuel Navarro):
15 [EleVe](https://github.com/kodexlab/eleve.git)
16
17 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
18 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
19 the 50th Annual Meeting of the Association for Computational Linguistics
20 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
21
22 Notes for current implementation:
23 - TODO fix normalization
24 - TODO extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ catMaybes
32 $ Gargantext.map _hyperdataDocument_abstract docs
33
34 -}
35 {-# LANGUAGE NoImplicitPrelude #-}
36 {-# LANGUAGE OverloadedStrings #-}
37 {-# LANGUAGE RankNTypes #-}
38 {-# LANGUAGE TemplateHaskell #-}
39
40 module Gargantext.Text.Eleve where
41
42 -- import Debug.Trace (trace)
43 -- import Debug.SimpleReflect
44
45 import Control.Lens (Lens', Getting, (^.), (^?), (%~), view, makeLenses, _Just)
46 import Control.Monad (foldM, mapM_, forM_)
47 import Data.Ord (Ord)
48 import qualified Data.List as L
49 import Data.Monoid
50 import Data.Text (Text)
51 import qualified Data.Text as T
52 import Data.Map (Map)
53 import Data.Maybe (fromMaybe, catMaybes)
54 import qualified Data.Map as Map
55 import Gargantext.Prelude hiding (cs)
56 import qualified Data.Tree as Tree
57 import Data.Tree (Tree)
58 import qualified Prelude as P (putStrLn, logBase)
59
60 ------------------------------------------------------------------------
61 -- | Example and tests for development
62 data I e = I
63 { _info_entropy :: e
64 , _info_norm_entropy :: e
65 , _info_norm_entropy' :: e
66 }
67
68 instance Show e => Show (I e) where
69 show (I e n n') = show (e, n, n')
70
71 makeLenses ''I
72
73 type ModEntropy i o e = (e -> e) -> i -> o
74
75 setNormEntropy :: ModEntropy e (I e) e
76 setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
77
78 data Token = NonTerminal Text
79 | Terminal
80 deriving (Ord, Eq, Show)
81
82 toToken :: Int -> [Text] -> [Token]
83 toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
84
85 unToken :: [Token] -> [Text]
86 unToken = map f
87 where
88 f (NonTerminal x) = x
89 f Terminal = ""
90
91 ------------------------------------------------------------------------
92
93 data Trie k e
94 = Node { _node_count :: Int
95 , _node_entropy :: e
96 , _node_children :: Map k (Trie k e)
97 }
98 | Leaf { _node_count :: Int }
99 deriving (Show)
100
101 makeLenses ''Trie
102
103 insertTries :: Ord k => [[k]] -> Trie k ()
104 insertTries = L.foldr insertTrie emptyTrie
105
106 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
107 insertTrie [] n = n { _node_count = _node_count n +1}
108 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
109 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
110 where
111 f = Just . insertTrie xs . fromMaybe emptyTrie
112
113 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
114 -- emptyTrie = Node 0 mempty mempty
115 emptyTrie :: Trie k e
116 emptyTrie = Leaf 0
117
118 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
119 mkTrie c children
120 | Map.null children = Leaf c
121 | otherwise = Node c mempty children
122
123 -----------------------------
124
125 -- | Trie to Tree since Tree as nice print function
126 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
127 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
128 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
129
130 ------------------------------------------------------------------------
131 ------------------------------------------------------------------------
132
133 entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
134 entropyTrie _ (Leaf c) = Leaf c
135 entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
136 where
137 e = sum $ map f $ Map.toList children
138 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
139 else - chc * P.logBase 2 chc
140 where
141 chc = fromIntegral (_node_count child) / fromIntegral c
142
143 normalizeEntropy :: (Fractional e, Floating e, Show e)
144 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
145 normalizeEntropy inE modE = go $ modE identity
146 where
147 go _ (Leaf c) = Leaf c
148 go f (Node c i children)
149 | Map.null children =
150 panic "normalizeEntropy: impossible"
151 | otherwise =
152 -- trace (show $ L.length es) $
153 Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
154 where
155 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
156 m = mean es
157 v = deviation es
158 ------------------------------------------------------------------------
159
160 normalizeLevel :: (Fractional e, Floating e, Show e)
161 => e -> e -> e -> e
162 normalizeLevel m v e = (e - m) / v
163
164 buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
165 buildTrie = entropyTrie (== Terminal) . insertTries
166
167 nodeEntropy :: Trie k e -> Maybe e
168 nodeEntropy (Node _ e _) = Just e
169 nodeEntropy (Leaf _) = Nothing
170
171 nodeChildren :: Trie k e -> Map k (Trie k e)
172 nodeChildren (Node _ _ cs) = cs
173 nodeChildren (Leaf _) = Map.empty
174
175 nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
176 nodeChild k (Node _ _ cs) = Map.lookup k cs
177 nodeChild _ (Leaf _) = Nothing
178
179 findTrie :: Ord k => [k] -> Trie k e -> Maybe (Trie k e)
180 findTrie ks t = foldM (flip nodeChild) t ks
181
182 levels :: Trie k e -> [[Trie k e]]
183 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
184 where
185 subForest :: Trie k e -> [Trie k e]
186 subForest (Leaf _) = []
187 subForest (Node _ _ children) = Map.elems children
188
189 entropyLevels :: Getting e i e -> Trie k i -> [[e]]
190 entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
191
192 --fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
193 --fwd inE outE s = s & outE .~ (s ^. inE)
194
195 normalizeEntropy' :: (Fractional e, Floating e, Show e)
196 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
197 normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
198 where
199 go _ [] _ = panic "normalizeEntropy' empty levels"
200 go _ _ (Leaf c) = Leaf c
201 go _ ([] : _) _ = panic "normalizeEntropy': empty level"
202 go f (es : ess) (Node c i children) =
203 Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
204 where
205 m = mean es
206 v = deviation es
207
208 ------------------------------------------------------------------------
209 ------------------------------------------------------------------------
210 split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
211 split inE t0 = go t0 []
212 where
213 consRev [] xss = xss
214 consRev xs xss = reverse xs : xss
215
216 go _ pref [] = [reverse pref]
217 go _ pref (Terminal:_) = [reverse pref]
218 go t pref (x:xs) = case nodeChild x t of
219 Nothing -> consRev pref $ go t0 [x] xs
220 Just xt -> case nodeChild x t0 of
221 Nothing -> panic $ "TODO"
222 Just xt0 ->
223 let et = ne (panic "t") t
224 -- ^ entropy of the current prefix
225 ext0 = ne (panic "xt0") xt0
226 -- ^ entropy of [x]
227 ext = ne 0 xt
228 -- ^ entropy of the current prefix plus x
229 in
230 -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
231 if ext + ext0 > et
232 then go xt (x:pref) xs
233 else consRev pref $ go xt0 [x] xs
234
235 ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
236
237 ------------------------------------------------------------------------
238 ------------------------------------------------------------------------
239
240 mainEleve :: Int -> [[Text]] -> [[[Text]]]
241 mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
242 where
243 inp = toToken (n - 1) <$> input
244 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
245 -- NP: here we use the entropy to split
246 -- instead we should use either:
247 -- info_norm_entropy or info_norm_entropy'
248 -- However they should first be fixed.
249
250 testEleve :: Bool -> Int -> [Text] -> IO Bool
251 testEleve debug n output = do
252 let
253 out = T.words <$> output
254 expected = fmap (T.splitOn "-") <$> out
255 input = (T.splitOn "-" =<<) <$> out
256 inp = toToken (n - 1) <$> input
257 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
258 nt = normalizeEntropy identity setNormEntropy (t :: Trie Token Double)
259 nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
260 pss = [ (ps, findTrie ps t ^? _Just . node_entropy) -- . info_entropy)
261 | ps <- L.nub $ [ c
262 | m <- [1..n]
263 , cs <- chunkAlong m 1 <$> inp
264 , c <- cs
265 ]
266 ]
267 res = map unToken . split identity t <$> inp
268 when debug $ do
269 P.putStrLn (show input)
270 mapM_ (P.putStrLn . show) pss
271 P.putStrLn $ Tree.drawTree
272 $ fmap show
273 $ toTree (NonTerminal "") nt'
274 P.putStrLn $ show res
275 pure $ expected == res
276
277 -- | TODO real data is a list of tokenized sentences
278 example0, example1, example2, example3, example4, example5 :: [Text]
279 example0 = ["New-York is New-York and New-York"]
280 example1 = ["to-be or not to-be"]
281 example2 = ["to-be-or not to-be-or NOT to-be and"]
282 example3 = example0 <> example0
283 -- > TEST: Should not have York New in the trie
284 example4 = ["a-b-c-d e a-b-c-d f"]
285 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
286
287 runTests :: IO ()
288 runTests =
289 forM_
290 [("example0", 2, example0)
291 ,("example1", 2, example1)
292 ,("example2", 3, example2)
293 ,("example3", 2, example3)
294 ,("example4", 4, example4)
295 ,("example5", 5, example5)
296 ]
297 (\(name, n, ex) -> do
298 b <- testEleve False n ex
299 P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
300 )