]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
Merge branch 'dev' of ssh://delanoe.org/haskell-gargantext into dev
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-|
2 Module : Gargantext.Text.Eleve
3 Description : Unsupervized Word segmentation
4 Copyright : (c) CNRS, 2019-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 # Implementation of Unsupervized Word Segmentation
11
12 References:
13 - EleVe Python implementation and discussions with Korantin August and Bruno Gaume
14 [git repo](https://github.com/kodexlab/eleve.git)
15
16 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
17 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
18 the 50th Annual Meeting of the Association for Computational Linguistics
19 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
20
21 Notes for current implementation:
22 - The node count is correct; TODO AD add tests to keep track of it
23 - NP fix normalization
24 - NP extract longer ngrams (see paper above, viterbi algo can be used)
25 - TODO AD TEST: prop (Node c _e f) = c == Map.size f
26
27 - AD: Real ngrams extraction test
28 from Gargantext.Text.Terms import extractTermsUnsupervised
29 docs <- runCmdRepl $ selectDocs 1004
30 extractTermsUnsupervised 3 $ DT.intercalate " "
31 $ catMaybes
32 $ Gargantext.map _hyperdataDocument_abstract docs
33
34
35 -}
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE OverloadedStrings #-}
38 {-# LANGUAGE RankNTypes #-}
39 {-# LANGUAGE TemplateHaskell #-}
40
41 module Gargantext.Text.Eleve where
42
43 -- import Debug.Trace (trace)
44 -- import Debug.SimpleReflect
45
46 import Control.Lens (Lens', Getting, (^.), (^?), (%~), view, makeLenses, _Just)
47 import Control.Monad (foldM, mapM_, forM_)
48 import Data.Ord (Ord)
49 import qualified Data.List as L
50 import Data.Monoid
51 import Data.Text (Text)
52 import qualified Data.Text as T
53 import Data.Map (Map)
54 import Data.Maybe (fromMaybe, catMaybes)
55 import qualified Data.Map as Map
56 import Gargantext.Prelude hiding (cs)
57 import qualified Data.Tree as Tree
58 import Data.Tree (Tree)
59 import qualified Prelude as P (putStrLn, logBase)
60
61 ------------------------------------------------------------------------
62 -- | Example and tests for development
63 data I e = I
64 { _info_entropy :: e
65 , _info_norm_entropy :: e
66 , _info_norm_entropy' :: e
67 }
68
69 instance Show e => Show (I e) where
70 show (I e n n') = show (e, n, n')
71
72 makeLenses ''I
73
74 type ModEntropy i o e = (e -> e) -> i -> o
75
76 setNormEntropy :: ModEntropy e (I e) e
77 setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
78
79 data Token = NonTerminal Text
80 | Terminal
81 deriving (Ord, Eq, Show)
82
83 toToken :: Int -> [Text] -> [Token]
84 toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
85
86 unToken :: [Token] -> [Text]
87 unToken = map f
88 where
89 f (NonTerminal x) = x
90 f Terminal = ""
91
92 ------------------------------------------------------------------------
93
94 data Trie k e
95 = Node { _node_count :: Int
96 , _node_entropy :: e
97 , _node_children :: Map k (Trie k e)
98 }
99 | Leaf { _node_count :: Int }
100 deriving (Show)
101
102 makeLenses ''Trie
103
104 insertTries :: Ord k => [[k]] -> Trie k ()
105 insertTries = L.foldr insertTrie emptyTrie
106
107 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
108 insertTrie [] n = n { _node_count = _node_count n +1}
109 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
110 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
111 where
112 f = Just . insertTrie xs . fromMaybe emptyTrie
113
114 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
115 -- emptyTrie = Node 0 mempty mempty
116 emptyTrie :: Trie k e
117 emptyTrie = Leaf 0
118
119 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
120 mkTrie c children
121 | Map.null children = Leaf c
122 | otherwise = Node c mempty children
123
124 -----------------------------
125
126 -- | Trie to Tree since Tree as nice print function
127 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
128 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
129 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
130
131 ------------------------------------------------------------------------
132 ------------------------------------------------------------------------
133
134 entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
135 entropyTrie _ (Leaf c) = Leaf c
136 entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
137 where
138 e = sum $ map f $ Map.toList children
139 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
140 else - chc * P.logBase 2 chc
141 where
142 chc = fromIntegral (_node_count child) / fromIntegral c
143
144 normalizeEntropy :: (Fractional e, Floating e, Show e)
145 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
146 normalizeEntropy inE modE = go $ modE identity
147 where
148 go _ (Leaf c) = Leaf c
149 go f (Node c i children)
150 | Map.null children =
151 panic "normalizeEntropy: impossible"
152 | otherwise =
153 -- trace (show $ L.length es) $
154 Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
155 where
156 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
157 m = mean es
158 v = deviation es
159 ------------------------------------------------------------------------
160
161 normalizeLevel :: (Fractional e, Floating e, Show e)
162 => e -> e -> e -> e
163 normalizeLevel m v e = (e - m) / v
164
165 buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
166 buildTrie = entropyTrie (== Terminal) . insertTries
167
168 nodeEntropy :: Trie k e -> Maybe e
169 nodeEntropy (Node _ e _) = Just e
170 nodeEntropy (Leaf _) = Nothing
171
172 nodeChildren :: Trie k e -> Map k (Trie k e)
173 nodeChildren (Node _ _ cs) = cs
174 nodeChildren (Leaf _) = Map.empty
175
176 nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
177 nodeChild k (Node _ _ cs) = Map.lookup k cs
178 nodeChild _ (Leaf _) = Nothing
179
180 findTrie :: Ord k => [k] -> Trie k e -> Maybe (Trie k e)
181 findTrie ks t = foldM (flip nodeChild) t ks
182
183 levels :: Trie k e -> [[Trie k e]]
184 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
185 where
186 subForest :: Trie k e -> [Trie k e]
187 subForest (Leaf _) = []
188 subForest (Node _ _ children) = Map.elems children
189
190 entropyLevels :: Getting e i e -> Trie k i -> [[e]]
191 entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
192
193 --fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
194 --fwd inE outE s = s & outE .~ (s ^. inE)
195
196 normalizeEntropy' :: (Fractional e, Floating e, Show e)
197 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
198 normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
199 where
200 go _ [] _ = panic "normalizeEntropy' empty levels"
201 go _ _ (Leaf c) = Leaf c
202 go _ ([] : _) _ = panic "normalizeEntropy': empty level"
203 go f (es : ess) (Node c i children) =
204 Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
205 where
206 m = mean es
207 v = deviation es
208
209 ------------------------------------------------------------------------
210 ------------------------------------------------------------------------
211 split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
212 split inE t0 = go t0 []
213 where
214 consRev [] xss = xss
215 consRev xs xss = reverse xs : xss
216
217 go _ pref [] = [reverse pref]
218 go _ pref (Terminal:_) = [reverse pref]
219 go t pref (x:xs) = case nodeChild x t of
220 Nothing -> consRev pref $ go t0 [x] xs
221 Just xt -> case nodeChild x t0 of
222 Nothing -> panic $ "TODO"
223 Just xt0 ->
224 let et = ne (panic "t") t
225 -- ^ entropy of the current prefix
226 ext0 = ne (panic "xt0") xt0
227 -- ^ entropy of [x]
228 ext = ne 0 xt
229 -- ^ entropy of the current prefix plus x
230 in
231 -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
232 if ext + ext0 > et
233 then go xt (x:pref) xs
234 else consRev pref $ go xt0 [x] xs
235
236 ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
237
238 ------------------------------------------------------------------------
239 ------------------------------------------------------------------------
240
241 mainEleve :: Int -> [[Text]] -> [[[Text]]]
242 mainEleve n input = map unToken . split identity (t :: Trie Token Double) <$> inp
243 where
244 inp = toToken (n - 1) <$> input
245 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
246 -- NP: here we use the entropy to split
247 -- instead we should use either:
248 -- info_norm_entropy or info_norm_entropy'
249 -- However they should first be fixed.
250
251 testEleve :: Bool -> Int -> [Text] -> IO Bool
252 testEleve debug n output = do
253 let
254 out = T.words <$> output
255 expected = fmap (T.splitOn "-") <$> out
256 input = (T.splitOn "-" =<<) <$> out
257 inp = toToken (n - 1) <$> input
258 t = buildTrie $ L.concat $ chunkAlong n 1 <$> inp
259 nt = normalizeEntropy identity setNormEntropy (t :: Trie Token Double)
260 nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
261 pss = [ (ps, findTrie ps t ^? _Just . node_entropy) -- . info_entropy)
262 | ps <- L.nub $ [ c
263 | m <- [1..n]
264 , cs <- chunkAlong m 1 <$> inp
265 , c <- cs
266 ]
267 ]
268 res = map unToken . split identity t <$> inp
269 when debug $ do
270 P.putStrLn (show input)
271 mapM_ (P.putStrLn . show) pss
272 P.putStrLn $ Tree.drawTree
273 $ fmap show
274 $ toTree (NonTerminal "") nt'
275 P.putStrLn $ show res
276 pure $ expected == res
277
278 -- | TODO real data is a list of tokenized sentences
279 example0, example1, example2, example3, example4, example5 :: [Text]
280 example0 = ["New-York is New-York and New-York"]
281 example1 = ["to-be or not to-be"]
282 example2 = ["to-be-or not to-be-or NOT to-be and"]
283 example3 = example0 <> example0
284 -- > TEST: Should not have York New in the trie
285 example4 = ["a-b-c-d e a-b-c-d f"]
286 example5 = ["a-b-c-d-e f a-b-c-d-e g a-b-c-d-e"]
287
288 runTests :: IO ()
289 runTests =
290 forM_
291 [("example0", 2, example0)
292 ,("example1", 2, example1)
293 ,("example2", 3, example2)
294 ,("example3", 2, example3)
295 ,("example4", 4, example4)
296 ,("example5", 5, example5)
297 ]
298 (\(name, n, ex) -> do
299 b <- testEleve False n ex
300 P.putStrLn $ name <> " " <> show n <> " " <> if b then "PASS" else "FAIL"
301 )