]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
[FEAT] Ngrams extraction unsupervized (doc, type, function).
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-# LANGUAGE NoImplicitPrelude #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE RankNTypes #-}
4 {-# LANGUAGE TemplateHaskell #-}
5 {-
6 # Implementation of Unsupervized Word Segmentation
7
8 References:
9 - EleVe Python implementation and discussions with Korantin August and Bruno Gaume
10 [git repo](https://github.com/kodexlab/eleve.git)
11
12 - Unsupervized Word Segmentation:the case for Mandarin Chinese Pierre
13 Magistry, Benoît Sagot, Alpage, INRIA & Univ. Paris 7, Proceedings of
14 the 50th Annual Meeting of the Association for Computational Linguistics
15 , pages 383–387. [PDF](https://www.aclweb.org/anthology/P12-2075)
16
17 Notes for current implementation:
18 - The node count is correct; TODO add tests to keep track of it
19 - NP fix normalization
20 - NP extract longer ngrams (see paper above, viterbi algo can be used)
21 - TODO TEST: prop (Node c _e f) = c == Map.size f
22
23 - AD: Real ngrams extraction test
24 from Gargantext.Text.Terms import extractTermsUnsupervised
25 docs <- runCmdRepl $ selectDocs 1004
26 extractTermsUnsupervised 3 $ DT.intercalate " "
27 $ catMaybes
28 $ Gargantext.map _hyperdataDocument_abstract docs
29
30
31 -}
32 module Gargantext.Text.Eleve where
33
34 import Debug.Trace (trace)
35 -- import Debug.SimpleReflect
36
37 import Control.Lens (Lens, Lens', ASetter, Getting, (^.), (^?), (&), (.~), (%~), view, makeLenses, _Just)
38 import Control.Monad (foldM)
39 import Data.Ord (Ord)
40 import qualified Data.List as L
41 import Data.Monoid
42 import Data.Text (Text)
43 import qualified Data.Text as T
44 import Data.Map (Map)
45 import Data.Maybe (fromMaybe, catMaybes)
46 import qualified Data.Map as Map
47 import Gargantext.Prelude hiding (cs)
48 import qualified Data.Tree as Tree
49 import Data.Tree (Tree)
50 import qualified Prelude as P (putStrLn, logBase, String)
51
52 ------------------------------------------------------------------------
53 data I e = I
54 { _info_entropy :: e
55 , _info_norm_entropy :: e
56 , _info_norm_entropy' :: e
57 }
58
59 instance Show e => Show (I e) where
60 show (I e n n') = show (e, n, n')
61
62 makeLenses ''I
63
64 type ModEntropy i o e = (e -> e) -> i -> o
65
66 setNormEntropy :: ModEntropy e (I e) e
67 setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
68
69 testEleve n example = do
70 let
71 ex = toToken n <$> example
72 t = buildTrie $ L.concat $ chunkAlong n 1 <$> ex
73 nt = normalizeEntropy identity setNormEntropy (t :: Trie Token Double)
74 nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
75 {-
76 P.putStrLn $ Tree.drawTree
77 $ fmap show
78 $ toTree (NonTerminal "") nt'
79 --}
80 pure $ map unToken $ split info_entropy nt' $ L.concat ex
81 -- NP: here we use the entropy to split
82 -- instead we should use either:
83 -- info_norm_entropy or info_norm_entropy'
84 -- However they should first be fixed.
85
86 -- | TODO real data is a list of tokenized sentences
87 example0 = [T.words "New York is New York and New York"]
88 example1 = [T.words "to be or not to be"]
89 example2 = [T.words "to be or not to be or"]
90 example3 = example0 <> example0 -- > TEST: Should not have York New in the trie
91 example4 = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
92
93 ------------------------------------------------------------------------
94 ------------------------------------------------------------------------
95 data Token = NonTerminal Text
96 | Terminal
97 deriving (Ord, Eq, Show)
98
99 toToken :: Int -> [Text] -> [Token]
100 toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
101
102 unToken :: [Token] -> [Text]
103 unToken = map f
104 where
105 f (NonTerminal x) = x
106 f Terminal = ""
107
108 ------------------------------------------------------------------------
109
110 data Trie k e
111 = Node { _node_count :: Int
112 , _node_entropy :: e
113 , _node_children :: Map k (Trie k e)
114 }
115 | Leaf { _node_count :: Int }
116 deriving (Show)
117
118
119 insertTries :: Ord k => [[k]] -> Trie k ()
120 insertTries = L.foldr insertTrie emptyTrie
121
122 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
123 insertTrie [] n = n { _node_count = _node_count n +1}
124 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
125 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
126 where
127 f = Just . insertTrie xs . fromMaybe emptyTrie
128
129 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
130 -- emptyTrie = Node 0 mempty mempty
131 emptyTrie :: Trie k e
132 emptyTrie = Leaf 0
133
134 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
135 mkTrie c children
136 | Map.null children = Leaf c
137 | otherwise = Node c mempty children
138
139 -----------------------------
140
141 -- | Trie to Tree since Tree as nice print function
142 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
143 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
144 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
145
146 ------------------------------------------------------------------------
147 ------------------------------------------------------------------------
148
149 entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
150 entropyTrie _ (Leaf c) = Leaf c
151 entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
152 where
153 e = sum $ map f $ Map.toList children
154 f (k, child) = if pred k then chc * P.logBase 2 (fromIntegral c)
155 else - chc * P.logBase 2 chc
156 where
157 chc = fromIntegral (_node_count child) / fromIntegral c
158
159 normalizeEntropy :: (Fractional e, Floating e, Show e)
160 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
161 normalizeEntropy inE modE = go $ modE identity
162 where
163 go _ (Leaf c) = Leaf c
164 go f (Node c i children) | not (Map.null children) =
165 -- trace (show $ L.length es) $
166 Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
167 where
168 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
169 m = mean es
170 v = deviation es
171 ------------------------------------------------------------------------
172
173 normalizeLevel :: (Fractional e, Floating e, Show e)
174 => e -> e -> e -> e
175 normalizeLevel m v e = (e - m) / v
176
177 buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
178 buildTrie = entropyTrie (== Terminal) . insertTries
179
180 nodeEntropy :: Trie k e -> Maybe e
181 nodeEntropy (Node _ e _) = Just e
182 nodeEntropy (Leaf _) = Nothing
183
184 nodeChildren :: Trie k e -> Map k (Trie k e)
185 nodeChildren (Node _ _ cs) = cs
186 nodeChildren (Leaf _) = Map.empty
187
188 nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
189 nodeChild k (Node _ _ cs) = Map.lookup k cs
190 nodeChild _ (Leaf _) = Nothing
191
192 levels :: Trie k e -> [[Trie k e]]
193 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
194 where
195 subForest :: Trie k e -> [Trie k e]
196 subForest (Leaf _) = []
197 subForest (Node _ _ children) = Map.elems children
198
199 entropyLevels :: Getting e i e -> Trie k i -> [[e]]
200 entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
201
202 --fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
203 --fwd inE outE s = s & outE .~ (s ^. inE)
204
205 normalizeEntropy' :: (Fractional e, Floating e, Show e)
206 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
207 normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
208 where
209 go _ [] _ = panic "normalizeEntropy' empty levels"
210 go _ _ (Leaf c) = Leaf c
211 go _ ([] : _) _ = panic "normalizeEntropy': empty level"
212 go f (es : ess) (Node c i children) =
213 Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
214 where
215 m = mean es
216 v = deviation es
217
218 ------------------------------------------------------------------------
219 ------------------------------------------------------------------------
220 split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
221 split inE t0 = go t0 []
222 where
223 consRev [] xss = xss
224 consRev xs xss = reverse xs : xss
225
226 go _ pref [] = [reverse pref]
227 go t pref (x:xs) = case nodeChild x t of
228 Nothing -> consRev pref $ go t0 [x] xs
229 Just xt -> case nodeChild x t0 of
230 Nothing -> panic "TODO"
231 Just xt0 ->
232 let et = ne (panic "t") t
233 ext0 = ne (panic "xt0") xt0
234 ext = ne 0 xt
235 in
236 -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
237 case et {-+ ext0-} < ext of
238 -- NP: here we must take ext0 in account however currently it
239 -- makes it worse.
240 -- For instance it currently works well to 2-grams but not more.
241 -- PASS: test 4 example1
242 -- FAIL: test 4 example2
243 True -> go xt (x:pref) xs
244 False -> consRev pref $ go xt0 [x] xs
245
246 ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)