1 {-# LANGUAGE NoImplicitPrelude #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-# LANGUAGE RankNTypes #-}
4 {-# LANGUAGE TemplateHaskell #-}
7 Implementation of EleVe Python version of papers:
11 * The node count is correct and we should not regress on this front.
13 module Gargantext.Text.Eleve where
15 import Debug.Trace (trace)
16 -- import Debug.SimpleReflect
18 import Control.Lens (Lens, Lens', ASetter, Getting, (^.), (^?), (&), (.~), (%~), view, makeLenses, _Just)
19 import Control.Monad (foldM)
21 import qualified Data.List as L
23 import Data.Text (Text)
24 import qualified Data.Text as T
26 import Data.Maybe (fromMaybe, catMaybes)
27 import qualified Data.Map as Map
28 import Gargantext.Prelude hiding (cs)
29 import qualified Data.Tree as Tree
30 import Data.Tree (Tree)
31 import qualified Prelude as P (putStrLn, logBase, String)
33 -- prop (Node c _e f) = c == Map.size f
34 -- TODO maybe add Leaf
35 -- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
39 , _info_norm_entropy :: e
40 , _info_norm_entropy' :: e
43 instance Show e => Show (I e) where
44 show (I e n n') = show (e, n, n')
48 type ModEntropy i o e = (e -> e) -> i -> o
50 setNormEntropy :: ModEntropy e (I e) e
51 setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
55 ex = toToken n example
56 t = buildTrie $ chunkAlong n 1 ex
57 nt = normalizeEntropy identity setNormEntropy (t :: Trie Token Double)
58 nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt
60 P.putStrLn $ Tree.drawTree
62 $ toTree (NonTerminal "") nt'
64 pure $ map unToken $ split info_entropy nt' ex
65 -- NP: here we use the entropy to split
66 -- instead we should use either:
67 -- info_norm_entropy or info_norm_entropy'
68 -- However they should first be fixed.
71 example0 = T.words "New York is New York and New York"
72 example1 = T.words "to be or not to be"
73 example2 = T.words "to be or not to be or"
74 example3 = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
77 data Token = NonTerminal Text
79 deriving (Ord, Eq, Show)
81 toToken :: Int -> [Text] -> [Token]
82 toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
84 unToken :: [Token] -> [Text]
92 = Node { _node_count :: Int
94 , _node_children :: Map k (Trie k e)
96 | Leaf { _node_count :: Int }
99 toTree :: k -> Trie k e -> Tree (k,Int,Maybe e)
100 toTree k (Leaf c) = Tree.Node (k, c, Nothing) []
101 toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)
103 -- emptyTrie :: (Ord k, Monoid e) => Trie k e
104 -- emptyTrie = Node 0 mempty mempty
105 emptyTrie :: Trie k e
108 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
110 | Map.null children = Leaf c
111 | otherwise = Node c mempty children
113 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
114 insertTrie [] n = n { _node_count = _node_count n +1}
115 insertTrie (x:xs) (Leaf c) = mkTrie (c+1) $ Map.singleton x $ insertTrie xs emptyTrie
116 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
118 f = Just . insertTrie xs . fromMaybe emptyTrie
120 insertTries :: Ord k => [[k]] -> Trie k ()
121 insertTries = L.foldr insertTrie emptyTrie
123 entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
124 entropyTrie _ (Leaf c) = Leaf c
125 entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
127 e = sum $ map f $ Map.toList children
128 f (k, child) = if pred k then cfc * P.logBase 2 (fromIntegral c)
129 else - cfc * P.logBase 2 cfc
131 cfc = fromIntegral (_node_count child) / fromIntegral c
133 normalizeEntropy :: (Fractional e, Floating e, Show e)
134 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
135 normalizeEntropy inE modE = go $ modE identity
137 go _ (Leaf c) = Leaf c
138 go f (Node c i children) | not (Map.null children) =
139 -- trace (show $ L.length es) $
140 Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
142 es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
146 normalizeLevel :: (Fractional e, Floating e, Show e)
148 normalizeLevel m v e = (e - m) / v
150 buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
151 buildTrie = entropyTrie (== Terminal) . insertTries
153 subForest :: Trie k e -> [Trie k e]
154 subForest (Leaf _) = []
155 subForest (Node _ _ children) = Map.elems children
157 nodeEntropy :: Trie k e -> Maybe e
158 nodeEntropy (Node _ e _) = Just e
159 nodeEntropy (Leaf _) = Nothing
161 nodeChildren :: Trie k e -> Map k (Trie k e)
162 nodeChildren (Node _ _ cs) = cs
163 nodeChildren (Leaf _) = Map.empty
165 nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
166 nodeChild k (Node _ _ cs) = Map.lookup k cs
167 nodeChild _ (Leaf _) = Nothing
169 levels :: Trie k e -> [[Trie k e]]
170 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
172 entropyLevels :: Getting e i e -> Trie k i -> [[e]]
173 entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
175 --fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
176 --fwd inE outE s = s & outE .~ (s ^. inE)
178 normalizeEntropy' :: (Fractional e, Floating e, Show e)
179 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
180 normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
182 go _ [] _ = panic "normalizeEntropy' empty levels"
183 go _ _ (Leaf c) = Leaf c
184 go _ ([] : _) _ = panic "normalizeEntropy': empty level"
185 go f (es : ess) (Node c i children) =
186 Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
191 ------------------------------------------------------------------------
193 split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
194 split inE t0 = go t0 []
196 ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)
198 consRev xs xss = reverse xs : xss
200 go _ pref [] = [reverse pref]
201 go t pref (x:xs) = case nodeChild x t of
202 Nothing -> consRev pref $ go t0 [x] xs
203 Just xt -> case nodeChild x t0 of
204 Nothing -> panic "TODO"
206 let et = ne (panic "t") t
207 ext0 = ne (panic "xt0") xt0
210 -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
211 case et {-+ ext0-} < ext of
212 -- NP: here we must take ext0 in account howover currently it
214 -- For instance it currently works well to 2-grams but not more.
215 -- PASS: test 4 example1
216 -- FAIL: test 4 example2
217 True -> go xt (x:pref) xs
218 False -> consRev pref $ go xt0 [x] xs