{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes        #-}
{-# LANGUAGE TemplateHaskell   #-}
{-

Implementation of the EleVe Python version of the papers:

NP:
* The node count is correct and we should not regress on this front.
-}
module Gargantext.Text.Eleve where

import Debug.Trace (trace)
-- import Debug.SimpleReflect

import Control.Lens (Lens, Lens', ASetter, Getting, (^.), (^?), (&), (.~), (%~), view, makeLenses, _Just)
import Control.Monad (foldM)
import Data.Ord (Ord)
import qualified Data.List as L
import Data.Monoid
import Data.Text (Text)
import qualified Data.Text as T
import Data.Map (Map)
import Data.Maybe (fromMaybe, catMaybes)
import qualified Data.Map as Map
import Gargantext.Prelude hiding (cs)
import qualified Data.Tree as Tree
import Data.Tree (Tree)
import qualified Prelude as P (putStrLn, logBase, String)

-- prop (Node c _e f) = c == Map.size f
-- TODO: maybe add Leaf
-- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)

data I e = I
  { _info_entropy       :: e
  , _info_norm_entropy  :: e
  , _info_norm_entropy' :: e
  }

instance Show e => Show (I e) where
  show (I e n n') = show (e, n, n')

makeLenses ''I

type ModEntropy i o e = (e -> e) -> i -> o

setNormEntropy :: ModEntropy e (I e) e
setNormEntropy f e = I e (f e) e -- (panic "setNormEntropy")
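-- A worked example of 'setNormEntropy': the normalized field gets @f e@,
-- while the third field is left as the raw entropy for now (see the
-- comment above):
--
-- >>> setNormEntropy (* 2) 1.5 :: I Double
-- (1.5,3.0,1.5)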

test :: Int -> [Text] -> IO [[Text]]
test n example = do
  let
    ex  = toToken n example
    t   = buildTrie $ chunkAlong n 1 ex
    nt  = normalizeEntropy  identity setNormEntropy (t :: Trie Token Double)
    nt' = normalizeEntropy' info_entropy (\f -> info_norm_entropy' %~ f) nt

  P.putStrLn $ Tree.drawTree
             $ fmap show
             $ toTree (NonTerminal "") nt'

  pure $ map unToken $ split info_entropy nt' ex
  -- NP: here we use the raw entropy to split;
  -- instead we should use either info_norm_entropy or info_norm_entropy'.
  -- However, those should first be fixed.


example0, example1, example2, example3 :: [Text]
example0 = T.words "New York is New York and New York"
example1 = T.words "to be or not to be"
example2 = T.words "to be or not to be or"
example3 = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
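-- Example session (illustrative; the trie drawing is omitted here and the
-- exact segmentation depends on the entropies computed above). The intended
-- segmentation groups the recurring bigram:
--
-- >>> test 2 example0
-- [["New","York"],["is"],["New","York"],["and"],["New","York"]]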


data Token = NonTerminal Text
           | Terminal
  deriving (Ord, Eq, Show)

toToken :: Int -> [Text] -> [Token]
toToken n xs = (NonTerminal <$> xs) <> L.replicate n Terminal
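-- For instance, the sentinel padding makes every n-gram window end in a
-- Terminal:
--
-- >>> toToken 2 ["to","be"]
-- [NonTerminal "to",NonTerminal "be",Terminal,Terminal]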

unToken :: [Token] -> [Text]
unToken = map f
  where
    f (NonTerminal x) = x
    f Terminal        = ""


data Trie k e
  = Node { _node_count    :: Int
         , _node_entropy  :: e
         , _node_children :: Map k (Trie k e)
         }
  | Leaf { _node_count :: Int }
  deriving (Show)

toTree :: k -> Trie k e -> Tree (k, Int, Maybe e)
toTree k (Leaf c)      = Tree.Node (k, c, Nothing) []
toTree k (Node c e cs) = Tree.Node (k, c, Just e) (map (uncurry toTree) $ Map.toList cs)

-- emptyTrie :: (Ord k, Monoid e) => Trie k e
-- emptyTrie = Node 0 mempty mempty
emptyTrie :: Trie k e
emptyTrie = Leaf 0

mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
mkTrie c children
  | Map.null children = Leaf c
  | otherwise         = Node c mempty children
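-- Note: 'mkTrie' maintains the invariant that a 'Node' always has a
-- non-empty children map (childless nodes collapse to 'Leaf').
-- 'normalizeEntropy' below relies on this when it matches only nodes
-- with non-empty children.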

insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
insertTrie []     n                    = n { _node_count = _node_count n + 1 }
insertTrie (x:xs) (Leaf c)             = mkTrie (c + 1) $ Map.singleton x $ insertTrie xs emptyTrie
insertTrie (x:xs) (Node c _e children) = mkTrie (c + 1) $ Map.alter f x children
  where
    f = Just . insertTrie xs . fromMaybe emptyTrie

insertTries :: Ord k => [[k]] -> Trie k ()
insertTries = L.foldr insertTrie emptyTrie
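-- Each insertion bumps the count of every node along its path, so the root
-- count equals the number of inserted sequences:
--
-- >>> _node_count (insertTries [[Terminal], [Terminal], [NonTerminal "a"]])
-- 3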

entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
entropyTrie _    (Leaf c)             = Leaf c
entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
  where
    e = sum $ map f $ Map.toList children
    f (k, child) = if pred k then   cfc * P.logBase 2 (fromIntegral c)
                             else - cfc * P.logBase 2 cfc
      where
        cfc = fromIntegral (_node_count child) / fromIntegral c
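-- The terminal branch is the usual entropy formula in disguise: the
-- terminal child stands for @_node_count child@ distinct continuations of
-- probability @1/c@ each, and summing @-(1/c) * logBase 2 (1/c)@ over those
-- occurrences gives exactly @cfc * logBase 2 c@.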

normalizeEntropy :: (Fractional e, Floating e, Show e)
                 => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy inE modE = go $ modE identity
  where
    go _ (Leaf c) = Leaf c
    go f (Node c i children) | not (Map.null children) =
        -- trace (show $ L.length es) $
        Node c (f i) $ go (modE $ normalizeLevel m v) <$> children
      where
        es = [ i' ^. inE | Node _ i' _ <- Map.elems children ]
        m  = mean      es
        v  = deviation es

normalizeLevel :: (Fractional e, Floating e, Show e)
               => e -> e -> e -> e
normalizeLevel m v e = (e - m) / v
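-- 'normalizeLevel' is a z-score: it recentres a node's entropy by the mean
-- and standard deviation of its cohort (siblings in 'normalizeEntropy',
-- whole trie levels in 'normalizeEntropy'').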

buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
buildTrie = entropyTrie (== Terminal) . insertTries

subForest :: Trie k e -> [Trie k e]
subForest (Leaf _)            = []
subForest (Node _ _ children) = Map.elems children

nodeEntropy :: Trie k e -> Maybe e
nodeEntropy (Node _ e _) = Just e
nodeEntropy (Leaf _)     = Nothing

nodeChildren :: Trie k e -> Map k (Trie k e)
nodeChildren (Node _ _ cs) = cs
nodeChildren (Leaf _)      = Map.empty

nodeChild :: Ord k => k -> Trie k e -> Maybe (Trie k e)
nodeChild k (Node _ _ cs) = Map.lookup k cs
nodeChild _ (Leaf _)      = Nothing

levels :: Trie k e -> [[Trie k e]]
levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure

entropyLevels :: Getting e i e -> Trie k i -> [[e]]
entropyLevels inE = fmap (fmap (view inE) . catMaybes . fmap nodeEntropy) . levels
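-- For example, 'levels' enumerates the trie breadth-first ([root], its
-- children, its grandchildren, ...) and stops at the first empty level;
-- 'entropyLevels' then projects each level to the entropies of its
-- non-leaf nodes, which is what the per-level normalisation consumes.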

--fwd :: Getting a s a -> ASetter s t u3 a -> s -> t
--fwd inE outE s = s & outE .~ (s ^. inE)

normalizeEntropy' :: (Fractional e, Floating e, Show e)
                  => Getting e i e -> ModEntropy i o e -> Trie k i -> Trie k o
normalizeEntropy' inE modE t = go (modE identity) (entropyLevels inE t) t
  where
    go _ []         _                   = panic "normalizeEntropy' empty levels"
    go _ _          (Leaf c)            = Leaf c
    go _ ([] : _)   _                   = panic "normalizeEntropy': empty level"
    go f (es : ess) (Node c i children) =
        Node c (f i) $ go (modE $ normalizeLevel m v) ess <$> children
      where
        m = mean      es
        v = deviation es
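-- Unlike 'normalizeEntropy', which normalizes each node against its
-- siblings only, 'normalizeEntropy'' normalizes against all nodes of the
-- same depth across the whole trie, using the levels computed up front.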

------------------------------------------------------------------------

split :: (Num e, Ord e, Show e) => Lens' i e -> Trie Token i -> [Token] -> [[Token]]
split inE t0 = go t0 []
  where
    ne d t = fromMaybe d (nodeEntropy t ^? _Just . inE)

    consRev [] xss = xss
    consRev xs xss = reverse xs : xss

    go _ pref []     = [reverse pref]
    go t pref (x:xs) = case nodeChild x t of
      Nothing -> consRev pref $ go t0 [x] xs
      Just xt -> case nodeChild x t0 of
        Nothing  -> panic "TODO"
        Just xt0 ->
          let et   = ne (panic "t")   t
              ext0 = ne (panic "xt0") xt0
              ext  = ne 0 xt
          in
          -- trace (show ((reverse pref, et, ext0), (reverse (x : pref), ext))) $
          case et {-+ ext0-} < ext of
            -- NP: here we must take ext0 into account; however, currently it
            -- makes the results worse.
            -- For instance, it currently works well for 2-grams but not beyond.
            -- PASS: test 4 example1
            -- FAIL: test 4 example2
            True  -> go xt (x:pref) xs
            False -> consRev pref $ go xt0 [x] xs
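
-- A minimal wiring sketch for 'split' on its own, mirroring 'test' above.
-- 'splitDemo' is a hypothetical helper (not part of the original code); it
-- assumes 'chunkAlong' from Gargantext.Prelude behaves as used in 'test'.
splitDemo :: Int -> [Text] -> [[Text]]
splitDemo n ws = map unToken $ split info_entropy nt toks
  where
    toks = toToken n ws
    -- Build the counted trie over n-gram windows, score it with entropy,
    -- then attach the (not yet used here) normalized fields.
    nt   = normalizeEntropy identity setNormEntropy
             (buildTrie (chunkAlong n 1 toks) :: Trie Token Double)

-- >>> splitDemo 2 example1   -- segments "to be or not to be"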