]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
[ELEVE] Ngrams, still NaN.
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-# LANGUAGE NoImplicitPrelude #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-
4
5 Implementation of EleVe Python version of papers:
6
7
8 -}
9 module Gargantext.Text.Eleve where
10
11 import Debug.Trace (trace)
12 import Debug.SimpleReflect
13
14 import Control.Monad (foldM)
15 import Data.Ord (Ord)
16 import qualified Data.List as L
17 import Data.Monoid
18 import Data.Text (Text)
19 import qualified Data.Text as T
20 import Data.Map (Map)
21 import Data.Maybe (fromMaybe)
22 import qualified Data.Map as Map
23 import Gargantext.Prelude
24 import qualified Data.Tree as Tree
25 import Data.Tree (Tree)
26 import qualified Prelude as P (putStrLn, logBase, String)
27
28 -- prop (Node c _e f) = c == Map.size f
29 -- TODO maybe add Leaf
30 -- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
31
32 --test = split t ts
33 test n example = do
34 let
35 ex = toToken n example
36 t = buildTrie $ chunkAlong n 1 ex
37
38 P.putStrLn $ Tree.drawTree
39 $ fmap show
40 $ toTree (NonTerminal "") t
41
42 pure $ map unToken $ split t t [] ex
43
44
45 example' = T.words "New York and New York"
46 example'' = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
47
48
49 data Token = NonTerminal Text
50 | Terminal
51 deriving (Ord, Eq, Show)
52
53 toToken :: Int -> [Text] -> [Token]
54 toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
55
56 unToken :: [Token] -> [Text]
57 unToken = map f
58 where
59 f (NonTerminal x) = x
60 f Terminal = ""
61
62
63 data Trie k e
64 = Node { _node_count :: Int
65 , _node_entropy :: e
66 , _node_children :: Map k (Trie k e)
67 }
68 | Leaf { _node_count :: Int }
69 deriving (Show)
70
71 toTree :: k -> Trie k e -> Tree (k,Int,e)
72 toTree k (Node c e cs) = Tree.Node (k, c, e) (map (uncurry toTree) $ Map.toList cs)
73
74 -- emptyTrie :: Trie k e
75 emptyTrie :: (Ord k, Monoid e) => Int -> Trie k e
76 --emptyTrie n = Node n mempty mempty
77 emptyTrie = Leaf
78
79 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
80 mkTrie c children
81 {-| Map.null children = Leaf c
82 | otherwise -} = Node c mempty children
83
84 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
85 insertTrie [] n = n { _node_count = _node_count n +1}
86 -- insertTrie (x:xs) (Leaf c) = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie)
87 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
88 where
89 f = Just . insertTrie xs . fromMaybe (emptyTrie 0)
90
91 insertTries :: Ord k => [[k]] -> Trie k ()
92 insertTries = L.foldr insertTrie (emptyTrie 1)
93
94 entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
95 -- entropyTrie _ (Leaf c) = Leaf c
96 entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
97 where
98 e = sum $ map f $ Map.toList children
99 f (k, child) = if pred k then cfc * P.logBase 2 (fromIntegral c)
100 else - cfc * P.logBase 2 cfc
101 where
102 cfc = fromIntegral (_node_count child) / fromIntegral c
103
104 normalizeEntropy :: (Fractional e, Floating e, Show e) => Trie k e -> Trie k e
105 -- normalizeEntropy (Leaf c) = Leaf c
106 normalizeEntropy (Node c e children) =
107 trace (show $ L.length es) $ Node c e $ map (normalizeLevel m v . normalizeEntropy) children
108 where
109 es = map _node_entropy $ Map.elems children
110 m = mean es
111 v = deviation es
112
113 normalizeLevel :: (Fractional e, Floating e, Show e) => e -> e -> Trie k e -> Trie k e
114 -- normalizeLevel _ _ (Leaf c) = Leaf c
115 --normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) }
116 normalizeLevel m v n = trace (show (_node_entropy n,m,v)) $ n { _node_entropy = (_node_entropy n - m) / v}
117
118 buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
119 buildTrie = normalizeEntropy . entropyTrie (== Terminal) . insertTries
120
121 subForest :: Trie k e -> [Trie k e]
122 -- subForest (Leaf _) = []
123 subForest (Node _ _ children) = Map.elems children
124
125 levels :: Trie k e -> [[Trie k e]]
126 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
127
128 entropyLevels :: Trie k e -> [[e]]
129 entropyLevels = fmap (fmap _node_entropy) . levels
130
131 normalizeEntropy' :: (Floating e, Show e) => Trie k e -> Trie k e
132 normalizeEntropy' t = go (entropyLevels t) t
133 where
134 go :: (Floating e, Show e) => [[e]] -> Trie k e -> Trie k e
135 go [] _ = panic "normalizeEntropy' empty levels"
136 -- go _ (Leaf c) = Leaf c
137 go (es : ess) (Node c e children) =
138 Node c e (normalizeLevel m v . go ess <$> children)
139 where
140 m = mean es
141 v = deviation es
142
143 buildTrie' :: (Floating e, Show e) => [[Token]] -> Trie Token e
144 buildTrie' = normalizeEntropy' . entropyTrie (== Terminal) . insertTries
145
146 ------------------------------------------------------------------------
147
148 autonomie :: Trie Token e -> Token -> e
149 autonomie trie t = case (Map.lookup t (_node_children trie)) of
150 Nothing -> panic $ "Gargantext.Text.Ngrams: autonomie" <> (cs $ show t)
151 Just a -> _node_entropy a
152
153 ------------------------------------------------------------------------
154
155 split :: (Num e, Ord e) => Trie Token e -> Trie Token e -> [Token] -> [Token] -> [[Token]]
156 split _ _ pref [] = [reverse pref]
157 split t0 t pref (x:xs) = case Map.lookup x $ _node_children t of
158 Nothing -> reverse pref : split t0 t0 [x] xs
159 Just a -> case Map.lookup x $ _node_children t0 of
160 Nothing -> panic "TODO" -- reverse pref : split t0 t0 [] xs
161 Just xt0 -> case _node_entropy t + _node_entropy xt0 > _node_entropy a of
162 True -> split t0 a (x:pref) xs
163 False -> reverse pref : split t0 xt0 [x] xs