]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Eleve.hs
Eleve: tweaks
[gargantext.git] / src / Gargantext / Text / Eleve.hs
1 {-# LANGUAGE NoImplicitPrelude #-}
2 {-# LANGUAGE OverloadedStrings #-}
3 {-
4
5 Implementation of EleVe Python version of papers:
6
7
8 -}
9 module Gargantext.Text.Eleve where
10
11
12 import Data.Ord (Ord)
13 import qualified Data.List as L
14 import Data.Monoid
15 import Data.Text (Text)
16 import qualified Data.Text as T
17 import Data.Map (Map)
18 import Data.Maybe (fromMaybe)
19 import qualified Data.Map as Map
20 import Gargantext.Prelude
21
22 -- prop (Node c _e f) = c == Map.size f
23 -- TODO remove Leaf
24 -- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
25
26 example :: [[Token]]
27 example = map token
28 $ chunkAlong 3 1
29 $ T.words "New York and New York is a big apple"
30
31 data Token = NonTerminal Text | Terminal
32 deriving (Ord, Eq, Show)
33
34 token :: [Text] -> [Token]
35 token xs = (NonTerminal <$> xs) <> [Terminal]
36
37 data Trie k e
38 = Node { _node_count :: Int
39 , _node_entropy :: e
40 , _node_children :: Map k (Trie k e)
41 }
42 -- | Leaf { _node_count :: Int }
43 deriving (Show)
44
45 -- emptyTrie :: Trie k e
46 -- emptyTrie = Leaf 0
47 emptyTrie :: (Ord k, Monoid e) => Trie k e
48 emptyTrie = Node 0 mempty mempty
49
50 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
51 mkTrie c children
52 {-| Map.null children = Leaf c
53 | otherwise -} = Node c mempty children
54
55 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
56 insertTrie [] n = n
57 -- insertTrie (x:xs) (Leaf c) = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie)
58 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
59 where
60 f = Just . insertTrie xs . fromMaybe emptyTrie
61
62 insertTries :: Ord k => [[k]] -> Trie k ()
63 insertTries = L.foldr insertTrie emptyTrie
64
65 entropyTrie :: (k -> Bool) -> Trie k () -> Trie k Double
66 -- entropyTrie _ (Leaf c) = Leaf c
67 entropyTrie pred (Node c _e children) = Node c e (entropyTrie pred <$> children)
68 where
69 e = sum $ f <$> Map.toList children
70 f (k, child) = if pred k then cfc * log (fromIntegral c) else - cfc * log cfc
71 where
72 cfc = fromIntegral (_node_count child) / fromIntegral c
73
74 normalizeEntropy :: Trie k Double -> Trie k Double
75 -- normalizeEntropy (Leaf c) = Leaf c
76 normalizeEntropy (Node c e children) =
77 Node c e $ normalizeLevel m v . normalizeEntropy <$> children
78 where
79 es = _node_entropy <$> Map.elems children
80 m = mean es
81 v = variance es
82
83 normalizeLevel :: Double -> Double -> Trie k Double -> Trie k Double
84 -- normalizeLevel _ _ (Leaf c) = Leaf c
85 -- normalizeLevel m v (Node c e children) = Node c ((e - m) / v) children
86 normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) / v }
87
88 buildTrie :: [[Token]] -> Trie Token Double
89 buildTrie = normalizeEntropy . entropyTrie (== Terminal) . insertTries
90
91 subForest :: Trie k e -> [Trie k e]
92 -- subForest (Leaf _) = []
93 subForest (Node _ _ children) = Map.elems children
94
95 levels :: Trie k e -> [[Trie k e]]
96 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
97
98 entropyLevels :: Trie k e -> [[e]]
99 entropyLevels = fmap (fmap _node_entropy) . levels
100
101 normalizeEntropy' :: Trie k Double -> Trie k Double
102 normalizeEntropy' t = go (entropyLevels t) t
103 where
104 go :: [[Double]] -> Trie k Double -> Trie k Double
105 go [] _ = panic "normalizeEntropy' empty levels"
106 -- go _ (Leaf c) = Leaf c
107 go (es : ess) (Node c e children) =
108 Node c e (normalizeLevel m v . go ess <$> children)
109 where
110 m = mean es
111 v = variance es
112
113 buildTrie' :: [[Token]] -> Trie Token Double
114 buildTrie' = normalizeEntropy' . entropyTrie (== Terminal) . insertTries