1 {-# LANGUAGE NoImplicitPrelude #-}
2 {-# LANGUAGE OverloadedStrings #-}
5 Implementation of EleVe Python version of papers:
9 module Gargantext.Text.Eleve where
13 import qualified Data.List as L
15 import Data.Text (Text)
16 import qualified Data.Text as T
18 import Data.Maybe (fromMaybe)
19 import qualified Data.Map as Map
20 import Gargantext.Prelude
22 -- prop (Node c _e f) = c == Map.size f
24 -- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
29 $ T.words "New York and New York is a big apple"
31 data Token = NonTerminal Text | Terminal
32 deriving (Ord, Eq, Show)
34 token :: [Text] -> [Token]
35 token xs = (NonTerminal <$> xs) <> [Terminal]
38 = Node { _node_count :: Int
40 , _node_children :: Map k (Trie k e)
42 -- | Leaf { _node_count :: Int }
45 -- emptyTrie :: Trie k e
47 emptyTrie :: (Ord k, Monoid e) => Trie k e
48 emptyTrie = Node 0 mempty mempty
50 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
52 {-| Map.null children = Leaf c
53 | otherwise -} = Node c mempty children
55 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
57 -- insertTrie (x:xs) (Leaf c) = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie)
58 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
60 f = Just . insertTrie xs . fromMaybe emptyTrie
62 insertTries :: Ord k => [[k]] -> Trie k ()
63 insertTries = L.foldr insertTrie emptyTrie
65 entropyTrie :: (k -> Bool) -> Trie k () -> Trie k Double
66 -- entropyTrie _ (Leaf c) = Leaf c
67 entropyTrie pred (Node c _e children) = Node c e (entropyTrie pred <$> children)
69 e = sum $ f <$> Map.toList children
70 f (k, child) = if pred k then cfc * log (fromIntegral c) else - cfc * log cfc
72 cfc = fromIntegral (_node_count child) / fromIntegral c
74 normalizeEntropy :: Trie k Double -> Trie k Double
75 -- normalizeEntropy (Leaf c) = Leaf c
76 normalizeEntropy (Node c e children) =
77 Node c e $ normalizeLevel m v . normalizeEntropy <$> children
79 es = _node_entropy <$> Map.elems children
83 normalizeLevel :: Double -> Double -> Trie k Double -> Trie k Double
84 -- normalizeLevel _ _ (Leaf c) = Leaf c
85 -- normalizeLevel m v (Node c e children) = Node c ((e - m) / v) children
86 normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) / v }
88 buildTrie :: [[Token]] -> Trie Token Double
89 buildTrie = normalizeEntropy . entropyTrie (== Terminal) . insertTries
91 subForest :: Trie k e -> [Trie k e]
92 -- subForest (Leaf _) = []
93 subForest (Node _ _ children) = Map.elems children
95 levels :: Trie k e -> [[Trie k e]]
96 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
98 entropyLevels :: Trie k e -> [[e]]
99 entropyLevels = fmap (fmap _node_entropy) . levels
101 normalizeEntropy' :: Trie k Double -> Trie k Double
102 normalizeEntropy' t = go (entropyLevels t) t
104 go :: [[Double]] -> Trie k Double -> Trie k Double
105 go [] _ = panic "normalizeEntropy' empty levels"
106 -- go _ (Leaf c) = Leaf c
107 go (es : ess) (Node c e children) =
108 Node c e (normalizeLevel m v . go ess <$> children)
113 buildTrie' :: [[Token]] -> Trie Token Double
114 buildTrie' = normalizeEntropy' . entropyTrie (== Terminal) . insertTries