1 {-# LANGUAGE NoImplicitPrelude #-}
2 {-# LANGUAGE OverloadedStrings #-}
5 Implementation of EleVe Python version of papers:
9 module Gargantext.Text.Eleve where
11 import Debug.Trace (trace)
12 import Debug.SimpleReflect
14 import Control.Monad (foldM)
16 import qualified Data.List as L
18 import Data.Text (Text)
19 import qualified Data.Text as T
21 import Data.Maybe (fromMaybe)
22 import qualified Data.Map as Map
23 import Gargantext.Prelude
24 import qualified Data.Tree as Tree
25 import Data.Tree (Tree)
26 import qualified Prelude as P (putStrLn, logBase, String)
28 -- prop (Node c _e f) = c == Map.size f
29 -- TODO maybe add Leaf
30 -- NP: I think Leaf is an optimisation (less data, a tiny bit more code and time)
35 ex = toToken n example
36 t = buildTrie $ chunkAlong n 1 ex
38 P.putStrLn $ Tree.drawTree
40 $ toTree (NonTerminal "") t
42 pure $ map unToken $ split t t [] ex
45 example' = T.words "New York and New York"
46 example'' = map (T.pack . pure) ("abcdefabcdegabcde" :: P.String)
49 data Token = NonTerminal Text
51 deriving (Ord, Eq, Show)
53 toToken :: Int -> [Text] -> [Token]
54 toToken n xs = (NonTerminal <$> xs) <> L.take n (repeat Terminal)
56 unToken :: [Token] -> [Text]
64 = Node { _node_count :: Int
66 , _node_children :: Map k (Trie k e)
68 | Leaf { _node_count :: Int }
71 toTree :: k -> Trie k e -> Tree (k,Int,e)
72 toTree k (Node c e cs) = Tree.Node (k, c, e) (map (uncurry toTree) $ Map.toList cs)
74 -- emptyTrie :: Trie k e
75 emptyTrie :: (Ord k, Monoid e) => Int -> Trie k e
76 --emptyTrie n = Node n mempty mempty
79 mkTrie :: Monoid e => Int -> Map k (Trie k e) -> Trie k e
81 {-| Map.null children = Leaf c
82 | otherwise -} = Node c mempty children
84 insertTrie :: Ord k => [k] -> Trie k () -> Trie k ()
85 insertTrie [] n = n { _node_count = _node_count n +1}
86 -- insertTrie (x:xs) (Leaf c) = mkTrie (c+1) (Map.singleton x $ insertTrie xs emptyTrie)
87 insertTrie (x:xs) (Node c _e children) = mkTrie (c+1) $ Map.alter f x children
89 f = Just . insertTrie xs . fromMaybe (emptyTrie 0)
91 insertTries :: Ord k => [[k]] -> Trie k ()
92 insertTries = L.foldr insertTrie (emptyTrie 1)
94 entropyTrie :: (Num e, Floating e) => (k -> Bool) -> Trie k () -> Trie k e
95 -- entropyTrie _ (Leaf c) = Leaf c
96 entropyTrie pred (Node c _e children) = Node c e (map (entropyTrie pred) children)
98 e = sum $ map f $ Map.toList children
99 f (k, child) = if pred k then cfc * P.logBase 2 (fromIntegral c)
100 else - cfc * P.logBase 2 cfc
102 cfc = fromIntegral (_node_count child) / fromIntegral c
104 normalizeEntropy :: (Fractional e, Floating e, Show e) => Trie k e -> Trie k e
105 -- normalizeEntropy (Leaf c) = Leaf c
106 normalizeEntropy (Node c e children) =
107 trace (show $ L.length es) $ Node c e $ map (normalizeLevel m v . normalizeEntropy) children
109 es = map _node_entropy $ Map.elems children
113 normalizeLevel :: (Fractional e, Floating e, Show e) => e -> e -> Trie k e -> Trie k e
114 -- normalizeLevel _ _ (Leaf c) = Leaf c
115 --normalizeLevel m v n = n { _node_entropy = (_node_entropy n - m) }
116 normalizeLevel m v n = trace (show (_node_entropy n,m,v)) $ n { _node_entropy = (_node_entropy n - m) / v}
118 buildTrie :: (Floating e, Show e) => [[Token]] -> Trie Token e
119 buildTrie = normalizeEntropy . entropyTrie (== Terminal) . insertTries
121 subForest :: Trie k e -> [Trie k e]
122 -- subForest (Leaf _) = []
123 subForest (Node _ _ children) = Map.elems children
125 levels :: Trie k e -> [[Trie k e]]
126 levels = L.takeWhile (not . L.null) . L.iterate (L.concatMap subForest) . pure
128 entropyLevels :: Trie k e -> [[e]]
129 entropyLevels = fmap (fmap _node_entropy) . levels
131 normalizeEntropy' :: (Floating e, Show e) => Trie k e -> Trie k e
132 normalizeEntropy' t = go (entropyLevels t) t
134 go :: (Floating e, Show e) => [[e]] -> Trie k e -> Trie k e
135 go [] _ = panic "normalizeEntropy' empty levels"
136 -- go _ (Leaf c) = Leaf c
137 go (es : ess) (Node c e children) =
138 Node c e (normalizeLevel m v . go ess <$> children)
143 buildTrie' :: (Floating e, Show e) => [[Token]] -> Trie Token e
144 buildTrie' = normalizeEntropy' . entropyTrie (== Terminal) . insertTries
146 ------------------------------------------------------------------------
148 autonomie :: Trie Token e -> Token -> e
149 autonomie trie t = case (Map.lookup t (_node_children trie)) of
150 Nothing -> panic $ "Gargantext.Text.Ngrams: autonomie" <> (cs $ show t)
151 Just a -> _node_entropy a
153 ------------------------------------------------------------------------
155 split :: (Num e, Ord e) => Trie Token e -> Trie Token e -> [Token] -> [Token] -> [[Token]]
156 split _ _ pref [] = [reverse pref]
157 split t0 t pref (x:xs) = case Map.lookup x $ _node_children t of
158 Nothing -> reverse pref : split t0 t0 [x] xs
159 Just a -> case Map.lookup x $ _node_children t0 of
160 Nothing -> panic "TODO" -- reverse pref : split t0 t0 [] xs
161 Just xt0 -> case _node_entropy t + _node_entropy xt0 > _node_entropy a of
162 True -> split t0 a (x:pref) xs
163 False -> reverse pref : split t0 xt0 [x] xs