2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Viz.Phylo.Metrics
16 import Gargantext.Prelude
17 import Gargantext.Viz.Phylo
18 import Gargantext.Viz.Phylo.Tools
20 import Control.Lens hiding (Level)
22 import Data.List ((\\), sortOn, concat, nub, take, union, intersect, null, (++), sort)
23 import Data.Map (Map, (!), toList, size, insert, unionWith, intersection, intersectionWith, filterWithKey, elems, fromList, findWithDefault, fromListWith)
24 import Data.Text (Text)
26 -- import Debug.Trace (trace)
33 -- | Return the conditional probability of i knowing j
34 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
35 conditional m i j = (findWithDefault 0 (i,j) m)
39 -- | Return the genericity score of a given ngram
40 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
41 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
42 - (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1)
45 -- | Return the specificity score of a given ngram
46 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
47 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
48 - (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
51 -- | Return the inclusion score of a given ngram
52 inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double
53 inclusion m l i = ( (sum $ map (\j -> conditional m j i) l)
54 + (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
57 -- | Process some metrics on top of ngrams
58 getNgramsMeta :: Map (Int, Int) Double -> [Int] -> Map Text [Double]
59 getNgramsMeta m ngrams = fromList
60 [ ("genericity" , map (\n -> genericity m (ngrams \\ [n]) n) ngrams ),
61 ("specificity", map (\n -> specificity m (ngrams \\ [n]) n) ngrams ),
62 ("inclusion" , map (\n -> inclusion m (ngrams \\ [n]) n) ngrams )]
65 -- | To get the nth most occurent elems in a coocurency matrix
66 getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
67 getNthMostOcc nth cooc = (nub . concat)
68 $ map (\((idx,idx'),_) -> [idx,idx'])
71 $ sortOn snd $ toList cooc
74 -------------------------
75 -- | Ngrams Dynamics | --
76 -------------------------
78 sharedWithParents :: Date -> PhyloBranchId -> Int -> PhyloView -> Bool
79 sharedWithParents inf bid n pv = elem n
80 $ foldl (\mem pn -> if ((bid == (fromJust $ (pn ^. pn_bid)))
81 && (inf > (fst $ getNodePeriod pn)))
82 then nub $ mem ++ (pn ^. pn_idx)
87 findDynamics :: Int -> PhyloView -> PhyloNode -> Map Int (Date,Date) -> Double
88 findDynamics n pv pn m =
89 let prd = getNodePeriod pn
90 bid = fromJust $ (pn ^. pn_bid)
91 end = last' "dynamics" (sort $ map snd $ elems m)
92 in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
95 else if ((fst prd) == (fst $ m ! n))
98 else if (not $ sharedWithParents (fst prd) bid n pv)
105 processDynamics :: PhyloView -> PhyloView
106 processDynamics pv = alterPhyloNode (\pn ->
107 pn & pn_metrics %~ insert "dynamics" (map (\n -> findDynamics n pv pn ngramsDates) $ (pn ^. pn_idx) ) ) pv
109 --------------------------------------
110 ngramsDates :: Map Int (Date,Date)
111 ngramsDates = map (\ds -> let ds' = sort ds
112 in (head' "Dynamics" ds', last' "Dynamics" ds'))
114 $ foldl (\mem pn -> mem ++ (map (\n -> (n, [fst $ getNodePeriod pn, snd $ getNodePeriod pn]))
115 $ (pn ^. pn_idx))) []
117 --------------------------------------
126 -- | Process the inverse sumLog
127 sumInvLog :: Double -> [Double] -> Double
128 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
131 -- | Process the sumLog
132 sumLog :: Double -> [Double] -> Double
133 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
136 -- | To compute a jaccard similarity between two lists
137 jaccard :: [Int] -> [Int] -> Double
138 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
141 -- | To get the diagonal of a matrix
142 toDiago :: Map (Int, Int) Double -> [Double]
143 toDiago cooc = elems $ filterWithKey (\(x,x') _ -> x == x') cooc
146 -- | To process WeighedLogJaccard distance between to coocurency matrix
147 weightedLogJaccard :: Double -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
148 weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
150 | gInter == gUnion = 1
151 | sens == 0 = jaccard gInter gUnion
152 | sens > 0 = (sumInvLog sens wInter) / (sumInvLog sens wUnion)
153 | otherwise = (sumLog sens wInter) / (sumLog sens wUnion)
155 --------------------------------------
157 gInter = intersect ngrams ngrams'
158 --------------------------------------
160 gUnion = union ngrams ngrams'
161 --------------------------------------
163 wInter = toDiago $ map (/nbDocs) $ intersectionWith (+) cooc cooc'
164 --------------------------------------
166 wUnion = toDiago $ map (/nbDocs) $ unionWith (+) cooc cooc'
167 --------------------------------------
170 -- | To process the Hamming distance between two PhyloGroup fields
171 hamming :: Map (Int, Int) Double -> Map (Int, Int) Double -> Double
172 hamming f1 f2 = fromIntegral $ max ((size inter) - (size f1)) ((size inter) - (size f2))
174 --------------------------------------
175 inter :: Map (Int, Int) Double
176 inter = intersection f1 f2
177 --------------------------------------