2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.Metrics
20 import Gargantext.Prelude
21 import Gargantext.Viz.Phylo
22 import Gargantext.Viz.Phylo.Tools
24 import Control.Lens hiding (Level)
26 import Data.List ((\\), sortOn, concat, nub, take, union, intersect, null, (++), sort)
27 import Data.Map (Map, (!), foldlWithKey, toList, size, insert, unionWith, intersection, intersectionWith, filterWithKey, elems, fromList, findWithDefault, fromListWith)
28 import Data.Text (Text)
30 -- import Debug.Trace (trace)
37 -- | Return the conditional probability of i knowing j
38 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
39 conditional m i j = (findWithDefault 0 (i,j) m)
40 / foldlWithKey (\s (x,_) v -> if x == j
45 -- | Return the genericity score of a given ngram
46 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
47 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
48 - (sum $ map (\j -> conditional m j i) l)) / 2
51 -- | Return the specificity score of a given ngram
52 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
53 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
54 - (sum $ map (\j -> conditional m i j) l)) / 2
57 -- | Return the coverage score of a given ngram
58 coverage :: Map (Int, Int) Double -> [Int] -> Int -> Double
59 coverage m l i = ( (sum $ map (\j -> conditional m j i) l)
60 + (sum $ map (\j -> conditional m i j) l)) / 2
63 -- | Process some metrics on top of ngrams
64 getNgramsMeta :: Map (Int, Int) Double -> [Int] -> Map Text [Double]
65 getNgramsMeta m ngrams = fromList
66 [ ("genericity" , map (\n -> genericity m (ngrams \\ [n]) n) ngrams ),
67 ("specificity", map (\n -> specificity m (ngrams \\ [n]) n) ngrams ),
68 ("coverage" , map (\n -> coverage m (ngrams \\ [n]) n) ngrams )]
71 -- | To get the nth most occurent elems in a coocurency matrix
72 getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
73 getNthMostOcc nth cooc = (nub . concat)
74 $ map (\((idx,idx'),_) -> [idx,idx'])
77 $ sortOn snd $ toList cooc
80 -------------------------
81 -- | Ngrams Dynamics | --
82 -------------------------
84 sharedWithParents :: Date -> PhyloBranchId -> Int -> PhyloView -> Bool
85 sharedWithParents inf bid n pv = elem n
86 $ foldl (\mem pn -> if ((bid == (fromJust $ (pn ^. pn_bid)))
87 && (inf > (fst $ getNodePeriod pn)))
88 then nub $ mem ++ (pn ^. pn_idx)
93 findDynamics :: Int -> PhyloView -> PhyloNode -> Map Int (Date,Date) -> Double
94 findDynamics n pv pn m =
95 let prd = getNodePeriod pn
96 bid = fromJust $ (pn ^. pn_bid)
97 end = last' "dynamics" (sort $ map snd $ elems m)
98 in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
101 else if ((fst prd) == (fst $ m ! n))
104 else if (not $ sharedWithParents (fst prd) bid n pv)
111 processDynamics :: PhyloView -> PhyloView
112 processDynamics pv = alterPhyloNode (\pn ->
113 pn & pn_metrics %~ insert "dynamics" (map (\n -> findDynamics n pv pn ngramsDates) $ (pn ^. pn_idx) ) ) pv
115 --------------------------------------
116 ngramsDates :: Map Int (Date,Date)
117 ngramsDates = map (\ds -> let ds' = sort ds
118 in (head' "Dynamics" ds', last' "Dynamics" ds'))
120 $ foldl (\mem pn -> mem ++ (map (\n -> (n, [fst $ getNodePeriod pn, snd $ getNodePeriod pn]))
121 $ (pn ^. pn_idx))) []
123 --------------------------------------
132 -- | Process the inverse sumLog
133 sumInvLog :: Double -> [Double] -> Double
134 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
137 -- | Process the sumLog
138 sumLog :: Double -> [Double] -> Double
139 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
142 -- | To compute a jaccard similarity between two lists
143 jaccard :: [Int] -> [Int] -> Double
144 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
147 -- | To get the diagonal of a matrix
148 toDiago :: Map (Int, Int) Double -> [Double]
149 toDiago cooc = elems $ filterWithKey (\(x,x') _ -> x == x') cooc
152 -- | To process WeighedLogJaccard distance between to coocurency matrix
153 weightedLogJaccard :: Double -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
154 weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
156 | gInter == gUnion = 1
157 | sens == 0 = jaccard gInter gUnion
158 | sens > 0 = (sumInvLog sens wInter) / (sumInvLog sens wUnion)
159 | otherwise = (sumLog sens wInter) / (sumLog sens wUnion)
161 --------------------------------------
163 gInter = intersect ngrams ngrams'
164 --------------------------------------
166 gUnion = union ngrams ngrams'
167 --------------------------------------
169 wInter = toDiago $ map (/nbDocs) $ intersectionWith (+) cooc cooc'
170 --------------------------------------
172 wUnion = toDiago $ map (/nbDocs) $ unionWith (+) cooc cooc'
173 --------------------------------------
176 -- | To process the Hamming distance between two PhyloGroup fields
177 hamming :: Map (Int, Int) Double -> Map (Int, Int) Double -> Double
178 hamming f1 f2 = fromIntegral $ max ((size inter) - (size f1)) ((size inter) - (size f2))
180 --------------------------------------
181 inter :: Map (Int, Int) Double
182 inter = intersection f1 f2
183 --------------------------------------