2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.Metrics
20 import Gargantext.Prelude
21 import Gargantext.Viz.Phylo
22 import Gargantext.Viz.Phylo.Tools
24 import Control.Lens hiding (Level)
26 import Data.List ((\\), sortOn, concat, nub, take, union, intersect, null, (++), sort)
27 import Data.Map (Map, (!), toList, size, insert, unionWith, intersection, intersectionWith, filterWithKey, elems, fromList, findWithDefault, fromListWith)
28 import Data.Text (Text)
30 -- import Debug.Trace (trace)
37 -- | Return the conditional probability of i knowing j
38 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
39 conditional m i j = (findWithDefault 0 (i,j) m)
43 -- | Return the genericity score of a given ngram
44 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
45 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
46 - (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1)
49 -- | Return the specificity score of a given ngram
50 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
51 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
52 - (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
55 -- | Return the inclusion score of a given ngram
56 inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double
57 inclusion m l i = ( (sum $ map (\j -> conditional m j i) l)
58 + (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
61 -- | Process some metrics on top of ngrams
62 getNgramsMeta :: Map (Int, Int) Double -> [Int] -> Map Text [Double]
63 getNgramsMeta m ngrams = fromList
64 [ ("genericity" , map (\n -> genericity m (ngrams \\ [n]) n) ngrams ),
65 ("specificity", map (\n -> specificity m (ngrams \\ [n]) n) ngrams ),
66 ("inclusion" , map (\n -> inclusion m (ngrams \\ [n]) n) ngrams )]
69 -- | To get the nth most occurent elems in a coocurency matrix
70 getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
71 getNthMostOcc nth cooc = (nub . concat)
72 $ map (\((idx,idx'),_) -> [idx,idx'])
75 $ sortOn snd $ toList cooc
78 -------------------------
79 -- | Ngrams Dynamics | --
80 -------------------------
82 sharedWithParents :: Date -> PhyloBranchId -> Int -> PhyloView -> Bool
83 sharedWithParents inf bid n pv = elem n
84 $ foldl (\mem pn -> if ((bid == (fromJust $ (pn ^. pn_bid)))
85 && (inf > (fst $ getNodePeriod pn)))
86 then nub $ mem ++ (pn ^. pn_idx)
91 findDynamics :: Int -> PhyloView -> PhyloNode -> Map Int (Date,Date) -> Double
92 findDynamics n pv pn m =
93 let prd = getNodePeriod pn
94 bid = fromJust $ (pn ^. pn_bid)
95 end = last' "dynamics" (sort $ map snd $ elems m)
96 in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
99 else if ((fst prd) == (fst $ m ! n))
102 else if (not $ sharedWithParents (fst prd) bid n pv)
109 processDynamics :: PhyloView -> PhyloView
110 processDynamics pv = alterPhyloNode (\pn ->
111 pn & pn_metrics %~ insert "dynamics" (map (\n -> findDynamics n pv pn ngramsDates) $ (pn ^. pn_idx) ) ) pv
113 --------------------------------------
114 ngramsDates :: Map Int (Date,Date)
115 ngramsDates = map (\ds -> let ds' = sort ds
116 in (head' "Dynamics" ds', last' "Dynamics" ds'))
118 $ foldl (\mem pn -> mem ++ (map (\n -> (n, [fst $ getNodePeriod pn, snd $ getNodePeriod pn]))
119 $ (pn ^. pn_idx))) []
121 --------------------------------------
130 -- | Process the inverse sumLog
131 sumInvLog :: Double -> [Double] -> Double
132 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
135 -- | Process the sumLog
136 sumLog :: Double -> [Double] -> Double
137 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
140 -- | To compute a jaccard similarity between two lists
141 jaccard :: [Int] -> [Int] -> Double
142 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
145 -- | To get the diagonal of a matrix
146 toDiago :: Map (Int, Int) Double -> [Double]
147 toDiago cooc = elems $ filterWithKey (\(x,x') _ -> x == x') cooc
150 -- | To process WeighedLogJaccard distance between to coocurency matrix
151 weightedLogJaccard :: Double -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
152 weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
154 | gInter == gUnion = 1
155 | sens == 0 = jaccard gInter gUnion
156 | sens > 0 = (sumInvLog sens wInter) / (sumInvLog sens wUnion)
157 | otherwise = (sumLog sens wInter) / (sumLog sens wUnion)
159 --------------------------------------
161 gInter = intersect ngrams ngrams'
162 --------------------------------------
164 gUnion = union ngrams ngrams'
165 --------------------------------------
167 wInter = toDiago $ map (/nbDocs) $ intersectionWith (+) cooc cooc'
168 --------------------------------------
170 wUnion = toDiago $ map (/nbDocs) $ unionWith (+) cooc cooc'
171 --------------------------------------
174 -- | To process the Hamming distance between two PhyloGroup fields
175 hamming :: Map (Int, Int) Double -> Map (Int, Int) Double -> Double
176 hamming f1 f2 = fromIntegral $ max ((size inter) - (size f1)) ((size inter) - (size f2))
178 --------------------------------------
179 inter :: Map (Int, Int) Double
180 inter = intersection f1 f2
181 --------------------------------------