{-| Module : Gargantext.Viz.Phylo.Tools Description : Phylomemy Tools to build/manage it Copyright : (c) CNRS, 2017-Present License : AGPL + CECILL v3 Maintainer : team@gargantext.org Stability : experimental Portability : POSIX -} {-# LANGUAGE ViewPatterns #-} module Gargantext.Viz.Phylo.Metrics where import Gargantext.Prelude import Gargantext.Viz.Phylo import Gargantext.Viz.Phylo.Tools import Control.Lens hiding (Level) import Data.List ((\\), sortOn, concat, nub, take, union, intersect, null, (++), sort) import Data.Map (Map, (!), toList, size, insert, unionWith, intersection, intersectionWith, filterWithKey, elems, fromList, findWithDefault, fromListWith) import Data.Text (Text) -- import Debug.Trace (trace) ---------------- -- | Ngrams | -- ---------------- -- | Return the conditional probability of i knowing j conditional :: Ord a => Map (a,a) Double -> a -> a -> Double conditional m i j = (findWithDefault 0 (i,j) m) / (m ! (j,j)) -- | Return the genericity score of a given ngram genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double genericity m l i = ( (sum $ map (\j -> conditional m i j) l) - (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1) -- | Return the specificity score of a given ngram specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double specificity m l i = ( (sum $ map (\j -> conditional m j i) l) - (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1) -- | Return the inclusion score of a given ngram inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double inclusion m l i = ( (sum $ map (\j -> conditional m j i) l) + (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1) -- | Process some metrics on top of ngrams getNgramsMeta :: Map (Int, Int) Double -> [Int] -> Map Text [Double] getNgramsMeta m ngrams = fromList [ ("genericity" , map (\n -> genericity m (ngrams \\ [n]) n) ngrams ), ("specificity", map (\n -> specificity m (ngrams \\ [n]) n) ngrams ), ("inclusion" , map (\n -> inclusion m (ngrams \\ [n]) n) ngrams )] -- | To get the nth most occurent elems in a coocurency matrix getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int] getNthMostOcc nth cooc = (nub . concat) $ map (\((idx,idx'),_) -> [idx,idx']) $ take nth $ reverse $ sortOn snd $ toList cooc ------------------------- -- | Ngrams Dynamics | -- ------------------------- sharedWithParents :: Date -> PhyloBranchId -> Int -> PhyloView -> Bool sharedWithParents inf bid n pv = elem n $ foldl (\mem pn -> if ((bid == (fromJust $ (pn ^. pn_bid))) && (inf > (fst $ getNodePeriod pn))) then nub $ mem ++ (pn ^. pn_idx) else mem ) [] $ (pv ^. pv_nodes) findDynamics :: Int -> PhyloView -> PhyloNode -> Map Int (Date,Date) -> Double findDynamics n pv pn m = let prd = getNodePeriod pn bid = fromJust $ (pn ^. pn_bid) end = last' "dynamics" (sort $ map snd $ elems m) in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end)) -- emergence then 2 else if ((fst prd) == (fst $ m ! n)) -- recombination then 0 else if (not $ sharedWithParents (fst prd) bid n pv) -- decrease then 1 else 3 processDynamics :: PhyloView -> PhyloView processDynamics pv = alterPhyloNode (\pn -> pn & pn_metrics %~ insert "dynamics" (map (\n -> findDynamics n pv pn ngramsDates) $ (pn ^. pn_idx) ) ) pv where -------------------------------------- ngramsDates :: Map Int (Date,Date) ngramsDates = map (\ds -> let ds' = sort ds in (head' "Dynamics" ds', last' "Dynamics" ds')) $ fromListWith (++) $ foldl (\mem pn -> mem ++ (map (\n -> (n, [fst $ getNodePeriod pn, snd $ getNodePeriod pn])) $ (pn ^. pn_idx))) [] $ (pv ^. pv_nodes) -------------------------------------- ------------------- -- | Proximity | -- ------------------- -- | Process the inverse sumLog sumInvLog :: Double -> [Double] -> Double sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l -- | Process the sumLog sumLog :: Double -> [Double] -> Double sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l -- | To compute a jaccard similarity between two lists jaccard :: [Int] -> [Int] -> Double jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union') -- | To get the diagonal of a matrix toDiago :: Map (Int, Int) Double -> [Double] toDiago cooc = elems $ filterWithKey (\(x,x') _ -> x == x') cooc -- | To process WeighedLogJaccard distance between to coocurency matrix weightedLogJaccard :: Double -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams' | null gInter = 0 | gInter == gUnion = 1 | sens == 0 = jaccard gInter gUnion | sens > 0 = (sumInvLog sens wInter) / (sumInvLog sens wUnion) | otherwise = (sumLog sens wInter) / (sumLog sens wUnion) where -------------------------------------- gInter :: [Int] gInter = intersect ngrams ngrams' -------------------------------------- gUnion :: [Int] gUnion = union ngrams ngrams' -------------------------------------- wInter :: [Double] wInter = toDiago $ map (/nbDocs) $ intersectionWith (+) cooc cooc' -------------------------------------- wUnion :: [Double] wUnion = toDiago $ map (/nbDocs) $ unionWith (+) cooc cooc' -------------------------------------- -- | To process the Hamming distance between two PhyloGroup fields hamming :: Map (Int, Int) Double -> Map (Int, Int) Double -> Double hamming f1 f2 = fromIntegral $ max ((size inter) - (size f1)) ((size inter) - (size f2)) where -------------------------------------- inter :: Map (Int, Int) Double inter = intersection f1 f2 --------------------------------------