]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/Metrics.hs
Merge branch 'dev-phylo' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantex...
[gargantext.git] / src / Gargantext / Viz / Phylo / Metrics.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.Metrics
18 where
19
20 import Gargantext.Prelude
21 import Gargantext.Viz.Phylo
22 import Gargantext.Viz.Phylo.Tools
23
24 import Control.Lens hiding (Level)
25
26 import Data.List ((\\), sortOn, concat, nub, take, union, intersect, null, (++), sort)
27 import Data.Map (Map, (!), foldlWithKey, toList, size, insert, unionWith, intersection, intersectionWith, filterWithKey, elems, fromList, findWithDefault, fromListWith)
28 import Data.Text (Text)
29
30 -- import Debug.Trace (trace)
31
32 ----------------
33 -- | Ngrams | --
34 ----------------
35
36
37 -- | Return the conditional probability of i knowing j
38 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
39 conditional m i j = (findWithDefault 0 (i,j) m)
40 / foldlWithKey (\s (x,_) v -> if x == j
41 then s + v
42 else s ) 0 m
43
44
45 -- | Return the genericity score of a given ngram
46 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
47 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
48 - (sum $ map (\j -> conditional m j i) l)) / 2
49
50
51 -- | Return the specificity score of a given ngram
52 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
53 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
54 - (sum $ map (\j -> conditional m i j) l)) / 2
55
56
57 -- | Return the coverage score of a given ngram
58 coverage :: Map (Int, Int) Double -> [Int] -> Int -> Double
59 coverage m l i = ( (sum $ map (\j -> conditional m j i) l)
60 + (sum $ map (\j -> conditional m i j) l)) / 2
61
62
63 -- | Process some metrics on top of ngrams
64 getNgramsMeta :: Map (Int, Int) Double -> [Int] -> Map Text [Double]
65 getNgramsMeta m ngrams = fromList
66 [ ("genericity" , map (\n -> genericity m (ngrams \\ [n]) n) ngrams ),
67 ("specificity", map (\n -> specificity m (ngrams \\ [n]) n) ngrams ),
68 ("coverage" , map (\n -> coverage m (ngrams \\ [n]) n) ngrams )]
69
70
71 -- | To get the nth most occurent elems in a coocurency matrix
72 getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
73 getNthMostOcc nth cooc = (nub . concat)
74 $ map (\((idx,idx'),_) -> [idx,idx'])
75 $ take nth
76 $ reverse
77 $ sortOn snd $ toList cooc
78
79
80 -------------------------
81 -- | Ngrams Dynamics | --
82 -------------------------
83
84 sharedWithParents :: Date -> PhyloBranchId -> Int -> PhyloView -> Bool
85 sharedWithParents inf bid n pv = elem n
86 $ foldl (\mem pn -> if ((bid == (fromJust $ (pn ^. pn_bid)))
87 && (inf > (fst $ getNodePeriod pn)))
88 then nub $ mem ++ (pn ^. pn_idx)
89 else mem ) []
90 $ (pv ^. pv_nodes)
91
92
93 findDynamics :: Int -> PhyloView -> PhyloNode -> Map Int (Date,Date) -> Double
94 findDynamics n pv pn m =
95 let prd = getNodePeriod pn
96 bid = fromJust $ (pn ^. pn_bid)
97 end = last' "dynamics" (sort $ map snd $ elems m)
98 in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
99 -- | decrease
100 then 0
101 else if ((fst prd) == (fst $ m ! n))
102 -- | emergence
103 then 1
104 else if (not $ sharedWithParents (fst prd) bid n pv)
105 -- | recombination
106 then 2
107 else 3
108
109
110
111 processDynamics :: PhyloView -> PhyloView
112 processDynamics pv = alterPhyloNode (\pn ->
113 pn & pn_metrics %~ insert "dynamics" (map (\n -> findDynamics n pv pn ngramsDates) $ (pn ^. pn_idx) ) ) pv
114 where
115 --------------------------------------
116 ngramsDates :: Map Int (Date,Date)
117 ngramsDates = map (\ds -> let ds' = sort ds
118 in (head' "Dynamics" ds', last' "Dynamics" ds'))
119 $ fromListWith (++)
120 $ foldl (\mem pn -> mem ++ (map (\n -> (n, [fst $ getNodePeriod pn, snd $ getNodePeriod pn]))
121 $ (pn ^. pn_idx))) []
122 $ (pv ^. pv_nodes)
123 --------------------------------------
124
125
126
127 -------------------
128 -- | Proximity | --
129 -------------------
130
131
132 -- | Process the inverse sumLog
133 sumInvLog :: Double -> [Double] -> Double
134 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
135
136
137 -- | Process the sumLog
138 sumLog :: Double -> [Double] -> Double
139 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
140
141
142 -- | To compute a jaccard similarity between two lists
143 jaccard :: [Int] -> [Int] -> Double
144 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
145
146
147 -- | To get the diagonal of a matrix
148 toDiago :: Map (Int, Int) Double -> [Double]
149 toDiago cooc = elems $ filterWithKey (\(x,x') _ -> x == x') cooc
150
151
152 -- | To process WeighedLogJaccard distance between to coocurency matrix
153 weightedLogJaccard :: Double -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
154 weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
155 | null gInter = 0
156 | gInter == gUnion = 1
157 | sens == 0 = jaccard gInter gUnion
158 | sens > 0 = (sumInvLog sens wInter) / (sumInvLog sens wUnion)
159 | otherwise = (sumLog sens wInter) / (sumLog sens wUnion)
160 where
161 --------------------------------------
162 gInter :: [Int]
163 gInter = intersect ngrams ngrams'
164 --------------------------------------
165 gUnion :: [Int]
166 gUnion = union ngrams ngrams'
167 --------------------------------------
168 wInter :: [Double]
169 wInter = toDiago $ map (/nbDocs) $ intersectionWith (+) cooc cooc'
170 --------------------------------------
171 wUnion :: [Double]
172 wUnion = toDiago $ map (/nbDocs) $ unionWith (+) cooc cooc'
173 --------------------------------------
174
175
176 -- | To process the Hamming distance between two PhyloGroup fields
177 hamming :: Map (Int, Int) Double -> Map (Int, Int) Double -> Double
178 hamming f1 f2 = fromIntegral $ max ((size inter) - (size f1)) ((size inter) - (size f2))
179 where
180 --------------------------------------
181 inter :: Map (Int, Int) Double
182 inter = intersection f1 f2
183 --------------------------------------
184
185
186
187
188
189