]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/Metrics.hs
[ngrams] add score update endpoint + sorting
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / Metrics.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE ViewPatterns #-}
12
13 module Gargantext.Core.Viz.Phylo.Metrics
14 where
15
16 import Gargantext.Prelude
17 import Gargantext.Core.Viz.Phylo
18 import Gargantext.Core.Viz.Phylo.Tools
19
20 import Control.Lens hiding (Level)
21
22 import Data.List ((\\), sortOn, concat, nub, union, intersect, null, (++), sort)
23 import Data.Map (Map, (!), toList, size, insert, unionWith, intersection, intersectionWith, filterWithKey, elems, fromList, findWithDefault, fromListWith)
24 import Data.Text (Text)
25
26 -- import Debug.Trace (trace)
27
28 ----------------
29 -- | Ngrams | --
30 ----------------
31
32
33 -- | Return the conditional probability of i knowing j
34 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
35 conditional m i j = (findWithDefault 0 (i,j) m)
36 / (m ! (j,j))
37
38
39 -- | Return the genericity score of a given ngram
40 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
41 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
42 - (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1)
43
44
45 -- | Return the specificity score of a given ngram
46 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
47 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
48 - (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
49
50
51 -- | Return the inclusion score of a given ngram
52 inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double
53 inclusion m l i = ( (sum $ map (\j -> conditional m j i) l)
54 + (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
55
56
57 -- | Process some metrics on top of ngrams
58 getNgramsMeta :: Map (Int, Int) Double -> [Int] -> Map Text [Double]
59 getNgramsMeta m ngrams = fromList
60 [ ("genericity" , map (\n -> genericity m (ngrams \\ [n]) n) ngrams ),
61 ("specificity", map (\n -> specificity m (ngrams \\ [n]) n) ngrams ),
62 ("inclusion" , map (\n -> inclusion m (ngrams \\ [n]) n) ngrams )]
63
64
65 -- | To get the nth most occurent elems in a coocurency matrix
66 getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
67 getNthMostOcc nth cooc = (nub . concat)
68 $ map (\((idx,idx'),_) -> [idx,idx'])
69 $ take nth
70 $ reverse
71 $ sortOn snd $ toList cooc
72
73
74 -------------------------
75 -- | Ngrams Dynamics | --
76 -------------------------
77
78 sharedWithParents :: Date -> PhyloBranchId -> Int -> PhyloView -> Bool
79 sharedWithParents inf bid n pv = elem n
80 $ foldl (\mem pn -> if ((bid == (fromJust $ (pn ^. pn_bid)))
81 && (inf > (fst $ getNodePeriod pn)))
82 then nub $ mem ++ (pn ^. pn_idx)
83 else mem ) []
84 $ (pv ^. pv_nodes)
85
86
87 findDynamics :: Int -> PhyloView -> PhyloNode -> Map Int (Date,Date) -> Double
88 findDynamics n pv pn m =
89 let prd = getNodePeriod pn
90 bid = fromJust $ (pn ^. pn_bid)
91 end = last' "dynamics" (sort $ map snd $ elems m)
92 in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
93 -- emergence
94 then 2
95 else if ((fst prd) == (fst $ m ! n))
96 -- recombination
97 then 0
98 else if (not $ sharedWithParents (fst prd) bid n pv)
99 -- decrease
100 then 1
101 else 3
102
103
104
105 processDynamics :: PhyloView -> PhyloView
106 processDynamics pv = alterPhyloNode (\pn ->
107 pn & pn_metrics %~ insert "dynamics" (map (\n -> findDynamics n pv pn ngramsDates) $ (pn ^. pn_idx) ) ) pv
108 where
109 --------------------------------------
110 ngramsDates :: Map Int (Date,Date)
111 ngramsDates = map (\ds -> let ds' = sort ds
112 in (head' "Dynamics" ds', last' "Dynamics" ds'))
113 $ fromListWith (++)
114 $ foldl (\mem pn -> mem ++ (map (\n -> (n, [fst $ getNodePeriod pn, snd $ getNodePeriod pn]))
115 $ (pn ^. pn_idx))) []
116 $ (pv ^. pv_nodes)
117 --------------------------------------
118
119
120
121 -------------------
122 -- | Proximity | --
123 -------------------
124
125
126 -- | Process the inverse sumLog
127 sumInvLog :: Double -> [Double] -> Double
128 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
129
130
131 -- | Process the sumLog
132 sumLog :: Double -> [Double] -> Double
133 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
134
135
136 -- | To compute a jaccard similarity between two lists
137 jaccard :: [Int] -> [Int] -> Double
138 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
139
140
141 -- | To get the diagonal of a matrix
142 toDiago :: Map (Int, Int) Double -> [Double]
143 toDiago cooc = elems $ filterWithKey (\(x,x') _ -> x == x') cooc
144
145
146 -- | To process WeighedLogJaccard distance between to coocurency matrix
147 weightedLogJaccard :: Double -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
148 weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
149 | null gInter = 0
150 | gInter == gUnion = 1
151 | sens == 0 = jaccard gInter gUnion
152 | sens > 0 = (sumInvLog sens wInter) / (sumInvLog sens wUnion)
153 | otherwise = (sumLog sens wInter) / (sumLog sens wUnion)
154 where
155 --------------------------------------
156 gInter :: [Int]
157 gInter = intersect ngrams ngrams'
158 --------------------------------------
159 gUnion :: [Int]
160 gUnion = union ngrams ngrams'
161 --------------------------------------
162 wInter :: [Double]
163 wInter = toDiago $ map (/nbDocs) $ intersectionWith (+) cooc cooc'
164 --------------------------------------
165 wUnion :: [Double]
166 wUnion = toDiago $ map (/nbDocs) $ unionWith (+) cooc cooc'
167 --------------------------------------
168
169
170 -- | To process the Hamming distance between two PhyloGroup fields
171 hamming :: Map (Int, Int) Double -> Map (Int, Int) Double -> Double
172 hamming f1 f2 = fromIntegral $ max ((size inter) - (size f1)) ((size inter) - (size f2))
173 where
174 --------------------------------------
175 inter :: Map (Int, Int) Double
176 inter = intersection f1 f2
177 --------------------------------------