2 Module : Gargantext.Graph.Distances.Matrix
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
14 Implementation use Accelerate library which enables GPU and CPU computation:
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
21 [Optimising Purely Functional GPU Programs][MCKL13].
22 In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
24 * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
25 [Embedding Foreign Code][CMCK14].
26 In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
28 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
29 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
30 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
34 {-# LANGUAGE TypeFamilies #-}
35 {-# LANGUAGE TypeOperators #-}
36 {-# LANGUAGE ScopedTypeVariables #-}
38 module Gargantext.Viz.Graph.Distances.Matrice
41 import Data.Array.Accelerate
42 import Data.Array.Accelerate.Interpreter (run)
44 import qualified Gargantext.Prelude as P
47 -----------------------------------------------------------------------
51 -- Vector (Z :. 3) [0,1,2]
52 vector :: Int -> (Array (Z :. Int) Int)
53 vector n = fromList (Z :. n) [0..n]
57 -- >>> matrix 3 ([1..] :: [Double])
58 -- Matrix (Z :. 3 :. 3)
62 matrix :: Elt c => Int -> [c] -> Matrix c
63 matrix n l = fromList (Z :. n :. n) l
65 -- | Two ways to get the rank (as documentation)
67 -- >>> rank (matrix 3 ([1..] :: [Int]))
69 rank :: (Matrix a) -> Int
70 rank m = arrayRank $ arrayShape m
72 -----------------------------------------------------------------------
73 -- | Dimension of a square Matrix
74 -- How to force use with SquareMatrix ?
77 -- | Get Dimension of a square Matrix
79 -- >>> dim (matrix 3 ([1..] :: [Int]))
81 dim :: Matrix a -> Dim
84 Z :. _ :. n = arrayShape m
85 -- indexTail (arrayShape m)
87 -----------------------------------------------------------------------
89 -- | Sum of a Matrix by Column
91 -- >>> run $ matSum 3 (use $ matrix 3 [1..])
92 -- Matrix (Z :. 3 :. 3)
93 -- [ 12.0, 15.0, 18.0,
96 matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
97 matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
100 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
101 -- if you need get the probability on the lines, just transpose it
103 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
104 -- Matrix (Z :. 3 :. 3)
105 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
106 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
107 -- 0.5833333333333334, 0.5333333333333333, 0.5]
108 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
109 matProba r mat = zipWith (/) mat (matSum r mat)
111 -- | Diagonal of the matrix
113 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
114 -- Vector (Z :. 3) [1,5,9]
115 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
116 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
118 -- | Divide by the Diagonal of the matrix
120 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
121 -- Matrix (Z :. 3 :. 3)
122 -- [ 1.0, 0.4, 0.3333333333333333,
123 -- 4.0, 1.0, 0.6666666666666666,
125 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
126 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
128 -----------------------------------------------------------------------
129 -- | Filters the matrix with the minimum of maximums
131 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
132 -- Matrix (Z :. 3 :. 3)
136 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
137 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
139 miniMax' = (the $ minimum $ maximum m)
141 -- | Filters the matrix with a constant
143 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
144 -- Matrix (Z :. 3 :. 3)
148 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
149 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
151 -----------------------------------------------------------------------
152 -- * Measures of proximity
153 -----------------------------------------------------------------------
154 -- ** Conditional distance
156 -- *** Conditional distance (basic)
158 -- | Conditional distance (basic version)
160 -- 2 main measures are actually implemented in order to compute the
161 -- proximity of two terms: conditional and distributional
163 -- Conditional measure is an absolute measure which reflects
164 -- interactions of 2 terms in the corpus.
165 measureConditional :: Matrix Int -> Matrix Double
166 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
167 measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
170 -- *** Conditional distance (advanced)
172 -- | Conditional distance (advanced version)
174 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
175 -- "confidence" , is the maximum probability between @i@ and @j@ to see
176 -- @i@ in the same context of @j@ knowing @j@.
178 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
179 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
181 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
182 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
183 conditional' m = ( run $ ie $ map fromIntegral $ use m
184 , run $ sg $ map fromIntegral $ use m
187 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
188 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
189 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
190 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
198 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
199 xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
200 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
201 ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
203 -----------------------------------------------------------------------
204 -- ** Distributional Distance
206 -- | Distributional Distance Measure
208 -- Distributional measure is a relative measure which depends on the
209 -- selected list, it represents structural equivalence.
211 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
212 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
213 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
215 -- Mutual information
216 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
218 -- Number of cooccurrences of @i@ and @j@ in the same context of text
221 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
222 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
224 -- Total cooccurrences of term @i@ given a map list of size @m@
225 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
227 -- Total cooccurrences of terms given a map list of size @m@
228 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
230 distributional :: Matrix Int -> Matrix Double
231 distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
234 -- filter m = zipWith (\a b -> max a b) m (transpose m)
236 ri mat = zipWith (/) mat1 mat2
238 mat1 = matSum n $ zipWith min (s_mi mat) (s_mi $ transpose mat)
241 s_mi m' = zipWith (\a b -> log (a/b)) m'
242 $ zipWith (/) (crossProduct m') (total m')
244 total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
249 crossProduct m''' = zipWith (*) (cross m''' ) (cross (transpose m'''))
250 cross mat = zipWith (-) (matSum n mat) (mat)
252 -----------------------------------------------------------------------
253 -----------------------------------------------------------------------
254 -- * Specificity and Genericity
256 {- | Metric Specificity and genericity: select terms
258 - let N termes and occurrences of i \[N{i}\]
260 - Cooccurrences of i and j \[N{ij}\]
261 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
263 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
264 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
266 - \[Inclusion (i) = Gen(i) + Spec(i)\)
267 - \[GenericityScore = Gen(i)- Spec(i)\]
269 - References: Science mapping with asymmetrical paradigmatic proximity
270 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
271 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
272 arXiv:0803.2315 [cs.OH]
274 type InclusionExclusion = Double
275 type SpecificityGenericity = Double
277 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
278 type SymetricMatrix = Matrix
279 type NonSymetricMatrix = Matrix
282 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
283 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
285 run' fun mat = run $ fun $ map fromIntegral $ use mat
287 -- | Inclusion (i) = Gen(i)+Spec(i)
288 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
289 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
291 -- | Genericity score = Gen(i)- Spec(i)
292 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
293 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
295 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
296 pV :: Acc (Matrix Double) -> Acc (Vector Double)
297 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
299 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
300 pH :: Acc (Matrix Double) -> Acc (Vector Double)
301 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
304 cardN = constant (P.fromIntegral (dim m) :: Double)
307 -- | P(i|j) = Nij /N(jj) Probability to get i given j
308 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
309 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
310 p_ij m = zipWith (/) m (n_jj m)
312 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
313 n_jj myMat' = backpermute (shape m)
314 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
319 -- | P(j|i) = Nij /N(ii) Probability to get i given j
321 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
322 p_ji = transpose . p_ij
325 -- | Step to ckeck the result in visual/qualitative tests
326 incExcSpeGen_proba :: Matrix Int -> Matrix Double
327 incExcSpeGen_proba m = run' pro m
329 run' fun mat = run $ fun $ map fromIntegral $ use mat
334 -- | Hypothesis to test maybe later (or not)
335 -- TODO ask accelerate for instances to ease such writtings:
336 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
337 p_ m = zipWith (/) m (n_ m)
339 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
340 n_ m = backpermute (shape m)
341 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
342 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
347 -- * For Tests (to be removed)
348 -- | Test perfermance with this matrix
349 -- TODO : add this in a benchmark folder
350 distriTest :: Matrix Double
351 distriTest = distributional $ matrix 100 [1..]
352 -----------------------------------------------------------------------