2 Module : Gargantext.Graph.Distances.Matrix
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
14 Implementation use Accelerate library which enables GPU and CPU computation:
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
21 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
22 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
26 {-# LANGUAGE TypeFamilies #-}
27 {-# LANGUAGE TypeOperators #-}
28 {-# LANGUAGE ScopedTypeVariables #-}
29 {-# LANGUAGE ViewPatterns #-}
31 module Gargantext.Viz.Graph.Distances.Matrice
34 import Debug.Trace (trace)
35 import Data.Array.Accelerate
36 import Data.Array.Accelerate.Interpreter (run)
37 import qualified Gargantext.Prelude as P
40 -----------------------------------------------------------------------
44 -- Vector (Z :. 3) [0,1,2]
45 vector :: Elt c => Int -> [c] -> (Array (Z :. Int) c)
46 vector n l = fromList (Z :. n) l
50 -- >>> matrix 3 ([1..] :: [Double])
51 -- Matrix (Z :. 3 :. 3)
55 matrix :: Elt c => Int -> [c] -> Matrix c
56 matrix n l = fromList (Z :. n :. n) l
58 -- | Two ways to get the rank (as documentation)
60 -- >>> rank (matrix 3 ([1..] :: [Int]))
62 rank :: (Matrix a) -> Int
63 rank m = arrayRank $ arrayShape m
65 -----------------------------------------------------------------------
66 -- | Dimension of a square Matrix
67 -- How to force use with SquareMatrix ?
70 -- | Get Dimension of a square Matrix
72 -- >>> dim (matrix 3 ([1..] :: [Int]))
74 dim :: Matrix a -> Dim
77 Z :. _ :. n = arrayShape m
78 -- indexTail (arrayShape m)
80 -----------------------------------------------------------------------
82 runExp :: Elt e => Exp e -> e
83 runExp e = indexArray (run (unit e)) Z
84 -----------------------------------------------------------------------
86 -- | Sum of a Matrix by Column
88 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
89 -- Matrix (Z :. 3 :. 3)
90 -- [ 12.0, 15.0, 18.0,
93 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
94 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
96 matSumCol' :: Matrix Double -> Matrix Double
97 matSumCol' m = run $ matSumCol n m'
103 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
104 -- if you need get the probability on the lines, just transpose it
106 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
107 -- Matrix (Z :. 3 :. 3)
108 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
109 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
110 -- 0.5833333333333334, 0.5333333333333333, 0.5]
111 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
112 matProba r mat = zipWith (/) mat (matSumCol r mat)
114 -- | Diagonal of the matrix
116 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
117 -- Vector (Z :. 3) [1,5,9]
118 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
119 diag m = backpermute (indexTail (shape m))
120 (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
123 -- | Divide by the Diagonal of the matrix
125 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
126 -- Matrix (Z :. 3 :. 3)
127 -- [ 1.0, 0.4, 0.3333333333333333,
128 -- 4.0, 1.0, 0.6666666666666666,
130 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
131 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
133 -----------------------------------------------------------------------
134 -- | Filters the matrix with the minimum of maximums
136 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
137 -- Matrix (Z :. 3 :. 3)
141 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
142 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
144 miniMax' = (the $ minimum $ maximum m)
146 -- | Filters the matrix with a constant
148 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
149 -- Matrix (Z :. 3 :. 3)
153 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
154 filter' t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
156 -----------------------------------------------------------------------
157 -- * Measures of proximity
158 -----------------------------------------------------------------------
159 -- ** Conditional distance
161 -- *** Conditional distance (basic)
163 -- | Conditional distance (basic version)
165 -- 2 main measures are actually implemented in order to compute the
166 -- proximity of two terms: conditional and distributional
168 -- Conditional measure is an absolute measure which reflects
169 -- interactions of 2 terms in the corpus.
170 measureConditional :: Matrix Int -> Matrix Double
171 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
172 measureConditional m = run $ matProba (dim m)
177 -- *** Conditional distance (advanced)
179 -- | Conditional distance (advanced version)
181 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
182 -- "confidence" , is the maximum probability between @i@ and @j@ to see
183 -- @i@ in the same context of @j@ knowing @j@.
185 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
186 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
188 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
189 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
190 conditional' m = ( run $ ie $ map fromIntegral $ use m
191 , run $ sg $ map fromIntegral $ use m
194 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
195 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
196 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
197 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
205 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
206 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
207 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
208 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
210 -----------------------------------------------------------------------
211 -- ** Distributional Distance
213 -- | Distributional Distance Measure
215 -- Distributional measure is a relative measure which depends on the
216 -- selected list, it represents structural equivalence of mutual information.
218 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
219 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
220 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
222 -- Mutual information
223 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
225 -- Number of cooccurrences of @i@ and @j@ in the same context of text
228 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
229 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
231 -- Total cooccurrences of term @i@ given a map list of size @m@
232 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
234 -- Total cooccurrences of terms given a map list of size @m@
235 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
237 distributional :: Matrix Int -> Matrix Double
238 distributional m = run {- -- $ matMiniMax
245 {- from Int to Double -}
247 {- push matrix in Accelerate type -}
249 -- filter m = zipWith (\a b -> max a b) m (transpose m)
252 ri :: Acc (Matrix Double) -> Acc (Matrix Double)
253 ri mat = mat1 -- zipWith (/) mat1 mat2
255 mat1 = matSumCol n $ zipWith min' (myMin mat) (myMin $ transpose mat)
257 myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
258 myMin = replicate (constant (Z :. n :. All)) . minimum
264 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
265 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
266 $ zipWith (/) (crossProduct n m') (total m')
270 total :: Acc (Matrix Double) -> Acc (Matrix Double)
271 total = replicate (constant (Z :. n :. n)) . sum . sum
276 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
277 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
279 let zeros = fill (index2 n n) 0
280 ones = fill (index1 n) 1
282 permute const zeros (\(unindex1 -> i) -> index2 i i) ones
285 eyeMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
287 let ones = fill (index2 n n) 1
288 zeros = fill (index1 n) 0
291 permute const ones (\(unindex1 -> i) -> index2 i i) zeros
294 selfMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
296 let zeros = fill (index2 n n) 0
297 ones = fill (index2 n n) 1
300 permute const ones ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
301 -> -- ifThenElse (i /= j)
306 selfMatrix' :: (Elt a, P.Num (Exp a)) => Array DIM2 a -> Matrix a
307 selfMatrix' m' = run $ selfMatrix n m
312 -------------------------------------------------
313 diagNull :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
314 diagNull n m = zipWith (*) m eye
319 -------------------------------------------------
320 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
321 crossProduct n m = trace (P.show (run m',run m'')) $ zipWith (*) m' m''
324 m'' = cross n (transpose m)
326 crossT :: Matrix Double -> Matrix Double
327 crossT = run . transpose . use
329 crossProduct' :: Matrix Double -> Matrix Double
330 crossProduct' m = run $ crossProduct n m'
335 runWith :: (Arrays c, Elt a1)
336 => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
340 runWith f m = run . f (dim m) (use m)
343 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
344 cross n mat = diagNull n (matSumCol n $ diagNull n mat)
346 cross' :: Matrix Double -> Matrix Double
347 cross' mat = run $ cross n mat'
353 -----------------------------------------------------------------------
354 -----------------------------------------------------------------------
355 -- * Specificity and Genericity
357 {- | Metric Specificity and genericity: select terms
359 - let N termes and occurrences of i \[N{i}\]
361 - Cooccurrences of i and j \[N{ij}\]
362 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
364 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
365 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
367 - \[Inclusion (i) = Gen(i) + Spec(i)\)
368 - \[GenericityScore = Gen(i)- Spec(i)\]
370 - References: Science mapping with asymmetrical paradigmatic proximity
371 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
372 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
373 arXiv:0803.2315 [cs.OH]
375 type InclusionExclusion = Double
376 type SpecificityGenericity = Double
378 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
379 type SymetricMatrix = Matrix
380 type NonSymetricMatrix = Matrix
383 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
384 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
386 run' fun mat = run $ fun $ map fromIntegral $ use mat
388 -- | Inclusion (i) = Gen(i)+Spec(i)
389 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
390 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
392 -- | Genericity score = Gen(i)- Spec(i)
393 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
394 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
396 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
397 pV :: Acc (Matrix Double) -> Acc (Vector Double)
398 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
400 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
401 pH :: Acc (Matrix Double) -> Acc (Vector Double)
402 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
405 cardN = constant (P.fromIntegral (dim m) :: Double)
408 -- | P(i|j) = Nij /N(jj) Probability to get i given j
409 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
410 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
411 p_ij m = zipWith (/) m (n_jj m)
413 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
414 n_jj myMat' = backpermute (shape m)
415 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
420 -- | P(j|i) = Nij /N(ii) Probability to get i given j
422 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
423 p_ji = transpose . p_ij
426 -- | Step to ckeck the result in visual/qualitative tests
427 incExcSpeGen_proba :: Matrix Int -> Matrix Double
428 incExcSpeGen_proba m = run' pro m
430 run' fun mat = run $ fun $ map fromIntegral $ use mat
435 -- | Hypothesis to test maybe later (or not)
436 -- TODO ask accelerate for instances to ease such writtings:
437 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
438 p_ m = zipWith (/) m (n_ m)
440 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
441 n_ m = backpermute (shape m)
442 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
443 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
448 -- * For Tests (to be removed)
449 -- | Test perfermance with this matrix
450 -- TODO : add this in a benchmark folder
451 distriTest :: Matrix Double
452 distriTest = distributional $ matrix 100 [1..]
453 -----------------------------------------------------------------------