2 Module : Gargantext.Graph.Distances.Matrix
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
13 Implementation use Accelerate library which enables GPU and CPU computation:
15 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
16 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
17 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
20 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
21 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
25 {-# LANGUAGE TypeFamilies #-}
26 {-# LANGUAGE TypeOperators #-}
27 {-# LANGUAGE ScopedTypeVariables #-}
28 {-# LANGUAGE ViewPatterns #-}
30 module Gargantext.Viz.Graph.Distances.Matrice
33 import Debug.Trace (trace)
34 import Data.Array.Accelerate
35 import Data.Array.Accelerate.Interpreter (run)
36 import qualified Gargantext.Prelude as P
39 -----------------------------------------------------------------------
43 -- Vector (Z :. 3) [0,1,2]
44 vector :: Elt c => Int -> [c] -> (Array (Z :. Int) c)
45 vector n l = fromList (Z :. n) l
49 -- >>> matrix 3 ([1..] :: [Double])
50 -- Matrix (Z :. 3 :. 3)
54 matrix :: Elt c => Int -> [c] -> Matrix c
55 matrix n l = fromList (Z :. n :. n) l
57 -- | Two ways to get the rank (as documentation)
59 -- >>> rank (matrix 3 ([1..] :: [Int]))
61 rank :: (Matrix a) -> Int
62 rank m = arrayRank $ arrayShape m
64 -----------------------------------------------------------------------
65 -- | Dimension of a square Matrix
66 -- How to force use with SquareMatrix ?
69 -- | Get Dimension of a square Matrix
71 -- >>> dim (matrix 3 ([1..] :: [Int]))
73 dim :: Matrix a -> Dim
76 Z :. _ :. n = arrayShape m
77 -- indexTail (arrayShape m)
79 -----------------------------------------------------------------------
81 runExp :: Elt e => Exp e -> e
82 runExp e = indexArray (run (unit e)) Z
83 -----------------------------------------------------------------------
85 -- | Sum of a Matrix by Column
87 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
88 -- Matrix (Z :. 3 :. 3)
89 -- [ 12.0, 15.0, 18.0,
92 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
93 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
95 matSumCol' :: Matrix Double -> Matrix Double
96 matSumCol' m = run $ matSumCol n m'
102 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
103 -- if you need get the probability on the lines, just transpose it
105 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
106 -- Matrix (Z :. 3 :. 3)
107 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
108 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
109 -- 0.5833333333333334, 0.5333333333333333, 0.5]
110 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
111 matProba r mat = zipWith (/) mat (matSumCol r mat)
113 -- | Diagonal of the matrix
115 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
116 -- Vector (Z :. 3) [1,5,9]
117 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
118 diag m = backpermute (indexTail (shape m))
119 (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
122 -- | Divide by the Diagonal of the matrix
124 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
125 -- Matrix (Z :. 3 :. 3)
126 -- [ 1.0, 0.4, 0.3333333333333333,
127 -- 4.0, 1.0, 0.6666666666666666,
129 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
130 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
132 -----------------------------------------------------------------------
133 -- | Filters the matrix with the minimum of maximums
135 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
136 -- Matrix (Z :. 3 :. 3)
140 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
141 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
143 miniMax' = (the $ minimum $ maximum m)
145 -- | Filters the matrix with a constant
147 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
148 -- Matrix (Z :. 3 :. 3)
152 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
153 filter' t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
155 -----------------------------------------------------------------------
156 -- * Measures of proximity
157 -----------------------------------------------------------------------
158 -- ** Conditional distance
160 -- *** Conditional distance (basic)
162 -- | Conditional distance (basic version)
164 -- 2 main measures are actually implemented in order to compute the
165 -- proximity of two terms: conditional and distributional
167 -- Conditional measure is an absolute measure which reflects
168 -- interactions of 2 terms in the corpus.
169 measureConditional :: Matrix Int -> Matrix Double
170 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
171 measureConditional m = run $ matProba (dim m)
176 -- *** Conditional distance (advanced)
178 -- | Conditional distance (advanced version)
180 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
181 -- "confidence" , is the maximum probability between @i@ and @j@ to see
182 -- @i@ in the same context of @j@ knowing @j@.
184 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
185 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
187 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
188 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
189 conditional' m = ( run $ ie $ map fromIntegral $ use m
190 , run $ sg $ map fromIntegral $ use m
193 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
194 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
195 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
196 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
204 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
205 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
206 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
207 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
209 -----------------------------------------------------------------------
210 -- ** Distributional Distance
212 -- | Distributional Distance Measure
214 -- Distributional measure is a relative measure which depends on the
215 -- selected list, it represents structural equivalence of mutual information.
217 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
218 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
219 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
221 -- Mutual information
222 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
224 -- Number of cooccurrences of @i@ and @j@ in the same context of text
227 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
228 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
230 -- Total cooccurrences of term @i@ given a map list of size @m@
231 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
233 -- Total cooccurrences of terms given a map list of size @m@
234 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
236 distributional :: Matrix Int -> Matrix Double
237 distributional m = run {- -- $ matMiniMax
244 {- from Int to Double -}
246 {- push matrix in Accelerate type -}
248 -- filter m = zipWith (\a b -> max a b) m (transpose m)
251 ri :: Acc (Matrix Double) -> Acc (Matrix Double)
252 ri mat = mat1 -- zipWith (/) mat1 mat2
254 mat1 = matSumCol n $ zipWith min' (myMin mat) (myMin $ transpose mat)
256 myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
257 myMin = replicate (constant (Z :. n :. All)) . minimum
263 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
264 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
265 $ zipWith (/) (crossProduct n m') (total m')
269 total :: Acc (Matrix Double) -> Acc (Matrix Double)
270 total = replicate (constant (Z :. n :. n)) . sum . sum
275 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
276 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
278 let zeros = fill (index2 n n) 0
279 ones = fill (index1 n) 1
281 permute const zeros (\(unindex1 -> i) -> index2 i i) ones
284 eyeMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
286 let ones = fill (index2 n n) 1
287 zeros = fill (index1 n) 0
290 permute const ones (\(unindex1 -> i) -> index2 i i) zeros
293 selfMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
295 let zeros = fill (index2 n n) 0
296 ones = fill (index2 n n) 1
299 permute const ones ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
300 -> -- ifThenElse (i /= j)
305 selfMatrix' :: (Elt a, P.Num (Exp a)) => Array DIM2 a -> Matrix a
306 selfMatrix' m' = run $ selfMatrix n m
311 -------------------------------------------------
312 diagNull :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
313 diagNull n m = zipWith (*) m eye
318 -------------------------------------------------
319 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
320 crossProduct n m = trace (P.show (run m',run m'')) $ zipWith (*) m' m''
323 m'' = transpose $ cross n m
325 crossT :: Matrix Double -> Matrix Double
326 crossT = run . transpose . use
328 crossProduct' :: Matrix Double -> Matrix Double
329 crossProduct' m = run $ crossProduct n m'
334 runWith :: (Arrays c, Elt a1)
335 => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
339 runWith f m = run . f (dim m) (use m)
342 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
343 cross n mat = diagNull n (matSumCol n $ diagNull n mat)
345 cross' :: Matrix Double -> Matrix Double
346 cross' mat = run $ cross n mat'
352 -----------------------------------------------------------------------
353 -----------------------------------------------------------------------
354 -- * Specificity and Genericity
356 {- | Metric Specificity and genericity: select terms
358 - let N termes and occurrences of i \[N{i}\]
360 - Cooccurrences of i and j \[N{ij}\]
361 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
363 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
364 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
366 - \[Inclusion (i) = Gen(i) + Spec(i)\)
367 - \[GenericityScore = Gen(i)- Spec(i)\]
369 - References: Science mapping with asymmetrical paradigmatic proximity
370 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
371 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
372 arXiv:0803.2315 [cs.OH]
374 type InclusionExclusion = Double
375 type SpecificityGenericity = Double
377 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
378 type SymetricMatrix = Matrix
379 type NonSymetricMatrix = Matrix
382 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
383 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
385 run' fun mat = run $ fun $ map fromIntegral $ use mat
387 -- | Inclusion (i) = Gen(i)+Spec(i)
388 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
389 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
391 -- | Genericity score = Gen(i)- Spec(i)
392 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
393 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
395 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
396 pV :: Acc (Matrix Double) -> Acc (Vector Double)
397 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
399 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
400 pH :: Acc (Matrix Double) -> Acc (Vector Double)
401 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
404 cardN = constant (P.fromIntegral (dim m) :: Double)
407 -- | P(i|j) = Nij /N(jj) Probability to get i given j
408 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
409 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
410 p_ij m = zipWith (/) m (n_jj m)
412 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
413 n_jj myMat' = backpermute (shape m)
414 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
419 -- | P(j|i) = Nij /N(ii) Probability to get i given j
421 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
422 p_ji = transpose . p_ij
425 -- | Step to ckeck the result in visual/qualitative tests
426 incExcSpeGen_proba :: Matrix Int -> Matrix Double
427 incExcSpeGen_proba m = run' pro m
429 run' fun mat = run $ fun $ map fromIntegral $ use mat
434 -- | Hypothesis to test maybe later (or not)
435 -- TODO ask accelerate for instances to ease such writtings:
436 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
437 p_ m = zipWith (/) m (n_ m)
439 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
440 n_ m = backpermute (shape m)
441 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
442 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
447 -- * For Tests (to be removed)
448 -- | Test perfermance with this matrix
449 -- TODO : add this in a benchmark folder
450 distriTest :: Int -> Matrix Double
451 distriTest n = distributional (matrix n theMatrix)
453 theMatrix | (P.==) n 3 = [ 1, 1, 2
457 | P.otherwise = [ 1, 1
462 theResult | (P.==) n 2 = let r = 1.6094379124341003 in [ 0, r, r, 0]
463 | P.otherwise = [ 1, 1 ]
468 -----------------------------------------------------------------------