2 Module : Gargantext.Graph.Distances.Matrix
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
13 Implementation use Accelerate library which enables GPU and CPU computation:
15 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
16 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
17 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
20 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
21 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
25 {-# LANGUAGE TypeFamilies #-}
26 {-# LANGUAGE TypeOperators #-}
27 {-# LANGUAGE ScopedTypeVariables #-}
28 {-# LANGUAGE ViewPatterns #-}
30 module Gargantext.Core.Viz.Graph.Distances.Matrice
33 import qualified Data.Foldable as P (foldl1)
34 import Debug.Trace (trace)
35 import Data.Array.Accelerate
36 import Data.Array.Accelerate.Interpreter (run)
37 import qualified Gargantext.Prelude as P
40 -----------------------------------------------------------------------
44 -- Vector (Z :. 3) [0,1,2]
45 vector :: Elt c => Int -> [c] -> (Array (Z :. Int) c)
46 vector n l = fromList (Z :. n) l
50 -- >>> matrix 3 ([1..] :: [Double])
51 -- Matrix (Z :. 3 :. 3)
55 matrix :: Elt c => Int -> [c] -> Matrix c
56 matrix n l = fromList (Z :. n :. n) l
58 -- | Two ways to get the rank (as documentation)
60 -- >>> rank (matrix 3 ([1..] :: [Int]))
62 rank :: (Matrix a) -> Int
63 rank m = arrayRank $ arrayShape m
65 -----------------------------------------------------------------------
66 -- | Dimension of a square Matrix
67 -- How to force use with SquareMatrix ?
70 -- | Get Dimension of a square Matrix
72 -- >>> dim (matrix 3 ([1..] :: [Int]))
74 dim :: Matrix a -> Dim
77 Z :. _ :. n = arrayShape m
78 -- indexTail (arrayShape m)
80 -----------------------------------------------------------------------
82 runExp :: Elt e => Exp e -> e
83 runExp e = indexArray (run (unit e)) Z
84 -----------------------------------------------------------------------
86 -- | Sum of a Matrix by Column
88 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
89 -- Matrix (Z :. 3 :. 3)
90 -- [ 12.0, 15.0, 18.0,
93 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
94 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
96 matSumCol' :: Matrix Double -> Matrix Double
97 matSumCol' m = run $ matSumCol n m'
103 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
104 -- if you need get the probability on the lines, just transpose it
106 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
107 -- Matrix (Z :. 3 :. 3)
108 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
109 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
110 -- 0.5833333333333334, 0.5333333333333333, 0.5]
111 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
112 matProba r mat = zipWith (/) mat (matSumCol r mat)
114 -- | Diagonal of the matrix
116 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
117 -- Vector (Z :. 3) [1,5,9]
118 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
119 diag m = backpermute (indexTail (shape m))
120 (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
123 -- | Divide by the Diagonal of the matrix
125 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
126 -- Matrix (Z :. 3 :. 3)
127 -- [ 1.0, 0.4, 0.3333333333333333,
128 -- 4.0, 1.0, 0.6666666666666666,
130 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
131 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
133 -----------------------------------------------------------------------
134 -- | Filters the matrix with the minimum of maximums
136 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
137 -- Matrix (Z :. 3 :. 3)
141 matMiniMax :: (Elt a, Ord a, P.Num a) => Acc (Matrix a) -> Acc (Matrix a)
142 matMiniMax m = filterWith' miniMax' (constant 0) m
144 miniMax' = the $ minimum $ maximum m
147 -- | Filters the matrix with a constant
149 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
150 -- Matrix (Z :. 3 :. 3)
154 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
155 filter' t m = filterWith t 0 m
157 filterWith :: Double -> Double -> Acc (Matrix Double) -> Acc (Matrix Double)
158 filterWith t v m = map (\x -> ifThenElse (x > (constant t)) x (constant v)) (transpose m)
160 filterWith' :: (Elt a, Ord a) => Exp a -> Exp a -> Acc (Matrix a) -> Acc (Matrix a)
161 filterWith' t v m = map (\x -> ifThenElse (x > t) x v) m
166 -----------------------------------------------------------------------
167 -- * Metrics of proximity
168 -----------------------------------------------------------------------
169 -- ** Conditional distance
171 -- *** Conditional distance (basic)
173 -- | Conditional distance (basic version)
175 -- 2 main metrics are actually implemented in order to compute the
176 -- proximity of two terms: conditional and distributional
178 -- Conditional metric is an absolute metric which reflects
179 -- interactions of 2 terms in the corpus.
180 measureConditional :: Matrix Int -> Matrix Double
181 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
182 measureConditional m = run $ matProba (dim m)
187 -- *** Conditional distance (advanced)
189 -- | Conditional distance (advanced version)
191 -- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called
192 -- "confidence" , is the maximum probability between @i@ and @j@ to see
193 -- @i@ in the same context of @j@ knowing @j@.
195 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
196 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
198 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
199 conditional' :: Matrix Int -> (Matrix GenericityInclusion, Matrix SpecificityExclusion)
200 conditional' m = ( run $ ie $ map fromIntegral $ use m
201 , run $ sg $ map fromIntegral $ use m
204 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
205 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
206 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
207 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
215 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
216 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
217 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
218 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
220 -----------------------------------------------------------------------
221 -- ** Distributional Distance
223 -- | Distributional Distance metric
225 -- Distributional metric is a relative metric which depends on the
226 -- selected list, it represents structural equivalence of mutual information.
228 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
229 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
230 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
232 -- Mutual information
233 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
235 -- Number of cooccurrences of @i@ and @j@ in the same context of text
238 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
239 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
241 -- Total cooccurrences of term @i@ given a map list of size @m@
242 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
244 -- Total cooccurrences of terms given a map list of size @m@
245 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
247 distributional :: Matrix Int -> Matrix Double
248 distributional m = -- run {- $ matMiniMax -}
255 {- from Int to Double -}
257 {- push matrix in Accelerate type -}
260 _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
261 _ri mat = mat1 -- zipWith (/) mat1 mat2
263 mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
266 _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
267 _myMin = replicate (constant (Z :. n :. All)) . minimum
272 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
273 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
274 $ zipWith (/) (crossProduct n m') (total m')
278 total :: Acc (Matrix Double) -> Acc (Matrix Double)
279 total = replicate (constant (Z :. n :. n)) . sum . sum
284 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
285 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
287 let zeros = fill (index2 n n) 0
288 ones = fill (index1 n) 1
290 permute const zeros (\(unindex1 -> i) -> index2 i i) ones
293 eyeMatrix :: Num a => Dim -> Acc (Matrix a)
295 let ones = fill (index2 n n) 1
296 zeros = fill (index1 n) 0
299 permute const ones (\(unindex1 -> i) -> index2 i i) zeros
302 data Direction = MatCol (Exp Int) | MatRow (Exp Int) | Diag
304 nullOf :: Num a => Dim -> Direction -> Acc (Matrix a)
306 let ones = fill (index2 n n) 1
307 zeros = fill (index2 n n) 0
310 permute const ones ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
312 MatCol m -> (Z :. i :. m)
313 MatRow m -> (Z :. m :. i)
314 Diag -> (Z :. i :. i)
319 nullOfWithDiag :: Num a => Dim -> Direction -> Acc (Matrix a)
320 nullOfWithDiag n dir = zipWith (*) (nullOf n dir) (nullOf n Diag)
323 rIJ' :: Matrix Int -> Matrix Double
324 rIJ' m = run $ sumRowMin (dim m) m'
326 m' = (map fromIntegral $ use m)
328 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
329 => Dim -> Acc (Matrix a) -> Acc (Matrix a)
330 rIJ n m = matMiniMax $ divide a b
335 divide :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
336 => Acc (Matrix a) -> Acc (Matrix a) -> Acc (Matrix a)
337 divide = zipWith divide'
339 divide' a b = ifThenElse (b > (constant 0))
344 sumRowMin :: (Num a, Ord a) => Dim -> Acc (Matrix a) -> Acc (Matrix a)
345 sumRowMin n m = {-trace (P.show $ run m') $-} m'
347 m' = reshape (shape m) vs
349 $ P.map (\z -> sumRowMin1 n (constant z) m) [0..n-1]
351 sumRowMin1 :: (Num a, Ord a) => Dim -> Exp Int -> Acc (Matrix a) -> Acc (Vector a)
352 sumRowMin1 n x m = trace (P.show (run m,run $ transpose m)) $ m''
354 m'' = sum $ zipWith min (transpose m) m
355 _m' = zipWith (*) (zipWith (*) (nullOf n (MatCol x)) $ nullOfWithDiag n (MatRow x)) m
359 sumColMin :: (Num a, Ord a) => Dim -> Acc (Matrix a) -> Acc (Matrix a)
360 sumColMin n m = reshape (shape m) vs
363 $ P.map (\z -> sumColMin1 n (constant z) m) [0..n-1]
366 sumColMin1 :: (Num a) => Dim -> Exp Int -> Acc (Matrix a) -> Acc (Matrix a)
367 sumColMin1 n x m = zipWith (*) (nullOfWithDiag n (MatCol x)) m
371 {- | WIP fun with indexes
372 selfMatrix :: Num a => Dim -> Acc (Matrix a)
374 let zeros = fill (index2 n n) 0
375 ones = fill (index2 n n) 1
378 permute const ones ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
379 -> -- ifThenElse (i /= j)
384 selfMatrix' :: (Elt a, P.Num (Exp a)) => Array DIM2 a -> Matrix a
385 selfMatrix' m' = run $ selfMatrix n
390 -------------------------------------------------
391 diagNull :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
392 diagNull n m = zipWith (*) m eye
396 -------------------------------------------------
397 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
398 crossProduct n m = {-trace (P.show (run m',run m'')) $-} zipWith (*) m' m''
401 m'' = transpose $ cross n m
404 crossT :: Matrix Double -> Matrix Double
405 crossT = run . transpose . use
407 crossProduct' :: Matrix Double -> Matrix Double
408 crossProduct' m = run $ crossProduct n m'
413 runWith :: (Arrays c, Elt a1)
414 => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
418 runWith f m = run . f (dim m) (use m)
421 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
422 cross n mat = diagNull n (matSumCol n $ diagNull n mat)
424 cross' :: Matrix Double -> Matrix Double
425 cross' mat = run $ cross n mat'
431 -----------------------------------------------------------------------
432 -----------------------------------------------------------------------
433 -- * Specificity and Genericity
435 {- | Metric Specificity and genericity: select terms
437 - let N termes and occurrences of i \[N{i}\]
439 - Cooccurrences of i and j \[N{ij}\]
440 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
442 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
443 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
445 - \[Inclusion (i) = Gen(i) + Spec(i)\)
446 - \[GenericityScore = Gen(i)- Spec(i)\]
448 - References: Science mapping with asymmetrical paradigmatic proximity
449 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
450 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
451 arXiv:0803.2315 [cs.OH]
453 type GenericityInclusion = Double
454 type SpecificityExclusion = Double
456 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
457 type SymetricMatrix = Matrix
458 type NonSymetricMatrix = Matrix
461 incExcSpeGen :: Matrix Int -> (Vector GenericityInclusion, Vector SpecificityExclusion)
462 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
464 run' fun mat = run $ fun $ map fromIntegral $ use mat
466 -- | Inclusion (i) = Gen(i)+Spec(i)
467 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
468 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
470 -- | Genericity score = Gen(i)- Spec(i)
471 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
472 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
474 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
475 pV :: Acc (Matrix Double) -> Acc (Vector Double)
476 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
478 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
479 pH :: Acc (Matrix Double) -> Acc (Vector Double)
480 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
483 cardN = constant (P.fromIntegral (dim m) :: Double)
486 -- | P(i|j) = Nij /N(jj) Probability to get i given j
487 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
488 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
489 p_ij m = zipWith (/) m (n_jj m)
491 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
492 n_jj myMat' = backpermute (shape m)
493 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
498 -- | P(j|i) = Nij /N(ii) Probability to get i given j
500 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
501 p_ji = transpose . p_ij
504 -- | Step to ckeck the result in visual/qualitative tests
505 incExcSpeGen_proba :: Matrix Int -> Matrix Double
506 incExcSpeGen_proba m = run' pro m
508 run' fun mat = run $ fun $ map fromIntegral $ use mat
513 -- | Hypothesis to test maybe later (or not)
514 -- TODO ask accelerate for instances to ease such writtings:
515 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
516 p_ m = zipWith (/) m (n_ m)
518 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
519 n_ m = backpermute (shape m)
520 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
521 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
526 -- * For Tests (to be removed)
527 -- | Test perfermance with this matrix
528 -- TODO : add this in a benchmark folder
529 distriTest :: Int -> Matrix Double
530 distriTest n = distributional (theMatrix n)
532 theMatrix :: Int -> Matrix Int
533 theMatrix n = matrix n (dataMatrix n)
535 dataMatrix :: Int -> [Int]
536 dataMatrix x | (P.==) x 2 = [ 1, 1
540 | (P.==) x 3 = [ 1, 1, 2
544 | (P.==) x 4 = [ 1, 1, 2, 3
549 | P.otherwise = P.undefined
552 theResult :: Int -> Matrix Double
553 theResult n | (P.==) n 2 = let r = 1.6094379124341003 in [ 0, r, r, 0]
554 | P.otherwise = [ 1, 1 ]
559 => Int -> [e] -> Acc (Array ((Z :. Int) :. Int) e)
560 colMatrix n ns = replicate (constant (Z :. (n :: Int) :. All)) v
562 v = use $ vector (P.length ns) ns
564 -----------------------------------------------------------------------