2 Module : Gargantext.Graph.Distances.Matrix
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
13 Implementation use Accelerate library which enables GPU and CPU computation:
15 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
16 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
17 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
20 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
21 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
25 {-# LANGUAGE TypeFamilies #-}
26 {-# LANGUAGE TypeOperators #-}
27 {-# LANGUAGE ScopedTypeVariables #-}
28 {-# LANGUAGE ViewPatterns #-}
30 module Gargantext.Viz.Graph.Distances.Matrice
33 import Debug.Trace (trace)
34 import Data.Array.Accelerate
35 import Data.Array.Accelerate.Interpreter (run)
36 import qualified Gargantext.Prelude as P
39 -----------------------------------------------------------------------
43 -- Vector (Z :. 3) [0,1,2]
44 vector :: Elt c => Int -> [c] -> (Array (Z :. Int) c)
45 vector n l = fromList (Z :. n) l
49 -- >>> matrix 3 ([1..] :: [Double])
50 -- Matrix (Z :. 3 :. 3)
54 matrix :: Elt c => Int -> [c] -> Matrix c
55 matrix n l = fromList (Z :. n :. n) l
57 -- | Two ways to get the rank (as documentation)
59 -- >>> rank (matrix 3 ([1..] :: [Int]))
61 rank :: (Matrix a) -> Int
62 rank m = arrayRank $ arrayShape m
64 -----------------------------------------------------------------------
65 -- | Dimension of a square Matrix
66 -- How to force use with SquareMatrix ?
69 -- | Get Dimension of a square Matrix
71 -- >>> dim (matrix 3 ([1..] :: [Int]))
73 dim :: Matrix a -> Dim
76 Z :. _ :. n = arrayShape m
77 -- indexTail (arrayShape m)
79 -----------------------------------------------------------------------
81 runExp :: Elt e => Exp e -> e
82 runExp e = indexArray (run (unit e)) Z
83 -----------------------------------------------------------------------
85 -- | Sum of a Matrix by Column
87 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
88 -- Matrix (Z :. 3 :. 3)
89 -- [ 12.0, 15.0, 18.0,
92 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
93 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
95 matSumCol' :: Matrix Double -> Matrix Double
96 matSumCol' m = run $ matSumCol n m'
102 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
103 -- if you need get the probability on the lines, just transpose it
105 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
106 -- Matrix (Z :. 3 :. 3)
107 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
108 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
109 -- 0.5833333333333334, 0.5333333333333333, 0.5]
110 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
111 matProba r mat = zipWith (/) mat (matSumCol r mat)
113 -- | Diagonal of the matrix
115 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
116 -- Vector (Z :. 3) [1,5,9]
117 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
118 diag m = backpermute (indexTail (shape m))
119 (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
122 -- | Divide by the Diagonal of the matrix
124 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
125 -- Matrix (Z :. 3 :. 3)
126 -- [ 1.0, 0.4, 0.3333333333333333,
127 -- 4.0, 1.0, 0.6666666666666666,
129 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
130 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
132 -----------------------------------------------------------------------
133 -- | Filters the matrix with the minimum of maximums
135 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
136 -- Matrix (Z :. 3 :. 3)
140 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
141 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
143 miniMax' = (the $ minimum $ maximum m)
147 -- | Filters the matrix with a constant
149 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
150 -- Matrix (Z :. 3 :. 3)
154 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
155 filter' t m = filterWith t 0 m
157 filterWith :: Double -> Double -> Acc (Matrix Double) -> Acc (Matrix Double)
158 filterWith t v m = map (\x -> ifThenElse (x > (constant t)) x (constant v)) (transpose m)
162 -----------------------------------------------------------------------
163 -- * Measures of proximity
164 -----------------------------------------------------------------------
165 -- ** Conditional distance
167 -- *** Conditional distance (basic)
169 -- | Conditional distance (basic version)
171 -- 2 main measures are actually implemented in order to compute the
172 -- proximity of two terms: conditional and distributional
174 -- Conditional measure is an absolute measure which reflects
175 -- interactions of 2 terms in the corpus.
176 measureConditional :: Matrix Int -> Matrix Double
177 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
178 measureConditional m = run $ matProba (dim m)
183 -- *** Conditional distance (advanced)
185 -- | Conditional distance (advanced version)
187 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
188 -- "confidence" , is the maximum probability between @i@ and @j@ to see
189 -- @i@ in the same context of @j@ knowing @j@.
191 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
192 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
194 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
195 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
196 conditional' m = ( run $ ie $ map fromIntegral $ use m
197 , run $ sg $ map fromIntegral $ use m
200 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
201 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
202 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
203 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
211 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
212 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
213 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
214 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
216 -----------------------------------------------------------------------
217 -- ** Distributional Distance
219 -- | Distributional Distance Measure
221 -- Distributional measure is a relative measure which depends on the
222 -- selected list, it represents structural equivalence of mutual information.
224 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
225 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
226 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
228 -- Mutual information
229 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
231 -- Number of cooccurrences of @i@ and @j@ in the same context of text
234 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
235 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
237 -- Total cooccurrences of term @i@ given a map list of size @m@
238 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
240 -- Total cooccurrences of terms given a map list of size @m@
241 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
243 distributional :: Matrix Int -> Matrix Double
244 distributional m = run -- $ matMiniMax
251 {- from Int to Double -}
253 {- push matrix in Accelerate type -}
256 ri :: Acc (Matrix Double) -> Acc (Matrix Double)
257 ri mat = mat1 -- zipWith (/) mat1 mat2
259 mat1 = matSumCol n $ zipWith min (myMin mat) (myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
262 myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
263 myMin = replicate (constant (Z :. n :. All)) . minimum
268 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
269 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
270 $ zipWith (/) (crossProduct n m') (total m')
274 total :: Acc (Matrix Double) -> Acc (Matrix Double)
275 total = replicate (constant (Z :. n :. n)) . sum . sum
280 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
281 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
283 let zeros = fill (index2 n n) 0
284 ones = fill (index1 n) 1
286 permute const zeros (\(unindex1 -> i) -> index2 i i) ones
289 eyeMatrix :: Num a => Dim -> Acc (Matrix a)
291 let ones = fill (index2 n n) 1
292 zeros = fill (index1 n) 0
295 permute const ones (\(unindex1 -> i) -> index2 i i) zeros
298 data Direction = MatCol (Exp Int) | MatRow (Exp Int) | Diag
300 nullOf :: Num a => Dim -> Direction -> Acc (Matrix a)
302 let ones = fill (index2 n n) 1
303 zeros = fill (index2 n n) 0
306 permute const ones -- (\(unindex2 -> i) -> let Exp (x,y) = i in index2 x y)
308 ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
310 MatCol m -> (Z :. i :. m)
311 MatRow m -> (Z :. m :. i)
312 Diag -> (Z :. i :. i)
317 nullOfWithDiag :: Num a => Dim -> Direction -> Acc (Matrix a)
318 nullOfWithDiag n dir = zipWith (*) (nullOf n dir) (nullOf n Diag)
322 {- | WIP fun with indexes
323 selfMatrix :: Num a => Dim -> Acc (Matrix a)
325 let zeros = fill (index2 n n) 0
326 ones = fill (index2 n n) 1
329 permute const ones ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
330 -> -- ifThenElse (i /= j)
335 selfMatrix' :: (Elt a, P.Num (Exp a)) => Array DIM2 a -> Matrix a
336 selfMatrix' m' = run $ selfMatrix n
341 -------------------------------------------------
342 diagNull :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
343 diagNull n m = zipWith (*) m eye
347 -------------------------------------------------
348 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
349 crossProduct n m = {-trace (P.show (run m',run m'')) $-} zipWith (*) m' m''
352 m'' = transpose $ cross n m
355 crossT :: Matrix Double -> Matrix Double
356 crossT = run . transpose . use
358 crossProduct' :: Matrix Double -> Matrix Double
359 crossProduct' m = run $ crossProduct n m'
364 runWith :: (Arrays c, Elt a1)
365 => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
369 runWith f m = run . f (dim m) (use m)
372 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
373 cross n mat = diagNull n (matSumCol n $ diagNull n mat)
375 cross' :: Matrix Double -> Matrix Double
376 cross' mat = run $ cross n mat'
382 -----------------------------------------------------------------------
383 -----------------------------------------------------------------------
384 -- * Specificity and Genericity
386 {- | Metric Specificity and genericity: select terms
388 - let N termes and occurrences of i \[N{i}\]
390 - Cooccurrences of i and j \[N{ij}\]
391 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
393 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
394 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
396 - \[Inclusion (i) = Gen(i) + Spec(i)\)
397 - \[GenericityScore = Gen(i)- Spec(i)\]
399 - References: Science mapping with asymmetrical paradigmatic proximity
400 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
401 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
402 arXiv:0803.2315 [cs.OH]
404 type InclusionExclusion = Double
405 type SpecificityGenericity = Double
407 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
408 type SymetricMatrix = Matrix
409 type NonSymetricMatrix = Matrix
412 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
413 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
415 run' fun mat = run $ fun $ map fromIntegral $ use mat
417 -- | Inclusion (i) = Gen(i)+Spec(i)
418 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
419 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
421 -- | Genericity score = Gen(i)- Spec(i)
422 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
423 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
425 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
426 pV :: Acc (Matrix Double) -> Acc (Vector Double)
427 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
429 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
430 pH :: Acc (Matrix Double) -> Acc (Vector Double)
431 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
434 cardN = constant (P.fromIntegral (dim m) :: Double)
437 -- | P(i|j) = Nij /N(jj) Probability to get i given j
438 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
439 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
440 p_ij m = zipWith (/) m (n_jj m)
442 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
443 n_jj myMat' = backpermute (shape m)
444 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
449 -- | P(j|i) = Nij /N(ii) Probability to get i given j
451 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
452 p_ji = transpose . p_ij
455 -- | Step to ckeck the result in visual/qualitative tests
456 incExcSpeGen_proba :: Matrix Int -> Matrix Double
457 incExcSpeGen_proba m = run' pro m
459 run' fun mat = run $ fun $ map fromIntegral $ use mat
464 -- | Hypothesis to test maybe later (or not)
465 -- TODO ask accelerate for instances to ease such writtings:
466 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
467 p_ m = zipWith (/) m (n_ m)
469 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
470 n_ m = backpermute (shape m)
471 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
472 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
477 -- * For Tests (to be removed)
478 -- | Test perfermance with this matrix
479 -- TODO : add this in a benchmark folder
480 distriTest :: Int -> Matrix Double
481 distriTest n = distributional (matrix n theMatrix)
483 theMatrix | (P.==) n 2 = [ 1, 1
487 | (P.==) n 3 = [ 1, 1, 2
491 | (P.==) n 4 = [ 1, 1, 2, 3
496 | P.otherwise = P.undefined
498 theResult | (P.==) n 2 = let r = 1.6094379124341003 in [ 0, r, r, 0]
499 | P.otherwise = [ 1, 1 ]
504 -----------------------------------------------------------------------