2 Module : Gargantext.Graph.Distances.Matrix
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
14 Implementation use Accelerate library which enables GPU and CPU computation:
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
21 [Optimising Purely Functional GPU Programs][MCKL13].
22 In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
24 * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
25 [Embedding Foreign Code][CMCK14].
26 In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
28 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
29 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
30 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
34 {-# LANGUAGE TypeFamilies #-}
35 {-# LANGUAGE TypeOperators #-}
36 {-# LANGUAGE ScopedTypeVariables #-}
37 {-# LANGUAGE ViewPatterns #-}
39 module Gargantext.Viz.Graph.Distances.Matrice
42 import Debug.Trace (trace)
43 import Data.Array.Accelerate
44 import Data.Array.Accelerate.Interpreter (run)
45 import qualified Gargantext.Prelude as P
48 -----------------------------------------------------------------------
52 -- Vector (Z :. 3) [0,1,2]
53 vector :: Int -> (Array (Z :. Int) Int)
54 vector n = fromList (Z :. n) [0..n]
58 -- >>> matrix 3 ([1..] :: [Double])
59 -- Matrix (Z :. 3 :. 3)
63 matrix :: Elt c => Int -> [c] -> Matrix c
64 matrix n l = fromList (Z :. n :. n) l
66 -- | Two ways to get the rank (as documentation)
68 -- >>> rank (matrix 3 ([1..] :: [Int]))
70 rank :: (Matrix a) -> Int
71 rank m = arrayRank $ arrayShape m
73 -----------------------------------------------------------------------
74 -- | Dimension of a square Matrix
75 -- How to force use with SquareMatrix ?
78 -- | Get Dimension of a square Matrix
80 -- >>> dim (matrix 3 ([1..] :: [Int]))
82 dim :: Matrix a -> Dim
85 Z :. _ :. n = arrayShape m
86 -- indexTail (arrayShape m)
88 -----------------------------------------------------------------------
90 runExp :: Elt e => Exp e -> e
91 runExp e = indexArray (run (unit e)) Z
92 -----------------------------------------------------------------------
94 -- | Sum of a Matrix by Column
96 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
97 -- Matrix (Z :. 3 :. 3)
98 -- [ 12.0, 15.0, 18.0,
101 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
102 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
104 matSumCol' :: Matrix Double -> Matrix Double
105 matSumCol' m = run $ matSumCol n m'
111 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
112 -- if you need get the probability on the lines, just transpose it
114 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
115 -- Matrix (Z :. 3 :. 3)
116 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
117 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
118 -- 0.5833333333333334, 0.5333333333333333, 0.5]
119 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
120 matProba r mat = zipWith (/) mat (matSumCol r mat)
122 -- | Diagonal of the matrix
124 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
125 -- Vector (Z :. 3) [1,5,9]
126 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
127 diag m = backpermute (indexTail (shape m))
128 (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
131 -- | Divide by the Diagonal of the matrix
133 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
134 -- Matrix (Z :. 3 :. 3)
135 -- [ 1.0, 0.4, 0.3333333333333333,
136 -- 4.0, 1.0, 0.6666666666666666,
138 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
139 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
141 -----------------------------------------------------------------------
142 -- | Filters the matrix with the minimum of maximums
144 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
145 -- Matrix (Z :. 3 :. 3)
149 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
150 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
152 miniMax' = (the $ minimum $ maximum m)
154 -- | Filters the matrix with a constant
156 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
157 -- Matrix (Z :. 3 :. 3)
161 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
162 filter' t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
164 -----------------------------------------------------------------------
165 -- * Measures of proximity
166 -----------------------------------------------------------------------
167 -- ** Conditional distance
169 -- *** Conditional distance (basic)
171 -- | Conditional distance (basic version)
173 -- 2 main measures are actually implemented in order to compute the
174 -- proximity of two terms: conditional and distributional
176 -- Conditional measure is an absolute measure which reflects
177 -- interactions of 2 terms in the corpus.
178 measureConditional :: Matrix Int -> Matrix Double
179 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
180 measureConditional m = run $ matProba (dim m)
185 -- *** Conditional distance (advanced)
187 -- | Conditional distance (advanced version)
189 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
190 -- "confidence" , is the maximum probability between @i@ and @j@ to see
191 -- @i@ in the same context of @j@ knowing @j@.
193 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
194 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
196 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
197 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
198 conditional' m = ( run $ ie $ map fromIntegral $ use m
199 , run $ sg $ map fromIntegral $ use m
202 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
203 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
204 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
205 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
213 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
214 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
215 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
216 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
218 -----------------------------------------------------------------------
219 -- ** Distributional Distance
221 -- | Distributional Distance Measure
223 -- Distributional measure is a relative measure which depends on the
224 -- selected list, it represents structural equivalence of mutual information.
226 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
227 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
228 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
230 -- Mutual information
231 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
233 -- Number of cooccurrences of @i@ and @j@ in the same context of text
236 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
237 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
239 -- Total cooccurrences of term @i@ given a map list of size @m@
240 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
242 -- Total cooccurrences of terms given a map list of size @m@
243 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
245 distributional :: Matrix Int -> Matrix Double
246 distributional m = run -- $ matMiniMax
252 $ map fromIntegral -- ^ from Int to Double
253 $ use m -- ^ push matrix in Accelerate type
255 -- filter m = zipWith (\a b -> max a b) m (transpose m)
257 ri :: Acc (Matrix Double) -> Acc (Matrix Double)
258 ri mat = mat1 -- zipWith (/) mat1 mat2
260 mat1 = matSumCol n $ zipWith min' (myMin mat) (myMin $ transpose mat)
263 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
264 s_mi m' = zipWith (\a b -> log (a/b)) m'
265 $ zipWith (/) (crossProduct n m') (total m')
267 total :: Acc (Matrix Double) -> Acc (Matrix Double)
268 total = replicate (constant (Z :. n :. n)) . sum . sum
271 | runExp (x > y && x /= 0) = x
274 myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
275 myMin = replicate (constant (Z :. n :. All)) . minimum
280 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
281 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
283 let zeros = fill (index2 n n) 0
284 ones = fill (index1 n) 1
286 permute const zeros (\(unindex1 -> i) -> index2 i i) ones
289 eyeMatrix :: Num a => (Matrix a) -> Acc (Matrix a)
291 let zeros = fill (index2 n n) 1
292 ones = fill (index1 n) 0
295 permute const zeros (\(unindex1 -> i) -> index2 i i) ones
297 diag2null :: Num a => (Matrix a) -> Acc (Matrix a)
298 diag2null m' = zipWith (*) m eye
305 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
306 crossProduct n m = trace (P.show (run m',run m'')) $ zipWith (*) m' m''
309 m'' = cross n (transpose m)
311 crossT :: Matrix Double -> Matrix Double
312 crossT = run . transpose . use
314 crossProduct' :: Matrix Double -> Matrix Double
315 crossProduct' m = run $ crossProduct n m'
320 runWith :: (Arrays c, Elt a1)
321 => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
325 runWith f m = run . f (dim m) (use m)
328 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
329 cross n mat = zipWith (-) (matSumCol n mat) (mat)
331 cross' :: Matrix Double -> Matrix Double
332 cross' mat = run $ cross n mat'
338 -----------------------------------------------------------------------
339 -----------------------------------------------------------------------
340 -- * Specificity and Genericity
342 {- | Metric Specificity and genericity: select terms
344 - let N termes and occurrences of i \[N{i}\]
346 - Cooccurrences of i and j \[N{ij}\]
347 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
349 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
350 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
352 - \[Inclusion (i) = Gen(i) + Spec(i)\)
353 - \[GenericityScore = Gen(i)- Spec(i)\]
355 - References: Science mapping with asymmetrical paradigmatic proximity
356 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
357 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
358 arXiv:0803.2315 [cs.OH]
360 type InclusionExclusion = Double
361 type SpecificityGenericity = Double
363 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
364 type SymetricMatrix = Matrix
365 type NonSymetricMatrix = Matrix
368 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
369 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
371 run' fun mat = run $ fun $ map fromIntegral $ use mat
373 -- | Inclusion (i) = Gen(i)+Spec(i)
374 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
375 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
377 -- | Genericity score = Gen(i)- Spec(i)
378 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
379 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
381 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
382 pV :: Acc (Matrix Double) -> Acc (Vector Double)
383 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
385 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
386 pH :: Acc (Matrix Double) -> Acc (Vector Double)
387 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
390 cardN = constant (P.fromIntegral (dim m) :: Double)
393 -- | P(i|j) = Nij /N(jj) Probability to get i given j
394 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
395 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
396 p_ij m = zipWith (/) m (n_jj m)
398 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
399 n_jj myMat' = backpermute (shape m)
400 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
405 -- | P(j|i) = Nij /N(ii) Probability to get i given j
407 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
408 p_ji = transpose . p_ij
411 -- | Step to ckeck the result in visual/qualitative tests
412 incExcSpeGen_proba :: Matrix Int -> Matrix Double
413 incExcSpeGen_proba m = run' pro m
415 run' fun mat = run $ fun $ map fromIntegral $ use mat
420 -- | Hypothesis to test maybe later (or not)
421 -- TODO ask accelerate for instances to ease such writtings:
422 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
423 p_ m = zipWith (/) m (n_ m)
425 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
426 n_ m = backpermute (shape m)
427 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
428 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
433 -- * For Tests (to be removed)
434 -- | Test perfermance with this matrix
435 -- TODO : add this in a benchmark folder
436 distriTest :: Matrix Double
437 distriTest = distributional $ matrix 100 [1..]
438 -----------------------------------------------------------------------