]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Graph/Distances/Matrice.hs
Merge branch 'dev' into dev-doc-table-optimization
[gargantext.git] / src / Gargantext / Viz / Graph / Distances / Matrice.hs
1 {-|
2 Module : Gargantext.Graph.Distances.Matrix
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
12
13
14 Implementation use Accelerate library which enables GPU and CPU computation:
15
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
21 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
22 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
23
24 -}
25
26 {-# LANGUAGE TypeFamilies #-}
27 {-# LANGUAGE TypeOperators #-}
28 {-# LANGUAGE ScopedTypeVariables #-}
29 {-# LANGUAGE ViewPatterns #-}
30
31 module Gargantext.Viz.Graph.Distances.Matrice
32 where
33
34 import Debug.Trace (trace)
35 import Data.Array.Accelerate
36 import Data.Array.Accelerate.Interpreter (run)
37 import qualified Gargantext.Prelude as P
38
39
40 -----------------------------------------------------------------------
41 -- | Define a vector
42 --
43 -- >>> vector 3
44 -- Vector (Z :. 3) [0,1,2]
45 vector :: Elt c => Int -> [c] -> (Array (Z :. Int) c)
46 vector n l = fromList (Z :. n) l
47
48 -- | Define a matrix
49 --
50 -- >>> matrix 3 ([1..] :: [Double])
51 -- Matrix (Z :. 3 :. 3)
52 -- [ 1.0, 2.0, 3.0,
53 -- 4.0, 5.0, 6.0,
54 -- 7.0, 8.0, 9.0]
55 matrix :: Elt c => Int -> [c] -> Matrix c
56 matrix n l = fromList (Z :. n :. n) l
57
58 -- | Two ways to get the rank (as documentation)
59 --
60 -- >>> rank (matrix 3 ([1..] :: [Int]))
61 -- 2
62 rank :: (Matrix a) -> Int
63 rank m = arrayRank $ arrayShape m
64
65 -----------------------------------------------------------------------
66 -- | Dimension of a square Matrix
67 -- How to force use with SquareMatrix ?
68 type Dim = Int
69
70 -- | Get Dimension of a square Matrix
71 --
72 -- >>> dim (matrix 3 ([1..] :: [Int]))
73 -- 3
74 dim :: Matrix a -> Dim
75 dim m = n
76 where
77 Z :. _ :. n = arrayShape m
78 -- indexTail (arrayShape m)
79
80 -----------------------------------------------------------------------
81 -- TODO move to Utils
82 runExp :: Elt e => Exp e -> e
83 runExp e = indexArray (run (unit e)) Z
84 -----------------------------------------------------------------------
85
86 -- | Sum of a Matrix by Column
87 --
88 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
89 -- Matrix (Z :. 3 :. 3)
90 -- [ 12.0, 15.0, 18.0,
91 -- 12.0, 15.0, 18.0,
92 -- 12.0, 15.0, 18.0]
93 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
94 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
95
96 matSumCol' :: Matrix Double -> Matrix Double
97 matSumCol' m = run $ matSumCol n m'
98 where
99 n = dim m
100 m' = use m
101
102
103 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
104 -- if you need get the probability on the lines, just transpose it
105 --
106 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
107 -- Matrix (Z :. 3 :. 3)
108 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
109 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
110 -- 0.5833333333333334, 0.5333333333333333, 0.5]
111 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
112 matProba r mat = zipWith (/) mat (matSumCol r mat)
113
114 -- | Diagonal of the matrix
115 --
116 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
117 -- Vector (Z :. 3) [1,5,9]
118 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
119 diag m = backpermute (indexTail (shape m))
120 (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
121 m
122
123 -- | Divide by the Diagonal of the matrix
124 --
125 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
126 -- Matrix (Z :. 3 :. 3)
127 -- [ 1.0, 0.4, 0.3333333333333333,
128 -- 4.0, 1.0, 0.6666666666666666,
129 -- 7.0, 1.6, 1.0]
130 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
131 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
132
133 -----------------------------------------------------------------------
134 -- | Filters the matrix with the minimum of maximums
135 --
136 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
137 -- Matrix (Z :. 3 :. 3)
138 -- [ 0.0, 4.0, 7.0,
139 -- 0.0, 5.0, 8.0,
140 -- 0.0, 6.0, 9.0]
141 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
142 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
143 where
144 miniMax' = (the $ minimum $ maximum m)
145
146 -- | Filters the matrix with a constant
147 --
148 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
149 -- Matrix (Z :. 3 :. 3)
150 -- [ 0.0, 0.0, 7.0,
151 -- 0.0, 0.0, 8.0,
152 -- 0.0, 6.0, 9.0]
153 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
154 filter' t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
155
156 -----------------------------------------------------------------------
157 -- * Measures of proximity
158 -----------------------------------------------------------------------
159 -- ** Conditional distance
160
161 -- *** Conditional distance (basic)
162
163 -- | Conditional distance (basic version)
164 --
165 -- 2 main measures are actually implemented in order to compute the
166 -- proximity of two terms: conditional and distributional
167 --
168 -- Conditional measure is an absolute measure which reflects
169 -- interactions of 2 terms in the corpus.
170 measureConditional :: Matrix Int -> Matrix Double
171 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
172 measureConditional m = run $ matProba (dim m)
173 $ map fromIntegral
174 $ use m
175
176
177 -- *** Conditional distance (advanced)
178
179 -- | Conditional distance (advanced version)
180 --
181 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
182 -- "confidence" , is the maximum probability between @i@ and @j@ to see
183 -- @i@ in the same context of @j@ knowing @j@.
184 --
185 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
186 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
187 --
188 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
189 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
190 conditional' m = ( run $ ie $ map fromIntegral $ use m
191 , run $ sg $ map fromIntegral $ use m
192 )
193 where
194 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
195 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
196 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
197 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
198
199 n :: Exp Double
200 n = P.fromIntegral r
201
202 r :: Dim
203 r = dim m
204
205 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
206 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
207 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
208 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
209
210 -----------------------------------------------------------------------
211 -- ** Distributional Distance
212
213 -- | Distributional Distance Measure
214 --
215 -- Distributional measure is a relative measure which depends on the
216 -- selected list, it represents structural equivalence of mutual information.
217 --
218 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
219 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
220 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
221 --
222 -- Mutual information
223 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
224 --
225 -- Number of cooccurrences of @i@ and @j@ in the same context of text
226 -- \[C{ij}\]
227 --
228 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
229 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
230 --
231 -- Total cooccurrences of term @i@ given a map list of size @m@
232 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
233 --
234 -- Total cooccurrences of terms given a map list of size @m@
235 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
236 --
237 distributional :: Matrix Int -> Matrix Double
238 distributional m = run {- -- $ matMiniMax
239 -- $ ri
240 -- $ myMin
241 -}
242 $ filter' 0
243 $ s_mi
244 $ map fromIntegral
245 {- from Int to Double -}
246 $ use m
247 {- push matrix in Accelerate type -}
248 where
249 -- filter m = zipWith (\a b -> max a b) m (transpose m)
250
251 {-
252 ri :: Acc (Matrix Double) -> Acc (Matrix Double)
253 ri mat = mat1 -- zipWith (/) mat1 mat2
254 where
255 mat1 = matSumCol n $ zipWith min' (myMin mat) (myMin $ transpose mat)
256 mat2 = total mat
257 myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
258 myMin = replicate (constant (Z :. n :. All)) . minimum
259
260 -}
261
262 -- TODO fix NaN
263 -- Quali TEST: OK
264 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
265 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
266 $ zipWith (/) (crossProduct n m') (total m')
267 -- crossProduct n m'
268
269
270 total :: Acc (Matrix Double) -> Acc (Matrix Double)
271 total = replicate (constant (Z :. n :. n)) . sum . sum
272
273 n :: Dim
274 n = dim m
275
276 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
277 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
278 identityMatrix n =
279 let zeros = fill (index2 n n) 0
280 ones = fill (index1 n) 1
281 in
282 permute const zeros (\(unindex1 -> i) -> index2 i i) ones
283
284
285 eyeMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
286 eyeMatrix n' _m =
287 let ones = fill (index2 n n) 1
288 zeros = fill (index1 n) 0
289 n = constant n'
290 in
291 permute const ones (\(unindex1 -> i) -> index2 i i) zeros
292
293
294 selfMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
295 selfMatrix n' _m =
296 let zeros = fill (index2 n n) 0
297 ones = fill (index2 n n) 1
298 n = constant n'
299 in
300 permute const ones ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
301 -> -- ifThenElse (i /= j)
302 -- (Z :. i :. j)
303 (Z :. i :. i)
304 )) zeros
305
306 selfMatrix' :: (Elt a, P.Num (Exp a)) => Array DIM2 a -> Matrix a
307 selfMatrix' m' = run $ selfMatrix n m
308 where
309 n = dim m'
310 m = use m'
311
312 -------------------------------------------------
313 diagNull :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
314 diagNull n m = zipWith (*) m eye
315 where
316 eye = eyeMatrix n m
317
318
319 -------------------------------------------------
320 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
321 crossProduct n m = trace (P.show (run m',run m'')) $ zipWith (*) m' m''
322 where
323 m' = cross n m
324 m'' = cross n (transpose m)
325
326 crossT :: Matrix Double -> Matrix Double
327 crossT = run . transpose . use
328
329 crossProduct' :: Matrix Double -> Matrix Double
330 crossProduct' m = run $ crossProduct n m'
331 where
332 n = dim m
333 m' = use m
334
335 runWith :: (Arrays c, Elt a1)
336 => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
337 -> Matrix a1
338 -> a2
339 -> c
340 runWith f m = run . f (dim m) (use m)
341
342 -- | cross
343 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
344 cross n mat = diagNull n (matSumCol n $ diagNull n mat)
345
346 cross' :: Matrix Double -> Matrix Double
347 cross' mat = run $ cross n mat'
348 where
349 mat' = use mat
350 n = dim mat
351
352
353 -----------------------------------------------------------------------
354 -----------------------------------------------------------------------
355 -- * Specificity and Genericity
356
357 {- | Metric Specificity and genericity: select terms
358
359 - let N termes and occurrences of i \[N{i}\]
360
361 - Cooccurrences of i and j \[N{ij}\]
362 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
363
364 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
365 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
366
367 - \[Inclusion (i) = Gen(i) + Spec(i)\)
368 - \[GenericityScore = Gen(i)- Spec(i)\]
369
370 - References: Science mapping with asymmetrical paradigmatic proximity
371 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
372 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
373 arXiv:0803.2315 [cs.OH]
374 -}
375 type InclusionExclusion = Double
376 type SpecificityGenericity = Double
377
378 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
379 type SymetricMatrix = Matrix
380 type NonSymetricMatrix = Matrix
381
382
383 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
384 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
385 where
386 run' fun mat = run $ fun $ map fromIntegral $ use mat
387
388 -- | Inclusion (i) = Gen(i)+Spec(i)
389 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
390 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
391
392 -- | Genericity score = Gen(i)- Spec(i)
393 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
394 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
395
396 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
397 pV :: Acc (Matrix Double) -> Acc (Vector Double)
398 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
399
400 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
401 pH :: Acc (Matrix Double) -> Acc (Vector Double)
402 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
403
404 cardN :: Exp Double
405 cardN = constant (P.fromIntegral (dim m) :: Double)
406
407
408 -- | P(i|j) = Nij /N(jj) Probability to get i given j
409 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
410 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
411 p_ij m = zipWith (/) m (n_jj m)
412 where
413 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
414 n_jj myMat' = backpermute (shape m)
415 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
416 -> (Z :. j :. j)
417 )
418 ) myMat'
419
420 -- | P(j|i) = Nij /N(ii) Probability to get i given j
421 -- to test
422 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
423 p_ji = transpose . p_ij
424
425
426 -- | Step to ckeck the result in visual/qualitative tests
427 incExcSpeGen_proba :: Matrix Int -> Matrix Double
428 incExcSpeGen_proba m = run' pro m
429 where
430 run' fun mat = run $ fun $ map fromIntegral $ use mat
431
432 pro mat = p_ji mat
433
434 {-
435 -- | Hypothesis to test maybe later (or not)
436 -- TODO ask accelerate for instances to ease such writtings:
437 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
438 p_ m = zipWith (/) m (n_ m)
439 where
440 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
441 n_ m = backpermute (shape m)
442 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
443 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
444 )
445 ) m
446 -}
447
448 -- * For Tests (to be removed)
449 -- | Test perfermance with this matrix
450 -- TODO : add this in a benchmark folder
451 distriTest :: Matrix Double
452 distriTest = distributional $ matrix 100 [1..]
453 -----------------------------------------------------------------------
454