]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Graph/Distances/Matrice.hs
[FIX+DOC] haddock + warnings
[gargantext.git] / src / Gargantext / Viz / Graph / Distances / Matrice.hs
1 {-|
2 Module : Gargantext.Graph.Distances.Matrix
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
12
13
14 Implementation use Accelerate library which enables GPU and CPU computation:
15
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
21 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
22 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
23
24 -}
25
26 {-# LANGUAGE TypeFamilies #-}
27 {-# LANGUAGE TypeOperators #-}
28 {-# LANGUAGE ScopedTypeVariables #-}
29 {-# LANGUAGE ViewPatterns #-}
30
31 module Gargantext.Viz.Graph.Distances.Matrice
32 where
33
34 import Debug.Trace (trace)
35 import Data.Array.Accelerate
36 import Data.Array.Accelerate.Interpreter (run)
37 import qualified Gargantext.Prelude as P
38
39
40 -----------------------------------------------------------------------
41 -- | Define a vector
42 --
43 -- >>> vector 3
44 -- Vector (Z :. 3) [0,1,2]
45 vector :: Elt c => Int -> [c] -> (Array (Z :. Int) c)
46 vector n l = fromList (Z :. n) l
47
48 -- | Define a matrix
49 --
50 -- >>> matrix 3 ([1..] :: [Double])
51 -- Matrix (Z :. 3 :. 3)
52 -- [ 1.0, 2.0, 3.0,
53 -- 4.0, 5.0, 6.0,
54 -- 7.0, 8.0, 9.0]
55 matrix :: Elt c => Int -> [c] -> Matrix c
56 matrix n l = fromList (Z :. n :. n) l
57
58 -- | Two ways to get the rank (as documentation)
59 --
60 -- >>> rank (matrix 3 ([1..] :: [Int]))
61 -- 2
62 rank :: (Matrix a) -> Int
63 rank m = arrayRank $ arrayShape m
64
65 -----------------------------------------------------------------------
66 -- | Dimension of a square Matrix
67 -- How to force use with SquareMatrix ?
68 type Dim = Int
69
70 -- | Get Dimension of a square Matrix
71 --
72 -- >>> dim (matrix 3 ([1..] :: [Int]))
73 -- 3
74 dim :: Matrix a -> Dim
75 dim m = n
76 where
77 Z :. _ :. n = arrayShape m
78 -- indexTail (arrayShape m)
79
80 -----------------------------------------------------------------------
81 -- TODO move to Utils
82 runExp :: Elt e => Exp e -> e
83 runExp e = indexArray (run (unit e)) Z
84 -----------------------------------------------------------------------
85
86 -- | Sum of a Matrix by Column
87 --
88 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
89 -- Matrix (Z :. 3 :. 3)
90 -- [ 12.0, 15.0, 18.0,
91 -- 12.0, 15.0, 18.0,
92 -- 12.0, 15.0, 18.0]
93 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
94 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
95
96 matSumCol' :: Matrix Double -> Matrix Double
97 matSumCol' m = run $ matSumCol n m'
98 where
99 n = dim m
100 m' = use m
101
102
103 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
104 -- if you need get the probability on the lines, just transpose it
105 --
106 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
107 -- Matrix (Z :. 3 :. 3)
108 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
109 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
110 -- 0.5833333333333334, 0.5333333333333333, 0.5]
111 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
112 matProba r mat = zipWith (/) mat (matSumCol r mat)
113
114 -- | Diagonal of the matrix
115 --
116 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
117 -- Vector (Z :. 3) [1,5,9]
118 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
119 diag m = backpermute (indexTail (shape m))
120 (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
121 m
122
123 -- | Divide by the Diagonal of the matrix
124 --
125 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
126 -- Matrix (Z :. 3 :. 3)
127 -- [ 1.0, 0.4, 0.3333333333333333,
128 -- 4.0, 1.0, 0.6666666666666666,
129 -- 7.0, 1.6, 1.0]
130 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
131 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
132
133 -----------------------------------------------------------------------
134 -- | Filters the matrix with the minimum of maximums
135 --
136 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
137 -- Matrix (Z :. 3 :. 3)
138 -- [ 0.0, 4.0, 7.0,
139 -- 0.0, 5.0, 8.0,
140 -- 0.0, 6.0, 9.0]
141 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
142 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
143 where
144 miniMax' = (the $ minimum $ maximum m)
145
146 -- | Filters the matrix with a constant
147 --
148 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
149 -- Matrix (Z :. 3 :. 3)
150 -- [ 0.0, 0.0, 7.0,
151 -- 0.0, 0.0, 8.0,
152 -- 0.0, 6.0, 9.0]
153 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
154 filter' t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
155
156 -----------------------------------------------------------------------
157 -- * Measures of proximity
158 -----------------------------------------------------------------------
159 -- ** Conditional distance
160
161 -- *** Conditional distance (basic)
162
163 -- | Conditional distance (basic version)
164 --
165 -- 2 main measures are actually implemented in order to compute the
166 -- proximity of two terms: conditional and distributional
167 --
168 -- Conditional measure is an absolute measure which reflects
169 -- interactions of 2 terms in the corpus.
170 measureConditional :: Matrix Int -> Matrix Double
171 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
172 measureConditional m = run $ matProba (dim m)
173 $ map fromIntegral
174 $ use m
175
176
177 -- *** Conditional distance (advanced)
178
179 -- | Conditional distance (advanced version)
180 --
181 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
182 -- "confidence" , is the maximum probability between @i@ and @j@ to see
183 -- @i@ in the same context of @j@ knowing @j@.
184 --
185 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
186 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
187 --
188 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
189 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
190 conditional' m = ( run $ ie $ map fromIntegral $ use m
191 , run $ sg $ map fromIntegral $ use m
192 )
193 where
194 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
195 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
196 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
197 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
198
199 n :: Exp Double
200 n = P.fromIntegral r
201
202 r :: Dim
203 r = dim m
204
205 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
206 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
207 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
208 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
209
210 -----------------------------------------------------------------------
211 -- ** Distributional Distance
212
213 -- | Distributional Distance Measure
214 --
215 -- Distributional measure is a relative measure which depends on the
216 -- selected list, it represents structural equivalence of mutual information.
217 --
218 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
219 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
220 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
221 --
222 -- Mutual information
223 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
224 --
225 -- Number of cooccurrences of @i@ and @j@ in the same context of text
226 -- \[C{ij}\]
227 --
228 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
229 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
230 --
231 -- Total cooccurrences of term @i@ given a map list of size @m@
232 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
233 --
234 -- Total cooccurrences of terms given a map list of size @m@
235 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
236 --
237 distributional :: Matrix Int -> Matrix Double
238 distributional m = run -- $ matMiniMax
239 -- $ ri
240 -- $ myMin
241 $ filter' 0
242 $ s_mi
243 $ map fromIntegral -- ^ from Int to Double
244 $ use m -- ^ push matrix in Accelerate type
245 where
246 -- filter m = zipWith (\a b -> max a b) m (transpose m)
247
248 {-
249 ri :: Acc (Matrix Double) -> Acc (Matrix Double)
250 ri mat = mat1 -- zipWith (/) mat1 mat2
251 where
252 mat1 = matSumCol n $ zipWith min' (myMin mat) (myMin $ transpose mat)
253 mat2 = total mat
254 myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
255 myMin = replicate (constant (Z :. n :. All)) . minimum
256
257 -}
258
259 -- TODO fix NaN
260 -- Quali TEST: OK
261 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
262 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
263 $ zipWith (/) (crossProduct n m') (total m')
264 -- crossProduct n m'
265
266
267 total :: Acc (Matrix Double) -> Acc (Matrix Double)
268 total = replicate (constant (Z :. n :. n)) . sum . sum
269
270 n :: Dim
271 n = dim m
272
273 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
274 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
275 identityMatrix n =
276 let zeros = fill (index2 n n) 0
277 ones = fill (index1 n) 1
278 in
279 permute const zeros (\(unindex1 -> i) -> index2 i i) ones
280
281
282 eyeMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
283 eyeMatrix n' _m =
284 let ones = fill (index2 n n) 1
285 zeros = fill (index1 n) 0
286 n = constant n'
287 in
288 permute const ones (\(unindex1 -> i) -> index2 i i) zeros
289
290
291 selfMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
292 selfMatrix n' _m =
293 let zeros = fill (index2 n n) 0
294 ones = fill (index2 n n) 1
295 n = constant n'
296 in
297 permute const ones ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
298 -> -- ifThenElse (i /= j)
299 -- (Z :. i :. j)
300 (Z :. i :. i)
301 )) zeros
302
303 selfMatrix' :: (Elt a, P.Num (Exp a)) => Array DIM2 a -> Matrix a
304 selfMatrix' m' = run $ selfMatrix n m
305 where
306 n = dim m'
307 m = use m'
308
309 -------------------------------------------------
310 diagNull :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
311 diagNull n m = zipWith (*) m eye
312 where
313 eye = eyeMatrix n m
314
315
316 -------------------------------------------------
317 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
318 crossProduct n m = trace (P.show (run m',run m'')) $ zipWith (*) m' m''
319 where
320 m' = cross n m
321 m'' = cross n (transpose m)
322
323 crossT :: Matrix Double -> Matrix Double
324 crossT = run . transpose . use
325
326 crossProduct' :: Matrix Double -> Matrix Double
327 crossProduct' m = run $ crossProduct n m'
328 where
329 n = dim m
330 m' = use m
331
332 runWith :: (Arrays c, Elt a1)
333 => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
334 -> Matrix a1
335 -> a2
336 -> c
337 runWith f m = run . f (dim m) (use m)
338
339 -- | cross
340 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
341 cross n mat = diagNull n (matSumCol n $ diagNull n mat)
342
343 cross' :: Matrix Double -> Matrix Double
344 cross' mat = run $ cross n mat'
345 where
346 mat' = use mat
347 n = dim mat
348
349
350 -----------------------------------------------------------------------
351 -----------------------------------------------------------------------
352 -- * Specificity and Genericity
353
354 {- | Metric Specificity and genericity: select terms
355
356 - let N termes and occurrences of i \[N{i}\]
357
358 - Cooccurrences of i and j \[N{ij}\]
359 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
360
361 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
362 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
363
364 - \[Inclusion (i) = Gen(i) + Spec(i)\)
365 - \[GenericityScore = Gen(i)- Spec(i)\]
366
367 - References: Science mapping with asymmetrical paradigmatic proximity
368 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
369 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
370 arXiv:0803.2315 [cs.OH]
371 -}
372 type InclusionExclusion = Double
373 type SpecificityGenericity = Double
374
375 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
376 type SymetricMatrix = Matrix
377 type NonSymetricMatrix = Matrix
378
379
380 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
381 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
382 where
383 run' fun mat = run $ fun $ map fromIntegral $ use mat
384
385 -- | Inclusion (i) = Gen(i)+Spec(i)
386 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
387 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
388
389 -- | Genericity score = Gen(i)- Spec(i)
390 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
391 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
392
393 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
394 pV :: Acc (Matrix Double) -> Acc (Vector Double)
395 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
396
397 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
398 pH :: Acc (Matrix Double) -> Acc (Vector Double)
399 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
400
401 cardN :: Exp Double
402 cardN = constant (P.fromIntegral (dim m) :: Double)
403
404
405 -- | P(i|j) = Nij /N(jj) Probability to get i given j
406 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
407 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
408 p_ij m = zipWith (/) m (n_jj m)
409 where
410 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
411 n_jj myMat' = backpermute (shape m)
412 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
413 -> (Z :. j :. j)
414 )
415 ) myMat'
416
417 -- | P(j|i) = Nij /N(ii) Probability to get i given j
418 -- to test
419 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
420 p_ji = transpose . p_ij
421
422
423 -- | Step to ckeck the result in visual/qualitative tests
424 incExcSpeGen_proba :: Matrix Int -> Matrix Double
425 incExcSpeGen_proba m = run' pro m
426 where
427 run' fun mat = run $ fun $ map fromIntegral $ use mat
428
429 pro mat = p_ji mat
430
431 {-
432 -- | Hypothesis to test maybe later (or not)
433 -- TODO ask accelerate for instances to ease such writtings:
434 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
435 p_ m = zipWith (/) m (n_ m)
436 where
437 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
438 n_ m = backpermute (shape m)
439 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
440 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
441 )
442 ) m
443 -}
444
445 -- * For Tests (to be removed)
446 -- | Test perfermance with this matrix
447 -- TODO : add this in a benchmark folder
448 distriTest :: Matrix Double
449 distriTest = distributional $ matrix 100 [1..]
450 -----------------------------------------------------------------------
451