]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Graph/Distances/Matrice.hs
[TEST] fix tests (WIP)
[gargantext.git] / src / Gargantext / Viz / Graph / Distances / Matrice.hs
1 {-|
2 Module : Gargantext.Graph.Distances.Matrix
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
12
13
14 Implementation use Accelerate library which enables GPU and CPU computation:
15
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
21 [Optimising Purely Functional GPU Programs][MCKL13].
22 In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
23
24 * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
25 [Embedding Foreign Code][CMCK14].
26 In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
27
28 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
29 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
30 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
31
32 -}
33
34 {-# LANGUAGE TypeFamilies #-}
35 {-# LANGUAGE TypeOperators #-}
36 {-# LANGUAGE ScopedTypeVariables #-}
37
38 module Gargantext.Viz.Graph.Distances.Matrice
39 where
40
41 import Data.Array.Accelerate
42 import Data.Array.Accelerate.Interpreter (run)
43
44 import qualified Gargantext.Prelude as P
45
46
47 -----------------------------------------------------------------------
48 -- | Define a vector
49 --
50 -- >>> vector 3
51 -- Vector (Z :. 3) [0,1,2]
52 vector :: Int -> (Array (Z :. Int) Int)
53 vector n = fromList (Z :. n) [0..n]
54
55 -- | Define a matrix
56 --
57 -- >>> matrix 3 ([1..] :: [Double])
58 -- Matrix (Z :. 3 :. 3)
59 -- [ 1.0, 2.0, 3.0,
60 -- 4.0, 5.0, 6.0,
61 -- 7.0, 8.0, 9.0]
62 matrix :: Elt c => Int -> [c] -> Matrix c
63 matrix n l = fromList (Z :. n :. n) l
64
65 -- | Two ways to get the rank (as documentation)
66 --
67 -- >>> rank (matrix 3 ([1..] :: [Int]))
68 -- 2
69 rank :: (Matrix a) -> Int
70 rank m = arrayRank $ arrayShape m
71
72 -----------------------------------------------------------------------
73 -- | Dimension of a square Matrix
74 -- How to force use with SquareMatrix ?
75 type Dim = Int
76
77 -- | Get Dimension of a square Matrix
78 --
79 -- >>> dim (matrix 3 ([1..] :: [Int]))
80 -- 3
81 dim :: Matrix a -> Dim
82 dim m = n
83 where
84 Z :. _ :. n = arrayShape m
85 -- indexTail (arrayShape m)
86
87 -----------------------------------------------------------------------
88
89 -- | Sum of a Matrix by Column
90 --
91 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
92 -- Matrix (Z :. 3 :. 3)
93 -- [ 12.0, 15.0, 18.0,
94 -- 12.0, 15.0, 18.0,
95 -- 12.0, 15.0, 18.0]
96 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
97 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
98
99 matSumCol' :: Matrix Double -> Matrix Double
100 matSumCol' m = run $ matSumCol n m'
101 where
102 n = dim m
103 m' = use m
104
105
106 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
107 -- if you need get the probability on the lines, just transpose it
108 --
109 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
110 -- Matrix (Z :. 3 :. 3)
111 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
112 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
113 -- 0.5833333333333334, 0.5333333333333333, 0.5]
114 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
115 matProba r mat = zipWith (/) mat (matSumCol r mat)
116
117 -- | Diagonal of the matrix
118 --
119 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
120 -- Vector (Z :. 3) [1,5,9]
121 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
122 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
123
124 -- | Divide by the Diagonal of the matrix
125 --
126 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
127 -- Matrix (Z :. 3 :. 3)
128 -- [ 1.0, 0.4, 0.3333333333333333,
129 -- 4.0, 1.0, 0.6666666666666666,
130 -- 7.0, 1.6, 1.0]
131 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
132 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
133
134 -----------------------------------------------------------------------
135 -- | Filters the matrix with the minimum of maximums
136 --
137 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
138 -- Matrix (Z :. 3 :. 3)
139 -- [ 0.0, 4.0, 7.0,
140 -- 0.0, 5.0, 8.0,
141 -- 0.0, 6.0, 9.0]
142 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
143 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
144 where
145 miniMax' = (the $ minimum $ maximum m)
146
147 -- | Filters the matrix with a constant
148 --
149 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
150 -- Matrix (Z :. 3 :. 3)
151 -- [ 0.0, 0.0, 7.0,
152 -- 0.0, 0.0, 8.0,
153 -- 0.0, 6.0, 9.0]
154 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
155 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
156
157 -----------------------------------------------------------------------
158 -- * Measures of proximity
159 -----------------------------------------------------------------------
160 -- ** Conditional distance
161
162 -- *** Conditional distance (basic)
163
164 -- | Conditional distance (basic version)
165 --
166 -- 2 main measures are actually implemented in order to compute the
167 -- proximity of two terms: conditional and distributional
168 --
169 -- Conditional measure is an absolute measure which reflects
170 -- interactions of 2 terms in the corpus.
171 measureConditional :: Matrix Int -> Matrix Double
172 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
173 measureConditional m = run $ matProba (dim m)
174 $ map fromIntegral
175 $ use m
176
177
178 -- *** Conditional distance (advanced)
179
180 -- | Conditional distance (advanced version)
181 --
182 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
183 -- "confidence" , is the maximum probability between @i@ and @j@ to see
184 -- @i@ in the same context of @j@ knowing @j@.
185 --
186 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
187 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
188 --
189 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
190 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
191 conditional' m = ( run $ ie $ map fromIntegral $ use m
192 , run $ sg $ map fromIntegral $ use m
193 )
194 where
195 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
196 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
197 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
198 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
199
200 n :: Exp Double
201 n = P.fromIntegral r
202
203 r :: Dim
204 r = dim m
205
206 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
207 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
208 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
209 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
210
211 -----------------------------------------------------------------------
212 -- ** Distributional Distance
213
214 -- | Distributional Distance Measure
215 --
216 -- Distributional measure is a relative measure which depends on the
217 -- selected list, it represents structural equivalence of mutual information.
218 --
219 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
220 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
221 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
222 --
223 -- Mutual information
224 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
225 --
226 -- Number of cooccurrences of @i@ and @j@ in the same context of text
227 -- \[C{ij}\]
228 --
229 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
230 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
231 --
232 -- Total cooccurrences of term @i@ given a map list of size @m@
233 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
234 --
235 -- Total cooccurrences of terms given a map list of size @m@
236 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
237 --
238 distributional :: Matrix Int -> Matrix Double
239 distributional m = run $ matMiniMax
240 $ ri
241 $ map fromIntegral -- ^ from Int to Double
242 $ use m -- ^ push matrix in Accelerate type
243 where
244 -- filter m = zipWith (\a b -> max a b) m (transpose m)
245
246 ri :: Acc (Matrix Double) -> Acc (Matrix Double)
247 ri mat = zipWith (/) mat1 mat2
248 where
249 mat1 = matSumCol n $ zipWith min (s_mi mat) (s_mi $ transpose mat)
250 mat2 = matSumCol n mat
251
252 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
253 s_mi m' = zipWith (\a b -> log (a/b)) m'
254 $ zipWith (/) (crossProduct m') (total m')
255
256 total :: Acc (Matrix Double) -> Acc (Matrix Double)
257 total = replicate (constant (Z :. n :. n)) . sum . sum
258
259 n :: Dim
260 n = dim m
261
262 crossProduct :: Acc (Matrix Double) -> Acc (Matrix Double)
263 crossProduct m''' = zipWith (*) (cross m''' ) (cross (transpose m'''))
264 cross :: Acc (Matrix Double) -> Acc (Matrix Double)
265 cross mat = zipWith (-) (matSumCol n mat) (mat)
266
267 -- | cross
268 {-
269 cross :: Matrix Double -> Matrix Double
270 cross mat = run $ zipWith (-) (matSumCol n mat') (mat')
271 where
272 mat' = use mat
273 n = dim mat
274 -}
275
276
277 -----------------------------------------------------------------------
278 -----------------------------------------------------------------------
279 -- * Specificity and Genericity
280
281 {- | Metric Specificity and genericity: select terms
282
283 - let N termes and occurrences of i \[N{i}\]
284
285 - Cooccurrences of i and j \[N{ij}\]
286 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
287
288 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
289 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
290
291 - \[Inclusion (i) = Gen(i) + Spec(i)\)
292 - \[GenericityScore = Gen(i)- Spec(i)\]
293
294 - References: Science mapping with asymmetrical paradigmatic proximity
295 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
296 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
297 arXiv:0803.2315 [cs.OH]
298 -}
299 type InclusionExclusion = Double
300 type SpecificityGenericity = Double
301
302 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
303 type SymetricMatrix = Matrix
304 type NonSymetricMatrix = Matrix
305
306
307 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
308 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
309 where
310 run' fun mat = run $ fun $ map fromIntegral $ use mat
311
312 -- | Inclusion (i) = Gen(i)+Spec(i)
313 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
314 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
315
316 -- | Genericity score = Gen(i)- Spec(i)
317 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
318 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
319
320 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
321 pV :: Acc (Matrix Double) -> Acc (Vector Double)
322 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
323
324 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
325 pH :: Acc (Matrix Double) -> Acc (Vector Double)
326 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
327
328 cardN :: Exp Double
329 cardN = constant (P.fromIntegral (dim m) :: Double)
330
331
332 -- | P(i|j) = Nij /N(jj) Probability to get i given j
333 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
334 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
335 p_ij m = zipWith (/) m (n_jj m)
336 where
337 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
338 n_jj myMat' = backpermute (shape m)
339 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
340 -> (Z :. j :. j)
341 )
342 ) myMat'
343
344 -- | P(j|i) = Nij /N(ii) Probability to get i given j
345 -- to test
346 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
347 p_ji = transpose . p_ij
348
349
350 -- | Step to ckeck the result in visual/qualitative tests
351 incExcSpeGen_proba :: Matrix Int -> Matrix Double
352 incExcSpeGen_proba m = run' pro m
353 where
354 run' fun mat = run $ fun $ map fromIntegral $ use mat
355
356 pro mat = p_ji mat
357
358 {-
359 -- | Hypothesis to test maybe later (or not)
360 -- TODO ask accelerate for instances to ease such writtings:
361 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
362 p_ m = zipWith (/) m (n_ m)
363 where
364 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
365 n_ m = backpermute (shape m)
366 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
367 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
368 )
369 ) m
370 -}
371
372 -- * For Tests (to be removed)
373 -- | Test perfermance with this matrix
374 -- TODO : add this in a benchmark folder
375 distriTest :: Matrix Double
376 distriTest = distributional $ matrix 100 [1..]
377 -----------------------------------------------------------------------
378