]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Graph/Distances/Matrice.hs
[FIX] some warnings/errors at compilation time. OK.
[gargantext.git] / src / Gargantext / Viz / Graph / Distances / Matrice.hs
1 {-|
2 Module : Gargantext.Graph.Distances.Matrix
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
12
13
14 Implementation use Accelerate library which enables GPU and CPU computation:
15
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
21 [Optimising Purely Functional GPU Programs][MCKL13].
22 In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
23
24 * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
25 [Embedding Foreign Code][CMCK14].
26 In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
27
28 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
29 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
30 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
31
32 -}
33
34 {-# LANGUAGE NoImplicitPrelude #-}
35 {-# LANGUAGE FlexibleContexts #-}
36 {-# LANGUAGE TypeFamilies #-}
37 {-# LANGUAGE TypeOperators #-}
38 {-# LANGUAGE ScopedTypeVariables #-}
39
40 module Gargantext.Viz.Graph.Distances.Matrice
41 where
42
43 import Data.Array.Accelerate
44 import Data.Array.Accelerate.Interpreter (run)
45
46 import qualified Gargantext.Prelude as P
47
48
49 -----------------------------------------------------------------------
50 -- | Define a vector
51 --
52 -- >>> vector 3
53 -- Vector (Z :. 3) [0,1,2]
54 vector :: Int -> (Array (Z :. Int) Int)
55 vector n = fromList (Z :. n) [0..n]
56
57 -- | Define a matrix
58 --
59 -- >>> matrix 3 ([1..] :: [Double])
60 -- Matrix (Z :. 3 :. 3)
61 -- [ 1.0, 2.0, 3.0,
62 -- 4.0, 5.0, 6.0,
63 -- 7.0, 8.0, 9.0]
64 matrix :: Elt c => Int -> [c] -> Matrix c
65 matrix n l = fromList (Z :. n :. n) l
66
67 -- | Two ways to get the rank (as documentation)
68 --
69 -- >>> rank (matrix 3 ([1..] :: [Int]))
70 -- 2
71 rank :: (Matrix a) -> Int
72 rank m = arrayRank $ arrayShape m
73
74 -----------------------------------------------------------------------
75 -- | Dimension of a square Matrix
76 -- How to force use with SquareMatrix ?
77 type Dim = Int
78
79 -- | Get Dimension of a square Matrix
80 --
81 -- >>> dim (matrix 3 ([1..] :: [Int]))
82 -- 3
83 dim :: Matrix a -> Dim
84 dim m = n
85 where
86 Z :. _ :. n = arrayShape m
87 -- indexTail (arrayShape m)
88
89 -----------------------------------------------------------------------
90
91 -- | Sum of a Matrix by Column
92 --
93 -- >>> run $ matSum 3 (use $ matrix 3 [1..])
94 -- Matrix (Z :. 3 :. 3)
95 -- [ 12.0, 15.0, 18.0,
96 -- 12.0, 15.0, 18.0,
97 -- 12.0, 15.0, 18.0]
98 matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
99 matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
100
101
102 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
103 -- if you need get the probability on the lines, just transpose it
104 --
105 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
106 -- Matrix (Z :. 3 :. 3)
107 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
108 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
109 -- 0.5833333333333334, 0.5333333333333333, 0.5]
110 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
111 matProba r mat = zipWith (/) mat (matSum r mat)
112
113 -- | Diagonal of the matrix
114 --
115 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
116 -- Vector (Z :. 3) [1,5,9]
117 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
118 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
119
120
121 -- | Divide by the Diagonal of the matrix
122 --
123 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
124 -- Matrix (Z :. 3 :. 3)
125 -- [ 1.0, 0.4, 0.3333333333333333,
126 -- 4.0, 1.0, 0.6666666666666666,
127 -- 7.0, 1.6, 1.0]
128 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
129 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
130
131 -----------------------------------------------------------------------
132 -- | Filters the matrix with the minimum of maximums
133 --
134 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
135 -- Matrix (Z :. 3 :. 3)
136 -- [ 0.0, 4.0, 7.0,
137 -- 0.0, 5.0, 8.0,
138 -- 0.0, 6.0, 9.0]
139 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
140 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
141 where
142 miniMax' = (the $ minimum $ maximum m)
143
144 -- | Filters the matrix with a constant
145 --
146 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
147 -- Matrix (Z :. 3 :. 3)
148 -- [ 0.0, 0.0, 7.0,
149 -- 0.0, 0.0, 8.0,
150 -- 0.0, 6.0, 9.0]
151 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
152 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
153
154 -----------------------------------------------------------------------
155 -- * Measures of proximity
156 -----------------------------------------------------------------------
157 -- ** Conditional distance
158
159 -- *** Conditional distance (basic)
160
161 -- | Conditional distance (basic version)
162 --
163 -- 2 main measures are actually implemented in order to compute the
164 -- proximity of two terms: conditional and distributional
165 --
166 -- Conditional measure is an absolute measure which reflects
167 -- interactions of 2 terms in the corpus.
168 measureConditional :: Matrix Int -> Matrix Double
169 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
170 measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
171
172
173 -- *** Conditional distance (advanced)
174
175 -- | Conditional distance (advanced version)
176 --
177 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
178 -- "confidence" , is the maximum probability between @i@ and @j@ to see
179 -- @i@ in the same context of @j@ knowing @j@.
180 --
181 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
182 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
183 --
184 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
185 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
186 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
187 where
188 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
189 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
190 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
191 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
192
193 n :: Exp Double
194 n = P.fromIntegral r
195
196 r :: Dim
197 r = dim m
198
199 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
200 xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
201 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
202 ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
203
204 -----------------------------------------------------------------------
205 -- ** Distributional Distance
206
207 -- | Distributional Distance Measure
208 --
209 -- Distributional measure is a relative measure which depends on the
210 -- selected list, it represents structural equivalence.
211 --
212 -- The distributional measure \[P_c\] of @i@ and @j@ terms is: \[
213 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
214 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
215 --
216 -- Mutual information
217 -- \[S{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
218 --
219 -- Number of cooccurrences of @i@ and @j@ in the same context of text
220 -- \[C{ij}\]
221 --
222 -- The expected value of the cooccurrences
223 -- \[E_{ij} = \frac {S_{i} S_{j}} {N}\]
224 --
225 -- Total cooccurrences of @i@ term
226 -- \[N_{i} = \sum_{i}^{} S_{i}\]
227 distributional :: Matrix Int -> Matrix Double
228 distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
229 where
230
231 -- filter m = zipWith (\a b -> max a b) m (transpose m)
232
233 ri mat = zipWith (/) mat1 mat2
234 where
235 mat1 = matSum n $ zipWith min (mi mat) (mi $ transpose mat)
236 mat2 = matSum n mat
237
238 mi m' = zipWith (\a b -> max (log $ a/b) 0) m'
239 $ zipWith (/) (crossProduct m') (total m')
240
241 total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
242 n = dim m
243
244 crossProduct m''' = zipWith (*) (cross m''' ) (cross (transpose m'''))
245 cross mat = zipWith (-) (matSum n mat) (mat)
246
247 -----------------------------------------------------------------------
248 -----------------------------------------------------------------------
249 -- * Specificity and Genericity
250
251 {- | Metric Specificity and genericity: select terms
252
253 - let N termes and occurrences of i \[N{i}\]
254
255 - Cooccurrences of i and j \[N{ij}\]
256 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
257
258 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
259 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
260
261 - \[Inclusion (i) = Gen(i) + Spec(i)\)
262 - \[GenericityScore = Gen(i)- Spec(i)\]
263
264 - References: Science mapping with asymmetrical paradigmatic proximity
265 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
266 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
267 arXiv:0803.2315 [cs.OH]
268 -}
269 type InclusionExclusion = Double
270 type SpecificityGenericity = Double
271
272 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
273 type SymetricMatrix = Matrix
274 type NonSymetricMatrix = Matrix
275
276
277 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
278 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
279 where
280 run' fun mat = run $ fun $ map fromIntegral $ use mat
281
282 -- | Inclusion (i) = Gen(i)+Spec(i)
283 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
284 inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
285
286 -- | Genericity score = Gen(i)- Spec(i)
287 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
288 specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
289
290 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
291 pV :: Acc (Matrix Double) -> Acc (Vector Double)
292 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
293
294 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
295 pH :: Acc (Matrix Double) -> Acc (Vector Double)
296 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
297
298 cardN :: Exp Double
299 cardN = constant (P.fromIntegral (dim m) :: Double)
300
301
302 -- | P(i|j) = Nij /N(jj) Probability to get i given j
303 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
304 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
305 p_ij m = zipWith (/) m (n_jj m)
306 where
307 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
308 n_jj myMat' = backpermute (shape m)
309 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
310 -> (Z :. j :. j)
311 )
312 ) myMat'
313
314 -- | P(j|i) = Nij /N(ii) Probability to get i given j
315 -- to test
316 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
317 p_ji = transpose . p_ij
318
319
320 -- | Step to ckeck the result in visual/qualitative tests
321 incExcSpeGen_proba :: Matrix Int -> Matrix Double
322 incExcSpeGen_proba m = run' pro m
323 where
324 run' fun mat = run $ fun $ map fromIntegral $ use mat
325
326 pro mat = p_ji mat
327
328 {-
329 -- | Hypothesis to test maybe later (or not)
330 -- TODO ask accelerate for instances to ease such writtings:
331 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
332 p_ m = zipWith (/) m (n_ m)
333 where
334 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
335 n_ m = backpermute (shape m)
336 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
337 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
338 )
339 ) m
340 -}
341
342 -- * For Tests (to be removed)
343 -- | Test perfermance with this matrix
344 -- TODO : add this in a benchmark folder
345 distriTest :: Matrix Double
346 distriTest = distributional $ matrix 100 [1..]
347 -----------------------------------------------------------------------
348