]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Graph/Distances/Matrice.hs
Merge branch 'dev-list-charts' into dev-doc-table-optimization
[gargantext.git] / src / Gargantext / Viz / Graph / Distances / Matrice.hs
1 {-|
2 Module : Gargantext.Graph.Distances.Matrix
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
12
13
14 Implementation use Accelerate library which enables GPU and CPU computation:
15
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
21 [Optimising Purely Functional GPU Programs][MCKL13].
22 In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
23
24 * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
25 [Embedding Foreign Code][CMCK14].
26 In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
27
28 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
29 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
30 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
31
32 -}
33
34 {-# LANGUAGE TypeFamilies #-}
35 {-# LANGUAGE TypeOperators #-}
36 {-# LANGUAGE ScopedTypeVariables #-}
37
38 module Gargantext.Viz.Graph.Distances.Matrice
39 where
40
41 import Data.Array.Accelerate
42 import Data.Array.Accelerate.Interpreter (run)
43
44 import qualified Gargantext.Prelude as P
45
46
47 -----------------------------------------------------------------------
48 -- | Define a vector
49 --
50 -- >>> vector 3
51 -- Vector (Z :. 3) [0,1,2]
52 vector :: Int -> (Array (Z :. Int) Int)
53 vector n = fromList (Z :. n) [0..n]
54
55 -- | Define a matrix
56 --
57 -- >>> matrix 3 ([1..] :: [Double])
58 -- Matrix (Z :. 3 :. 3)
59 -- [ 1.0, 2.0, 3.0,
60 -- 4.0, 5.0, 6.0,
61 -- 7.0, 8.0, 9.0]
62 matrix :: Elt c => Int -> [c] -> Matrix c
63 matrix n l = fromList (Z :. n :. n) l
64
65 -- | Two ways to get the rank (as documentation)
66 --
67 -- >>> rank (matrix 3 ([1..] :: [Int]))
68 -- 2
69 rank :: (Matrix a) -> Int
70 rank m = arrayRank $ arrayShape m
71
72 -----------------------------------------------------------------------
73 -- | Dimension of a square Matrix
74 -- How to force use with SquareMatrix ?
75 type Dim = Int
76
77 -- | Get Dimension of a square Matrix
78 --
79 -- >>> dim (matrix 3 ([1..] :: [Int]))
80 -- 3
81 dim :: Matrix a -> Dim
82 dim m = n
83 where
84 Z :. _ :. n = arrayShape m
85 -- indexTail (arrayShape m)
86
87 -----------------------------------------------------------------------
88
89 -- | Sum of a Matrix by Column
90 --
91 -- >>> run $ matSum 3 (use $ matrix 3 [1..])
92 -- Matrix (Z :. 3 :. 3)
93 -- [ 12.0, 15.0, 18.0,
94 -- 12.0, 15.0, 18.0,
95 -- 12.0, 15.0, 18.0]
96 matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
97 matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
98
99
100 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
101 -- if you need get the probability on the lines, just transpose it
102 --
103 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
104 -- Matrix (Z :. 3 :. 3)
105 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
106 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
107 -- 0.5833333333333334, 0.5333333333333333, 0.5]
108 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
109 matProba r mat = zipWith (/) mat (matSum r mat)
110
111 -- | Diagonal of the matrix
112 --
113 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
114 -- Vector (Z :. 3) [1,5,9]
115 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
116 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
117
118 -- | Divide by the Diagonal of the matrix
119 --
120 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
121 -- Matrix (Z :. 3 :. 3)
122 -- [ 1.0, 0.4, 0.3333333333333333,
123 -- 4.0, 1.0, 0.6666666666666666,
124 -- 7.0, 1.6, 1.0]
125 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
126 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
127
128 -----------------------------------------------------------------------
129 -- | Filters the matrix with the minimum of maximums
130 --
131 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
132 -- Matrix (Z :. 3 :. 3)
133 -- [ 0.0, 4.0, 7.0,
134 -- 0.0, 5.0, 8.0,
135 -- 0.0, 6.0, 9.0]
136 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
137 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
138 where
139 miniMax' = (the $ minimum $ maximum m)
140
141 -- | Filters the matrix with a constant
142 --
143 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
144 -- Matrix (Z :. 3 :. 3)
145 -- [ 0.0, 0.0, 7.0,
146 -- 0.0, 0.0, 8.0,
147 -- 0.0, 6.0, 9.0]
148 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
149 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
150
151 -----------------------------------------------------------------------
152 -- * Measures of proximity
153 -----------------------------------------------------------------------
154 -- ** Conditional distance
155
156 -- *** Conditional distance (basic)
157
158 -- | Conditional distance (basic version)
159 --
160 -- 2 main measures are actually implemented in order to compute the
161 -- proximity of two terms: conditional and distributional
162 --
163 -- Conditional measure is an absolute measure which reflects
164 -- interactions of 2 terms in the corpus.
165 measureConditional :: Matrix Int -> Matrix Double
166 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
167 measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
168
169
170 -- *** Conditional distance (advanced)
171
172 -- | Conditional distance (advanced version)
173 --
174 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
175 -- "confidence" , is the maximum probability between @i@ and @j@ to see
176 -- @i@ in the same context of @j@ knowing @j@.
177 --
178 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
179 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
180 --
181 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
182 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
183 conditional' m = ( run $ ie $ map fromIntegral $ use m
184 , run $ sg $ map fromIntegral $ use m
185 )
186 where
187 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
188 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
189 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
190 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
191
192 n :: Exp Double
193 n = P.fromIntegral r
194
195 r :: Dim
196 r = dim m
197
198 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
199 xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
200 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
201 ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
202
203 -----------------------------------------------------------------------
204 -- ** Distributional Distance
205
206 -- | Distributional Distance Measure
207 --
208 -- Distributional measure is a relative measure which depends on the
209 -- selected list, it represents structural equivalence.
210 --
211 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
212 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
213 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
214 --
215 -- Mutual information
216 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
217 --
218 -- Number of cooccurrences of @i@ and @j@ in the same context of text
219 -- \[C{ij}\]
220 --
221 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
222 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
223 --
224 -- Total cooccurrences of term @i@ given a map list of size @m@
225 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
226 --
227 -- Total cooccurrences of terms given a map list of size @m@
228 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
229 --
230 distributional :: Matrix Int -> Matrix Double
231 distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
232 where
233
234 -- filter m = zipWith (\a b -> max a b) m (transpose m)
235
236 ri mat = zipWith (/) mat1 mat2
237 where
238 mat1 = matSum n $ zipWith min (s_mi mat) (s_mi $ transpose mat)
239 mat2 = matSum n mat
240
241 s_mi m' = zipWith (\a b -> log (a/b)) m'
242 $ zipWith (/) (crossProduct m') (total m')
243
244 total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
245
246 n :: Dim
247 n = dim m
248
249 crossProduct m''' = zipWith (*) (cross m''' ) (cross (transpose m'''))
250 cross mat = zipWith (-) (matSum n mat) (mat)
251
252 -----------------------------------------------------------------------
253 -----------------------------------------------------------------------
254 -- * Specificity and Genericity
255
256 {- | Metric Specificity and genericity: select terms
257
258 - let N termes and occurrences of i \[N{i}\]
259
260 - Cooccurrences of i and j \[N{ij}\]
261 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
262
263 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
264 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
265
266 - \[Inclusion (i) = Gen(i) + Spec(i)\)
267 - \[GenericityScore = Gen(i)- Spec(i)\]
268
269 - References: Science mapping with asymmetrical paradigmatic proximity
270 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
271 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
272 arXiv:0803.2315 [cs.OH]
273 -}
274 type InclusionExclusion = Double
275 type SpecificityGenericity = Double
276
277 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
278 type SymetricMatrix = Matrix
279 type NonSymetricMatrix = Matrix
280
281
282 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
283 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
284 where
285 run' fun mat = run $ fun $ map fromIntegral $ use mat
286
287 -- | Inclusion (i) = Gen(i)+Spec(i)
288 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
289 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
290
291 -- | Genericity score = Gen(i)- Spec(i)
292 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
293 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
294
295 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
296 pV :: Acc (Matrix Double) -> Acc (Vector Double)
297 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
298
299 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
300 pH :: Acc (Matrix Double) -> Acc (Vector Double)
301 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
302
303 cardN :: Exp Double
304 cardN = constant (P.fromIntegral (dim m) :: Double)
305
306
307 -- | P(i|j) = Nij /N(jj) Probability to get i given j
308 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
309 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
310 p_ij m = zipWith (/) m (n_jj m)
311 where
312 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
313 n_jj myMat' = backpermute (shape m)
314 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
315 -> (Z :. j :. j)
316 )
317 ) myMat'
318
319 -- | P(j|i) = Nij /N(ii) Probability to get i given j
320 -- to test
321 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
322 p_ji = transpose . p_ij
323
324
325 -- | Step to ckeck the result in visual/qualitative tests
326 incExcSpeGen_proba :: Matrix Int -> Matrix Double
327 incExcSpeGen_proba m = run' pro m
328 where
329 run' fun mat = run $ fun $ map fromIntegral $ use mat
330
331 pro mat = p_ji mat
332
333 {-
334 -- | Hypothesis to test maybe later (or not)
335 -- TODO ask accelerate for instances to ease such writtings:
336 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
337 p_ m = zipWith (/) m (n_ m)
338 where
339 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
340 n_ m = backpermute (shape m)
341 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
342 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
343 )
344 ) m
345 -}
346
347 -- * For Tests (to be removed)
348 -- | Test perfermance with this matrix
349 -- TODO : add this in a benchmark folder
350 distriTest :: Matrix Double
351 distriTest = distributional $ matrix 100 [1..]
352 -----------------------------------------------------------------------
353