]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Graph/Distances/Matrice.hs
[docker] update image, add README info
[gargantext.git] / src / Gargantext / Viz / Graph / Distances / Matrice.hs
1 {-|
2 Module : Gargantext.Graph.Distances.Matrix
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
12
13
14 Implementation use Accelerate library which enables GPU and CPU computation:
15
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
21 [Optimising Purely Functional GPU Programs][MCKL13].
22 In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
23
24 * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
25 [Embedding Foreign Code][CMCK14].
26 In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
27
28 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
29 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
30 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
31
32 -}
33
34 {-# LANGUAGE NoImplicitPrelude #-}
35 {-# LANGUAGE FlexibleContexts #-}
36 {-# LANGUAGE TypeFamilies #-}
37 {-# LANGUAGE TypeOperators #-}
38 {-# LANGUAGE ScopedTypeVariables #-}
39
40 module Gargantext.Viz.Graph.Distances.Matrice
41 where
42
43 import Data.Array.Accelerate
44 import Data.Array.Accelerate.Interpreter (run)
45
46 import qualified Gargantext.Prelude as P
47
48
49 -----------------------------------------------------------------------
50 -- | Define a vector
51 --
52 -- >>> vector 3
53 -- Vector (Z :. 3) [0,1,2]
54 vector :: Int -> (Array (Z :. Int) Int)
55 vector n = fromList (Z :. n) [0..n]
56
57 -- | Define a matrix
58 --
59 -- >>> matrix 3 ([1..] :: [Double])
60 -- Matrix (Z :. 3 :. 3)
61 -- [ 1.0, 2.0, 3.0,
62 -- 4.0, 5.0, 6.0,
63 -- 7.0, 8.0, 9.0]
64 matrix :: Elt c => Int -> [c] -> Matrix c
65 matrix n l = fromList (Z :. n :. n) l
66
67 -- | Two ways to get the rank (as documentation)
68 --
69 -- >>> rank (matrix 3 ([1..] :: [Int]))
70 -- 2
71 rank :: (Matrix a) -> Int
72 rank m = arrayRank $ arrayShape m
73
74 -----------------------------------------------------------------------
75 -- | Dimension of a square Matrix
76 -- How to force use with SquareMatrix ?
77 type Dim = Int
78
79 -- | Get Dimension of a square Matrix
80 --
81 -- >>> dim (matrix 3 ([1..] :: [Int]))
82 -- 3
83 dim :: Matrix a -> Dim
84 dim m = n
85 where
86 Z :. _ :. n = arrayShape m
87 -- indexTail (arrayShape m)
88
89 -----------------------------------------------------------------------
90
91 -- | Sum of a Matrix by Column
92 --
93 -- >>> run $ matSum 3 (use $ matrix 3 [1..])
94 -- Matrix (Z :. 3 :. 3)
95 -- [ 12.0, 15.0, 18.0,
96 -- 12.0, 15.0, 18.0,
97 -- 12.0, 15.0, 18.0]
98 matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
99 matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
100
101
102 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
103 -- if you need get the probability on the lines, just transpose it
104 --
105 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
106 -- Matrix (Z :. 3 :. 3)
107 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
108 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
109 -- 0.5833333333333334, 0.5333333333333333, 0.5]
110 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
111 matProba r mat = zipWith (/) mat (matSum r mat)
112
113 -- | Diagonal of the matrix
114 --
115 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
116 -- Vector (Z :. 3) [1,5,9]
117 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
118 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
119
120 -- | Divide by the Diagonal of the matrix
121 --
122 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
123 -- Matrix (Z :. 3 :. 3)
124 -- [ 1.0, 0.4, 0.3333333333333333,
125 -- 4.0, 1.0, 0.6666666666666666,
126 -- 7.0, 1.6, 1.0]
127 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
128 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
129
130 -----------------------------------------------------------------------
131 -- | Filters the matrix with the minimum of maximums
132 --
133 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
134 -- Matrix (Z :. 3 :. 3)
135 -- [ 0.0, 4.0, 7.0,
136 -- 0.0, 5.0, 8.0,
137 -- 0.0, 6.0, 9.0]
138 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
139 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
140 where
141 miniMax' = (the $ minimum $ maximum m)
142
143 -- | Filters the matrix with a constant
144 --
145 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
146 -- Matrix (Z :. 3 :. 3)
147 -- [ 0.0, 0.0, 7.0,
148 -- 0.0, 0.0, 8.0,
149 -- 0.0, 6.0, 9.0]
150 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
151 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
152
153 -----------------------------------------------------------------------
154 -- * Measures of proximity
155 -----------------------------------------------------------------------
156 -- ** Conditional distance
157
158 -- *** Conditional distance (basic)
159
160 -- | Conditional distance (basic version)
161 --
162 -- 2 main measures are actually implemented in order to compute the
163 -- proximity of two terms: conditional and distributional
164 --
165 -- Conditional measure is an absolute measure which reflects
166 -- interactions of 2 terms in the corpus.
167 measureConditional :: Matrix Int -> Matrix Double
168 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
169 measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
170
171
172 -- *** Conditional distance (advanced)
173
174 -- | Conditional distance (advanced version)
175 --
176 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
177 -- "confidence" , is the maximum probability between @i@ and @j@ to see
178 -- @i@ in the same context of @j@ knowing @j@.
179 --
180 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
181 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
182 --
183 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
184 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
185 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
186 where
187 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
188 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
189 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
190 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
191
192 n :: Exp Double
193 n = P.fromIntegral r
194
195 r :: Dim
196 r = dim m
197
198 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
199 xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
200 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
201 ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
202
203 -----------------------------------------------------------------------
204 -- ** Distributional Distance
205
206 -- | Distributional Distance Measure
207 --
208 -- Distributional measure is a relative measure which depends on the
209 -- selected list, it represents structural equivalence.
210 --
211 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
212 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
213 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
214 --
215 -- Mutual information
216 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
217 --
218 -- Number of cooccurrences of @i@ and @j@ in the same context of text
219 -- \[C{ij}\]
220 --
221 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
222 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
223 --
224 -- Total cooccurrences of term @i@ given a map list of size @m@
225 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
226 --
227 -- Total cooccurrences of terms given a map list of size @m@
228 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
229 --
230 distributional :: Matrix Int -> Matrix Double
231 distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
232 where
233
234 -- filter m = zipWith (\a b -> max a b) m (transpose m)
235
236 ri mat = zipWith (/) mat1 mat2
237 where
238 mat1 = matSum n $ zipWith min (s_mi mat) (s_mi $ transpose mat)
239 mat2 = matSum n mat
240
241 s_mi m' = zipWith (\a b -> log (a/b)) m'
242 $ zipWith (/) (crossProduct m') (total m')
243
244 total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
245 n = dim m
246
247 crossProduct m''' = zipWith (*) (cross m''' ) (cross (transpose m'''))
248 cross mat = zipWith (-) (matSum n mat) (mat)
249
250 -----------------------------------------------------------------------
251 -----------------------------------------------------------------------
252 -- * Specificity and Genericity
253
254 {- | Metric Specificity and genericity: select terms
255
256 - let N termes and occurrences of i \[N{i}\]
257
258 - Cooccurrences of i and j \[N{ij}\]
259 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
260
261 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
262 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
263
264 - \[Inclusion (i) = Gen(i) + Spec(i)\)
265 - \[GenericityScore = Gen(i)- Spec(i)\]
266
267 - References: Science mapping with asymmetrical paradigmatic proximity
268 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
269 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
270 arXiv:0803.2315 [cs.OH]
271 -}
272 type InclusionExclusion = Double
273 type SpecificityGenericity = Double
274
275 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
276 type SymetricMatrix = Matrix
277 type NonSymetricMatrix = Matrix
278
279
280 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
281 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
282 where
283 run' fun mat = run $ fun $ map fromIntegral $ use mat
284
285 -- | Inclusion (i) = Gen(i)+Spec(i)
286 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
287 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
288
289 -- | Genericity score = Gen(i)- Spec(i)
290 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
291 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
292
293 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
294 pV :: Acc (Matrix Double) -> Acc (Vector Double)
295 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
296
297 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
298 pH :: Acc (Matrix Double) -> Acc (Vector Double)
299 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
300
301 cardN :: Exp Double
302 cardN = constant (P.fromIntegral (dim m) :: Double)
303
304
305 -- | P(i|j) = Nij /N(jj) Probability to get i given j
306 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
307 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
308 p_ij m = zipWith (/) m (n_jj m)
309 where
310 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
311 n_jj myMat' = backpermute (shape m)
312 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
313 -> (Z :. j :. j)
314 )
315 ) myMat'
316
317 -- | P(j|i) = Nij /N(ii) Probability to get i given j
318 -- to test
319 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
320 p_ji = transpose . p_ij
321
322
323 -- | Step to ckeck the result in visual/qualitative tests
324 incExcSpeGen_proba :: Matrix Int -> Matrix Double
325 incExcSpeGen_proba m = run' pro m
326 where
327 run' fun mat = run $ fun $ map fromIntegral $ use mat
328
329 pro mat = p_ji mat
330
331 {-
332 -- | Hypothesis to test maybe later (or not)
333 -- TODO ask accelerate for instances to ease such writtings:
334 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
335 p_ m = zipWith (/) m (n_ m)
336 where
337 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
338 n_ m = backpermute (shape m)
339 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
340 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
341 )
342 ) m
343 -}
344
345 -- * For Tests (to be removed)
346 -- | Test perfermance with this matrix
347 -- TODO : add this in a benchmark folder
348 distriTest :: Matrix Double
349 distriTest = distributional $ matrix 100 [1..]
350 -----------------------------------------------------------------------
351