]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Graph/Distances/Matrice.hs
[refactoring] add some default extensions to package.yaml
[gargantext.git] / src / Gargantext / Viz / Graph / Distances / Matrice.hs
1 {-|
2 Module : Gargantext.Graph.Distances.Matrix
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
12
13
14 Implementation use Accelerate library which enables GPU and CPU computation:
15
16 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
17 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
18 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
19
20 * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
21 [Optimising Purely Functional GPU Programs][MCKL13].
22 In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
23
24 * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
25 [Embedding Foreign Code][CMCK14].
26 In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
27
28 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
29 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
30 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
31
32 -}
33
34 {-# LANGUAGE TypeFamilies #-}
35 {-# LANGUAGE TypeOperators #-}
36 {-# LANGUAGE ScopedTypeVariables #-}
37
38 module Gargantext.Viz.Graph.Distances.Matrice
39 where
40
41 import Data.Array.Accelerate
42 import Data.Array.Accelerate.Interpreter (run)
43
44 import qualified Gargantext.Prelude as P
45
46
47 -----------------------------------------------------------------------
48 -- | Define a vector
49 --
50 -- >>> vector 3
51 -- Vector (Z :. 3) [0,1,2]
52 vector :: Int -> (Array (Z :. Int) Int)
53 vector n = fromList (Z :. n) [0..n]
54
55 -- | Define a matrix
56 --
57 -- >>> matrix 3 ([1..] :: [Double])
58 -- Matrix (Z :. 3 :. 3)
59 -- [ 1.0, 2.0, 3.0,
60 -- 4.0, 5.0, 6.0,
61 -- 7.0, 8.0, 9.0]
62 matrix :: Elt c => Int -> [c] -> Matrix c
63 matrix n l = fromList (Z :. n :. n) l
64
65 -- | Two ways to get the rank (as documentation)
66 --
67 -- >>> rank (matrix 3 ([1..] :: [Int]))
68 -- 2
69 rank :: (Matrix a) -> Int
70 rank m = arrayRank $ arrayShape m
71
72 -----------------------------------------------------------------------
73 -- | Dimension of a square Matrix
74 -- How to force use with SquareMatrix ?
75 type Dim = Int
76
77 -- | Get Dimension of a square Matrix
78 --
79 -- >>> dim (matrix 3 ([1..] :: [Int]))
80 -- 3
81 dim :: Matrix a -> Dim
82 dim m = n
83 where
84 Z :. _ :. n = arrayShape m
85 -- indexTail (arrayShape m)
86
87 -----------------------------------------------------------------------
88
89 -- | Sum of a Matrix by Column
90 --
91 -- >>> run $ matSum 3 (use $ matrix 3 [1..])
92 -- Matrix (Z :. 3 :. 3)
93 -- [ 12.0, 15.0, 18.0,
94 -- 12.0, 15.0, 18.0,
95 -- 12.0, 15.0, 18.0]
96 matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
97 matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
98
99
100 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
101 -- if you need get the probability on the lines, just transpose it
102 --
103 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
104 -- Matrix (Z :. 3 :. 3)
105 -- [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
106 -- 0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
107 -- 0.5833333333333334, 0.5333333333333333, 0.5]
108 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
109 matProba r mat = zipWith (/) mat (matSum r mat)
110
111 -- | Diagonal of the matrix
112 --
113 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
114 -- Vector (Z :. 3) [1,5,9]
115 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
116 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
117
118 -- | Divide by the Diagonal of the matrix
119 --
120 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
121 -- Matrix (Z :. 3 :. 3)
122 -- [ 1.0, 0.4, 0.3333333333333333,
123 -- 4.0, 1.0, 0.6666666666666666,
124 -- 7.0, 1.6, 1.0]
125 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
126 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
127
128 -----------------------------------------------------------------------
129 -- | Filters the matrix with the minimum of maximums
130 --
131 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
132 -- Matrix (Z :. 3 :. 3)
133 -- [ 0.0, 4.0, 7.0,
134 -- 0.0, 5.0, 8.0,
135 -- 0.0, 6.0, 9.0]
136 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
137 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
138 where
139 miniMax' = (the $ minimum $ maximum m)
140
141 -- | Filters the matrix with a constant
142 --
143 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
144 -- Matrix (Z :. 3 :. 3)
145 -- [ 0.0, 0.0, 7.0,
146 -- 0.0, 0.0, 8.0,
147 -- 0.0, 6.0, 9.0]
148 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
149 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
150
151 -----------------------------------------------------------------------
152 -- * Measures of proximity
153 -----------------------------------------------------------------------
154 -- ** Conditional distance
155
156 -- *** Conditional distance (basic)
157
158 -- | Conditional distance (basic version)
159 --
160 -- 2 main measures are actually implemented in order to compute the
161 -- proximity of two terms: conditional and distributional
162 --
163 -- Conditional measure is an absolute measure which reflects
164 -- interactions of 2 terms in the corpus.
165 measureConditional :: Matrix Int -> Matrix Double
166 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
167 measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
168
169
170 -- *** Conditional distance (advanced)
171
172 -- | Conditional distance (advanced version)
173 --
174 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
175 -- "confidence" , is the maximum probability between @i@ and @j@ to see
176 -- @i@ in the same context of @j@ knowing @j@.
177 --
178 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
179 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
180 --
181 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
182 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
183 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
184 where
185 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
186 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
187 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
188 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
189
190 n :: Exp Double
191 n = P.fromIntegral r
192
193 r :: Dim
194 r = dim m
195
196 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
197 xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
198 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
199 ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
200
201 -----------------------------------------------------------------------
202 -- ** Distributional Distance
203
204 -- | Distributional Distance Measure
205 --
206 -- Distributional measure is a relative measure which depends on the
207 -- selected list, it represents structural equivalence.
208 --
209 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
210 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
211 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
212 --
213 -- Mutual information
214 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
215 --
216 -- Number of cooccurrences of @i@ and @j@ in the same context of text
217 -- \[C{ij}\]
218 --
219 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
220 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
221 --
222 -- Total cooccurrences of term @i@ given a map list of size @m@
223 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
224 --
225 -- Total cooccurrences of terms given a map list of size @m@
226 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
227 --
228 distributional :: Matrix Int -> Matrix Double
229 distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
230 where
231
232 -- filter m = zipWith (\a b -> max a b) m (transpose m)
233
234 ri mat = zipWith (/) mat1 mat2
235 where
236 mat1 = matSum n $ zipWith min (s_mi mat) (s_mi $ transpose mat)
237 mat2 = matSum n mat
238
239 s_mi m' = zipWith (\a b -> log (a/b)) m'
240 $ zipWith (/) (crossProduct m') (total m')
241
242 total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
243 n = dim m
244
245 crossProduct m''' = zipWith (*) (cross m''' ) (cross (transpose m'''))
246 cross mat = zipWith (-) (matSum n mat) (mat)
247
248 -----------------------------------------------------------------------
249 -----------------------------------------------------------------------
250 -- * Specificity and Genericity
251
252 {- | Metric Specificity and genericity: select terms
253
254 - let N termes and occurrences of i \[N{i}\]
255
256 - Cooccurrences of i and j \[N{ij}\]
257 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
258
259 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
260 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
261
262 - \[Inclusion (i) = Gen(i) + Spec(i)\)
263 - \[GenericityScore = Gen(i)- Spec(i)\]
264
265 - References: Science mapping with asymmetrical paradigmatic proximity
266 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
267 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
268 arXiv:0803.2315 [cs.OH]
269 -}
270 type InclusionExclusion = Double
271 type SpecificityGenericity = Double
272
273 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
274 type SymetricMatrix = Matrix
275 type NonSymetricMatrix = Matrix
276
277
278 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
279 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
280 where
281 run' fun mat = run $ fun $ map fromIntegral $ use mat
282
283 -- | Inclusion (i) = Gen(i)+Spec(i)
284 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
285 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
286
287 -- | Genericity score = Gen(i)- Spec(i)
288 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
289 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
290
291 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
292 pV :: Acc (Matrix Double) -> Acc (Vector Double)
293 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
294
295 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
296 pH :: Acc (Matrix Double) -> Acc (Vector Double)
297 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
298
299 cardN :: Exp Double
300 cardN = constant (P.fromIntegral (dim m) :: Double)
301
302
303 -- | P(i|j) = Nij /N(jj) Probability to get i given j
304 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
305 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
306 p_ij m = zipWith (/) m (n_jj m)
307 where
308 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
309 n_jj myMat' = backpermute (shape m)
310 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
311 -> (Z :. j :. j)
312 )
313 ) myMat'
314
315 -- | P(j|i) = Nij /N(ii) Probability to get i given j
316 -- to test
317 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
318 p_ji = transpose . p_ij
319
320
321 -- | Step to ckeck the result in visual/qualitative tests
322 incExcSpeGen_proba :: Matrix Int -> Matrix Double
323 incExcSpeGen_proba m = run' pro m
324 where
325 run' fun mat = run $ fun $ map fromIntegral $ use mat
326
327 pro mat = p_ji mat
328
329 {-
330 -- | Hypothesis to test maybe later (or not)
331 -- TODO ask accelerate for instances to ease such writtings:
332 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
333 p_ m = zipWith (/) m (n_ m)
334 where
335 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
336 n_ m = backpermute (shape m)
337 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
338 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
339 )
340 ) m
341 -}
342
343 -- * For Tests (to be removed)
344 -- | Test perfermance with this matrix
345 -- TODO : add this in a benchmark folder
346 distriTest :: Matrix Double
347 distriTest = distributional $ matrix 100 [1..]
348 -----------------------------------------------------------------------
349