]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Graph/Distances/Matrice.hs
[DOC] adding haddock documentation (compiles). Commenting src-test. Focusing on docte...
[gargantext.git] / src / Gargantext / Viz / Graph / Distances / Matrice.hs
1 {-|
2 Module : Gargantext.Graph.Distances.Matrix
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 2 main measures are actually implemented in order to compute the proximity of two terms.
12 - Conditional measure is an absolute measure which reflects interactions of 2 terms in the corpus.
13 - Distributional measure is a relative measure which depends on the selected list, it represents structural equivalence.
14
15 Motivation and definition of the @Conditional@ distance.
16
17 Implementation use Accelerate library :
18 * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
19 [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
20 In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
21
22 * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
23 [Optimising Purely Functional GPU Programs][MCKL13].
24 In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
25
26 * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
27 [Embedding Foreign Code][CMCK14].
28 In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
29
30 * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
31 [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
32 In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
33
34 -}
35
36 {-# LANGUAGE NoImplicitPrelude #-}
37 {-# LANGUAGE FlexibleContexts #-}
38 {-# LANGUAGE TypeFamilies #-}
39 {-# LANGUAGE TypeOperators #-}
40 {-# LANGUAGE ScopedTypeVariables #-}
41
42 module Gargantext.Viz.Graph.Distances.Matrice
43 where
44
45 import Data.Array.Accelerate
46 import Data.Array.Accelerate.Interpreter (run)
47
48 import qualified Gargantext.Prelude as P
49
50
51 -----------------------------------------------------------------------
52 -- | Test perf.
53 distriTest :: Matrix Double
54 distriTest = distributional $ myMat 100
55 -----------------------------------------------------------------------
56
57 vector :: Int -> (Array (Z :. Int) Int)
58 vector n = fromList (Z :. n) [0..n]
59
60 matrix :: Elt c => Int -> [c] -> Matrix c
61 matrix n l = fromList (Z :. n :. n) l
62
63 myMat :: Int -> Matrix Int
64 myMat n = matrix n [1..]
65
66 -- | Two ways to get the rank (as documentation)
67 rank :: (Matrix a) -> Int
68 rank m = arrayRank $ arrayShape m
69
70 -----------------------------------------------------------------------
71 -- | Dimension of a square Matrix
72 -- How to force use with SquareMatrix ?
73 type Dim = Int
74
75 dim :: Matrix a -> Dim
76 dim m = n
77 where
78 Z :. _ :. n = arrayShape m
79 -- indexTail (arrayShape m)
80
81 -----------------------------------------------------------------------
82 proba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
83 proba r mat = zipWith (/) mat (mkSum r mat)
84
85 mkSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
86 mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum mat
87
88 -- | divByDiag
89 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
90 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
91 where
92 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
93 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
94 -----------------------------------------------------------------------
95
96 miniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
97 miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
98 where
99 miniMax' = (the $ minimum $ maximum m)
100
101 -- | Conditional distance (basic version)
102
103 conditional :: Matrix Int -> Matrix Double
104 conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
105
106
107 -- | Conditional distance (advanced version)
108 -- The conditional measure \[P_c\] of 2 terms @i@ and @j@, also called "confidence"
109 -- , is the maximum probability between @i@ and @j@. If \[n_i\] (resp.
110 -- \[n_j\]) is the number of occurrences of @i@ (resp. @j@) in the corpus and _[n_{ij}\] the number of its occurrences we get:
111 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
112 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
113 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
114 where
115 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
116 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
117 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
118 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
119
120 n :: Exp Double
121 n = P.fromIntegral r
122
123 r :: Dim
124 r = dim m
125
126 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
127 xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
128 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
129 ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
130
131 -----------------------------------------------------------------------
132
133 -- | Distributional Distance
134 -- The distributional measure \[P_c\] of @i@ and @j@ terms is:
135 -- \[ S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik}, MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}}
136 -- \]
137 -- \[S{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\] is mutual information
138 -- \[C{ij}\] is number of cooccurrences of @i@ and @j@ in the same context of text
139 -- \[E_{ij} = \frac {S_{i} S_{j}} {N}\] is the expected value of the cooccurrences
140 -- \[N_{i} = \sum_{i}^{} S_{i}\] is the total cooccurrences of @i@ term
141 distributional :: Matrix Int -> Matrix Double
142 distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
143 where
144 n = dim m
145
146 -- filter m = zipWith (\a b -> max a b) m (transpose m)
147
148 ri mat = zipWith (/) mat1 mat2
149 where
150 mat1 = mkSum n $ zipWith min (mi mat) (mi $ transpose mat)
151 mat2 = mkSum n mat
152
153 mi m' = zipWith (\a b -> max (log $ a/b) 0) m'
154 $ zipWith (/) (crossProduct m') (total m')
155
156 total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
157
158 crossProduct m''' = zipWith (*) (cross m''' ) (cross (transpose m'''))
159 cross mat = zipWith (-) (mkSum n mat) (mat)
160
161 -----------------------------------------------------------------------
162 -----------------------------------------------------------------------
163
164 {-
165 Metric Specificity and genericity: select terms
166
167 let N termes
168 Ni : occ de i
169 Nij : cooc i et j
170 Probability to get i given j : P(i|j)=Nij/Nj
171 Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
172 Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
173 Inclusion (i) = Gen(i)+Spec(i)
174 Genericity score = Gen(i)- Spec(i)
175
176
177 References:
178 * Science mapping with asymmetrical paradigmatic proximity Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276, arXiv:0803.2315 [cs.OH]
179
180 -}
181
182 type InclusionExclusion = Double
183 type SpecificityGenericity = Double
184
185 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
186 type SymetricMatrix = Matrix
187 type NonSymetricMatrix = Matrix
188
189
190 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
191 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
192 where
193 run' fun mat = run $ fun $ map fromIntegral $ use mat
194
195 -- | Inclusion (i) = Gen(i)+Spec(i)
196 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
197 inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
198 --
199 -- | Genericity score = Gen(i)- Spec(i)
200 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
201 specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
202
203 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
204 pV :: Acc (Matrix Double) -> Acc (Vector Double)
205 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
206
207 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
208 pH :: Acc (Matrix Double) -> Acc (Vector Double)
209 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
210
211 cardN :: Exp Double
212 cardN = constant (P.fromIntegral (dim m) :: Double)
213
214
215 -- | P(i|j) = Nij /N(jj) Probability to get i given j
216 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
217 p_ij m = zipWith (/) m (n_jj m)
218 where
219 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
220 n_jj myMat' = backpermute (shape m)
221 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
222 -> (Z :. j :. j)
223 )
224 ) myMat'
225
226 -- | P(j|i) = Nij /N(ii) Probability to get i given j
227 -- to test
228 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
229 p_ji = transpose . p_ij
230
231
232 -- | Step to ckeck the result in visual/qualitative tests
233 incExcSpeGen_proba :: Matrix Int -> Matrix Double
234 incExcSpeGen_proba m = run' pro m
235 where
236 run' fun mat = run $ fun $ map fromIntegral $ use mat
237
238 pro mat = p_ji mat
239
240 {-
241 -- | Hypothesis to test maybe later (or not)
242 -- TODO ask accelerate for instances to ease such writtings:
243 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
244 p_ m = zipWith (/) m (n_ m)
245 where
246 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
247 n_ m = backpermute (shape m)
248 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
249 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
250 )
251 ) m
252 -}
253
254