]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Methods/Similarities/Accelerate/Distributional.hs
[nlp] corenlps/johnsnows/spacys for https versions of schema
[gargantext.git] / src / Gargantext / Core / Methods / Similarities / Accelerate / Distributional.hs
1 {-|
2 Module : Gargantext.Core.Methods.Similarities.Accelerate.Distributional
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 * Distributional Similarity metric
12 __Definition :__ Distributional metric is a relative metric which depends on the
13 selected list, it represents structural equivalence of mutual information.
14
15 __Objective :__ We want to compute with matrices processing the similarity between term $i$ and term $j$ :
16 distr(i,j)=$\frac{\Sigma_{k \neq i,j} min(\frac{n_{ik}^2}{n_{ii}n_{kk}},\frac{n_{jk}^2}{n_{jj}n_{kk}})}{\Sigma_{k \neq i}\frac{n_{ik}^2}{ n_{ii}n_{kk}}}$
17
18 where $n_{ij}$ is the cooccurrence between term $i$ and term $j$
19
20 * For a vector V=[$x_1$ ... $x_n$], we note $|V|_1=\Sigma_ix_i$
21 * operator : .* and ./ cell by cell multiplication and division of the matrix
22 * operator * is the matrix multiplication
23 * Matrice M=[$n_{ij}$]$_{i,j}$
24 * opérateur : Diag(M)=[$n_{ii}$]$_i$ (vecteur)
25 * Id= identity matrix
26 * O=[1]$_{i,j}$ (matrice one)
27 * D(M)=Id .* M
28 * O * D(M) =[$n_{jj}$]$_{i,j}$
29 * D(M) * O =[$n_{ii}$]$_{i,j}$
30 * $V_i=[0~0~0~1~0~0~0]'$ en i
31 * MI=(M ./ O * D(M)) .* (M / D(M) * O )
32 * distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$
33
34 [Finally, we have used as convention the Distributional metric used in Legacy GarganText](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/50)
35
36 mi = defaultdict(lambda : defaultdict(int))
37 total_cooc = x.sum().sum()
38
39 for i in matrix.keys():
40 si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
41 for j in matrix[i].keys():
42 sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
43 if i!=j :
44 mi[i][j] = log( matrix[i][j] / ((si * sj) / total_cooc) )
45
46 r = defaultdict(lambda : defaultdict(int))
47
48 for i in matrix.keys():
49 for j in matrix.keys():
50 sumMin = sum(
51 [
52 min(mi[i][k], mi[j][k])
53 for k in matrix.keys()
54 if i != j and k != i and k != j and mi[i][k] > 0
55 ]
56 )
57
58 sumMi = sum(
59 [
60 mi[i][k]
61 for k in matrix.keys()
62 if k != i and k != j and mi[i][k] > 0
63 ]
64 )
65
66 try:
67 r[i][j] = sumMin / sumMi
68 except Exception as error:
69 r[i][j] = 0
70
71 # Need to filter the weak links, automatic threshold here
72 minmax = min([ max([ r[i][j] for i in r.keys()]) for j in r.keys()])
73
74 G = nx.DiGraph()
75 G.add_edges_from(
76 [
77 (i, j, {'weight': r[i][j]})
78 for i in r.keys() for j in r.keys()
79 if i != j and r[i][j] > minmax and r[i][j] > r[j][i]
80 ]
81 )
82 -}
83
84 {-# LANGUAGE TypeFamilies #-}
85 {-# LANGUAGE TypeOperators #-}
86 {-# LANGUAGE ScopedTypeVariables #-}
87 {-# LANGUAGE ViewPatterns #-}
88 {-# LANGUAGE GADTs #-}
89
90 module Gargantext.Core.Methods.Similarities.Accelerate.Distributional
91 where
92
93 -- import qualified Data.Foldable as P (foldl1)
94 -- import Debug.Trace (trace)
95 import Data.Array.Accelerate as A
96 -- import Data.Array.Accelerate.Interpreter (run)
97 import Data.Array.Accelerate.LLVM.Native (run) -- TODO: try runQ?
98 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
99 import qualified Gargantext.Prelude as P
100
101 import Debug.Trace
102 import Prelude (show, mappend{- , String, (<>), fromIntegral, flip -})
103
104 import qualified Prelude
105
106 -- | `distributional m` returns the distributional distance between terms each
107 -- pair of terms as a matrix. The argument m is the matrix $[n_{ij}]_{i,j}$
108 -- where $n_{ij}$ is the coocccurrence between term $i$ and term $j$.
109 --
110 -- ## Basic example with Matrix of size 3:
111 --
112 -- >>> theMatrixInt 3
113 -- Matrix (Z :. 3 :. 3)
114 -- [ 7, 4, 0,
115 -- 4, 5, 3,
116 -- 0, 3, 4]
117 --
118 -- >>> distributional $ theMatrixInt 3
119 -- Matrix (Z :. 3 :. 3)
120 -- [ 1.0, 0.0, 0.9843749999999999,
121 -- 0.0, 1.0, 0.0,
122 -- 1.0, 0.0, 1.0]
123 --
124 -- ## Basic example with Matrix of size 4:
125 --
126 -- >>> theMatrixInt 4
127 -- Matrix (Z :. 4 :. 4)
128 -- [ 4, 1, 2, 1,
129 -- 1, 4, 0, 0,
130 -- 2, 0, 3, 3,
131 -- 1, 0, 3, 3]
132 --
133 -- >>> distributional $ theMatrixInt 4
134 -- Matrix (Z :. 4 :. 4)
135 -- [ 1.0, 0.0, 0.5714285714285715, 0.8421052631578947,
136 -- 0.0, 1.0, 1.0, 1.0,
137 -- 8.333333333333333e-2, 4.6875e-2, 1.0, 0.25,
138 -- 0.3333333333333333, 5.7692307692307696e-2, 1.0, 1.0]
139 --
140 distributional :: Matrix Int -> Acc (Matrix Double)
141 distributional m' = result
142 where
143 m = map A.fromIntegral $ use m'
144 n = dim m'
145
146 diag_m = diag m
147
148 d_1 = replicate (constant (Z :. n :. All)) diag_m
149 d_2 = replicate (constant (Z :. All :. n)) diag_m
150
151 mi = (.*) ((./) m d_1) ((./) m d_2)
152
153 -- w = (.-) mi d_mi
154
155 -- The matrix permutations is taken care of below by directly replicating
156 -- the matrix mi, making the matrix w unneccessary and saving one step.
157 w_1 = replicate (constant (Z :. All :. n :. All)) mi
158 w_2 = replicate (constant (Z :. n :. All :. All)) mi
159 w' = zipWith min w_1 w_2
160
161 -- The matrix ii = [r_{i,j,k}]_{i,j,k} has r_(i,j,k) = 0 if k = i OR k = j
162 -- and r_(i,j,k) = 1 otherwise (i.e. k /= i AND k /= j).
163 ii = generate (constant (Z :. n :. n :. n))
164 (lift1 (\(Z :. i :. j :. k) -> cond ((&&) ((/=) k i) ((/=) k j)) 1 0))
165
166 z_1 = sum ((.*) w' ii)
167 z_2 = sum ((.*) w_1 ii)
168
169 result = termDivNan z_1 z_2
170
171 logDistributional2 :: Matrix Int -> Matrix Double
172 logDistributional2 m = trace ("logDistributional, dim=" `mappend` show n) . run
173 $ diagNull n
174 $ matMaxMini
175 $ logDistributional' n m
176 where
177 n = dim m
178
179 logDistributional' :: Int -> Matrix Int -> Acc (Matrix Double)
180 logDistributional' n m' = trace ("logDistributional'") result
181 where
182 -- From Matrix Int to Matrix Double, i.e :
183 -- m :: Matrix Int -> Matrix Double
184 m = map A.fromIntegral $ use m'
185
186 -- Scalar. Sum of all elements of m.
187 to = the $ sum (flatten m)
188
189 -- Diagonal matrix with the diagonal of m.
190 d_m = (.*) m (matrixIdentity n)
191
192 -- Size n vector. s = [s_i]_i
193 s = sum ((.-) m d_m)
194
195 -- Matrix nxn. Vector s replicated as rows.
196 s_1 = replicate (constant (Z :. All :. n)) s
197 -- Matrix nxn. Vector s replicated as columns.
198 s_2 = replicate (constant (Z :. n :. All)) s
199
200 -- Matrix nxn. ss = [s_i * s_j]_{i,j}. Outer product of s with itself.
201 ss = (.*) s_1 s_2
202
203 -- Matrix nxn. mi = [m_{i,j}]_{i,j} where
204 -- m_{i,j} = 0 if n_{i,j} = 0 or i = j,
205 -- m_{i,j} = log(to * n_{i,j} / s_{i,j}) otherwise.
206 mi = (.*) (matrixEye n)
207 (map (lift1 (\x -> cond (x == 0) 0 (log (x * to)))) ((./) m ss))
208 -- mi_nnz :: Int
209 -- mi_nnz = flip indexArray Z . run $
210 -- foldAll (+) 0 $ map (\a -> ifThenElse (abs a < 10^(-6 :: Exp Int)) 0 1) mi
211
212 -- mi_total = n*n
213
214 -- reportMat :: String -> Int -> Int -> String
215 -- reportMat name nnz tot = name <> ": " <> show nnz <> "nnz / " <> show tot <>
216 -- " | " <> show pc <> "%"
217 -- where pc = 100 * Prelude.fromIntegral nnz / Prelude.fromIntegral tot :: Double
218
219 -- Tensor nxnxn. Matrix mi replicated along the 2nd axis.
220 -- w_1 = trace (reportMat "mi" mi_nnz mi_total) $ replicate (constant (Z :. All :. n :. All)) mi
221 -- w1_nnz :: Int
222 -- w1_nnz = flip indexArray Z . run $
223 -- foldAll (+) 0 $ map (\a -> ifThenElse (abs a < 10^(-6 :: Exp Int)) 0 1) w_1
224 -- w1_total = n*n*n
225
226 -- Tensor nxnxn. Matrix mi replicated along the 1st axis.
227 -- w_2 = trace (reportMat "w1" w1_nnz w1_total) $ replicate (constant (Z :. n :. All :. All)) mi
228
229 -- Tensor nxnxn.
230 -- w' = trace "w'" $ zipWith min w_1 w_2
231
232 -- A predicate that is true when the input (i, j, k) satisfy
233 -- k /= i AND k /= j
234 -- k_diff_i_and_j = lift1 (\(Z :. i :. j :. k) -> ((&&) ((/=) k i) ((/=) k j)))
235
236 -- Matrix nxn.
237 sumMin = trace "sumMin" $ sumMin_go n mi -- sum (condOrDefault k_diff_i_and_j 0 w')
238
239 -- Matrix nxn. All columns are the same.
240 sumM = trace "sumM" $ sumM_go n mi -- trace "sumM" $ sum (condOrDefault k_diff_i_and_j 0 w_1)
241
242 result = termDivNan sumMin sumM
243
244
245 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
246 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
247 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
248 --
249 -- Mutual information
250 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
251 --
252 -- Number of cooccurrences of @i@ and @j@ in the same context of text
253 -- \[C{ij}\]
254 --
255 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
256 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
257 --
258 -- Total cooccurrences of term @i@ given a map list of size @m@
259 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
260 --
261 -- Total cooccurrences of terms given a map list of size @m@
262 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
263 --
264
265 logDistributional :: Matrix Int -> Matrix Double
266 logDistributional m' = run $ diagNull n result
267 where
268 m = map fromIntegral $ use m'
269 n = dim m'
270
271 -- Scalar. Sum of all elements of m.
272 to = the $ sum (flatten m)
273
274 -- Diagonal matrix with the diagonal of m.
275 d_m = (.*) m (matrixIdentity n)
276
277 -- Size n vector. s = [s_i]_i
278 s = sum ((.-) m d_m)
279
280 -- Matrix nxn. Vector s replicated as rows.
281 s_1 = replicate (constant (Z :. All :. n)) s
282 -- Matrix nxn. Vector s replicated as columns.
283 s_2 = replicate (constant (Z :. n :. All)) s
284
285 -- Matrix nxn. ss = [s_i * s_j]_{i,j}. Outer product of s with itself.
286 ss = (.*) s_1 s_2
287
288 -- Matrix nxn. mi = [m_{i,j}]_{i,j} where
289 -- m_{i,j} = 0 if n_{i,j} = 0 or i = j,
290 -- m_{i,j} = log(to * n_{i,j} / s_{i,j}) otherwise.
291 mi = (.*) (matrixEye n)
292 (map (lift1 (\x -> cond (x == 0) 0 (log (x * to)))) ((./) m ss))
293
294 -- Tensor nxnxn. Matrix mi replicated along the 2nd axis.
295 w_1 = replicate (constant (Z :. All :. n :. All)) mi
296
297 -- Tensor nxnxn. Matrix mi replicated along the 1st axis.
298 w_2 = replicate (constant (Z :. n :. All :. All)) mi
299
300 -- Tensor nxnxn.
301 w' = zipWith min w_1 w_2
302
303 -- A predicate that is true when the input (i, j, k) satisfy
304 -- k /= i AND k /= j
305 k_diff_i_and_j = lift1 (\(Z :. i :. j :. k) -> ((&&) ((/=) k i) ((/=) k j)))
306
307 -- Matrix nxn.
308 sumMin = sum (condOrDefault k_diff_i_and_j 0 w')
309
310 -- Matrix nxn. All columns are the same.
311 sumM = sum (condOrDefault k_diff_i_and_j 0 w_1)
312
313 result = termDivNan sumMin sumM
314
315
316
317
318 distributional'' :: Matrix Int -> Matrix Double
319 distributional'' m = -- run {- $ matMaxMini -}
320 run $ diagNull n
321 $ rIJ n
322 $ filterWith 0 100
323 $ filter' 0
324 $ s_mi
325 $ map A.fromIntegral
326 {- from Int to Double -}
327 $ use m
328 {- push matrix in Accelerate type -}
329 where
330
331 _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
332 _ri mat = mat1 -- zipWith (/) mat1 mat2
333 where
334 mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
335 _mat2 = total mat
336
337 _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
338 _myMin = replicate (constant (Z :. n :. All)) . minimum
339
340
341 -- TODO fix NaN
342 -- Quali TEST: OK
343 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
344 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
345 $ zipWith (/) (crossProduct n m') (total m')
346 -- crossProduct n m'
347
348
349 total :: Acc (Matrix Double) -> Acc (Matrix Double)
350 total = replicate (constant (Z :. n :. n)) . sum . sum
351
352 n :: Dim
353 n = dim m
354
355 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
356 => Dim -> Acc (Matrix a) -> Acc (Matrix a)
357 rIJ n m = matMaxMini $ divide a b
358 where
359 a = sumRowMin n m
360 b = sumColMin n m
361
362 -- * For Tests (to be removed)
363 -- | Test perfermance with this matrix
364 -- TODO : add this in a benchmark folder
365 distriTest :: Int -> Matrix Double
366 distriTest n = logDistributional (theMatrixInt n)
367
368
369 -- * sparse utils
370
371 -- compact repr of "extend along an axis" op?
372 -- general sparse repr ?
373
374 type Extended sh = sh :. Int
375
376 data Ext where
377 Along1 :: Int -> Ext
378 Along2 :: Int -> Ext
379
380 along1 :: Int -> Ext
381 along1 = Along1
382
383 along2 :: Int -> Ext
384 along2 = Along2
385
386 type Delayed sh a = Exp sh -> Exp a
387
388 data ExtArr sh a = ExtArr
389 { extSh :: Extended sh
390 , extFun :: Delayed (Extended sh) a
391 }
392
393 {-
394 w_1_{i, j, k} = mi_{i, k}
395 w_2_{i, j, k} = mi_{j, k}
396
397 w'_{i, j, k} = min w_1_{i, j, k} w_2_{i, j, k}
398 = min mi_{i, k} mi_{j, k}
399
400 w"_{i, j, k} = 0 if i = k or j = k
401 min mi_{i, k} mi_{j, k} otherwise
402
403 w_1'_{i, j, k} = 0 if i = k or j = k
404 mi_{i, k} otherwise
405
406 sumMin_{i, j} = sum_k of w"_{i, j, k}
407 = sum_k (k /= i && k /= j) of min mi_{i, k} mi_{j, k}
408
409 sumM_{i, j} = sum_k of w_1'_{i, j, k}
410 = sum_k (k /= i && k /= j) of mi_{i, k}
411
412 -}
413
414 sumM_go :: (Elt a, Num a) => Int -> Acc (Array DIM2 a) -> Acc (Array DIM2 a)
415 sumM_go n mi = generate (lift (Z :. n :. n)) $ \coord ->
416 let (Z :. i :. j) = unlift coord in
417 Prelude.sum
418 [ cond (constant k /= i && constant k /= j)
419 (mi ! lift (constant Z :. i :. constant k))
420 0
421 | k <- [0 .. n-1]
422 ]
423
424 sumMin_go :: (Elt a, Num a, Ord a) => Int -> Acc (Array DIM2 a) -> Acc (Array DIM2 a)
425 sumMin_go n mi = generate (constant (Z :. n :. n)) $ \coord ->
426 let (Z :. i :. j) = unlift coord in
427 Prelude.sum
428 [ cond (constant k /= i && constant k /= j)
429 (min
430 (mi ! lift (constant Z :. i :. constant k))
431 (mi ! lift (constant Z :. j :. constant k))
432 )
433 0
434 | k <- [0 .. n-1]
435 ]