]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Methods/Distances/Accelerate/Distributional.hs
Merge branch 'dev' into 97-dev-istex-search
[gargantext.git] / src / Gargantext / Core / Methods / Distances / Accelerate / Distributional.hs
1 {-|
2 Module : Gargantext.Core.Methods.Distances.Accelerate.Distributional
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 * Distributional Distance metric
12 __Definition :__ Distributional metric is a relative metric which depends on the
13 selected list, it represents structural equivalence of mutual information.
14
15 __Objective :__ We want to compute with matrices processing the similarity between term $i$ and term $j$ :
16 distr(i,j)=$\frac{\Sigma_{k \neq i,j} min(\frac{n_{ik}^2}{n_{ii}n_{kk}},\frac{n_{jk}^2}{n_{jj}n_{kk}})}{\Sigma_{k \neq i}\frac{n_{ik}^2}{ n_{ii}n_{kk}}}$
17
18 where $n_{ij}$ is the cooccurrence between term $i$ and term $j$
19
20 * For a vector V=[$x_1$ ... $x_n$], we note $|V|_1=\Sigma_ix_i$
21 * operator : .* and ./ cell by cell multiplication and division of the matrix
22 * operator * is the matrix multiplication
23 * Matrix M=[$n_{ij}$]$_{i,j}$
24 * operator : Diag(M)=[$n_{ii}$]$_i$ (vector)
25 * Id= identity matrix
26 * O=[1]$_{i,j}$ (matrix of ones)
27 * D(M)=Id .* M
28 * O * D(M) =[$n_{jj}$]$_{i,j}$
29 * D(M) * O =[$n_{ii}$]$_{i,j}$
30 * $V_i=[0~0~0~1~0~0~0]'$ with the 1 at position i
31 * MI=(M ./ O * D(M)) .* (M ./ D(M) * O )
32 * distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$
33
34 [Specifications written by David Chavalarias on Garg v4 shared NodeWrite, team Pyremiel 2020]
35
36 -}
37
38 {-# LANGUAGE TypeFamilies #-}
39 {-# LANGUAGE TypeOperators #-}
40 {-# LANGUAGE ScopedTypeVariables #-}
41 {-# LANGUAGE ViewPatterns #-}
42
43 module Gargantext.Core.Methods.Distances.Accelerate.Distributional
44 where
45
46 -- import qualified Data.Foldable as P (foldl1)
47 -- import Debug.Trace (trace)
48 import Data.Array.Accelerate as A
49 import Data.Array.Accelerate.Interpreter (run)
50 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
51 import qualified Gargantext.Prelude as P
52
53 -- | `distributional m` returns the distributional distance between each
54 -- pair of terms as a matrix. The argument m is the matrix $[n_{ij}]_{i,j}$
55 -- where $n_{ij}$ is the cooccurrence between term $i$ and term $j$.
56 --
57 -- ## Basic example with Matrix of size 3:
58 --
59 -- >>> theMatrixInt 3
60 -- Matrix (Z :. 3 :. 3)
61 -- [ 7, 4, 0,
62 -- 4, 5, 3,
63 -- 0, 3, 4]
64 --
65 -- >>> distributional $ theMatrixInt 3
66 -- Matrix (Z :. 3 :. 3)
67 -- [ 1.0, 0.0, 0.9843749999999999,
68 -- 0.0, 1.0, 0.0,
69 -- 1.0, 0.0, 1.0]
70 --
71 -- ## Basic example with Matrix of size 4:
72 --
73 -- >>> theMatrixInt 4
74 -- Matrix (Z :. 4 :. 4)
75 -- [ 4, 1, 2, 1,
76 -- 1, 4, 0, 0,
77 -- 2, 0, 3, 3,
78 -- 1, 0, 3, 3]
79 --
80 -- >>> distributional $ theMatrixInt 4
81 -- Matrix (Z :. 4 :. 4)
82 -- [ 1.0, 0.0, 0.5714285714285715, 0.8421052631578947,
83 -- 0.0, 1.0, 1.0, 1.0,
84 -- 8.333333333333333e-2, 4.6875e-2, 1.0, 0.25,
85 -- 0.3333333333333333, 5.7692307692307696e-2, 1.0, 1.0]
86 --
distributional :: Matrix Int -> Matrix Double
distributional m' = run (termDivNan sharedMass ownMass)
  where
    -- Side length used for every replication below.
    size = dim m'

    -- Cooccurrence counts converted to Double inside Accelerate.
    cooc = map fromIntegral (use m')

    -- Diagonal of the cooccurrence matrix, replicated along each of the two
    -- axes so that it can be divided out cell by cell in both directions.
    diagVec       = diag cooc
    diagOverRows  = replicate (constant (Z :. size :. All)) diagVec
    diagOverCols  = replicate (constant (Z :. All :. size)) diagVec

    -- Cell-by-cell product of the two normalised matrices
    -- (the MI matrix of the module header).
    mi = (cooc ./ diagOverRows) .* (cooc ./ diagOverCols)

    -- Instead of first subtracting the diagonal of mi, mi is replicated
    -- directly into two rank-3 tensors; the unwanted cells are removed by
    -- the mask below, which saves one intermediate matrix.
    viaSecondAxis = replicate (constant (Z :. All :. size :. All)) mi
    viaFirstAxis  = replicate (constant (Z :. size :. All :. All)) mi
    pairMin       = zipWith min viaSecondAxis viaFirstAxis

    -- Mask tensor [r_{i,j,k}]: 0 when k = i OR k = j, 1 otherwise
    -- (i.e. when k /= i AND k /= j).
    mask = generate (constant (Z :. size :. size :. size))
             (lift1 (\(Z :. i :. j :. k) -> cond ((k /= i) && (k /= j)) 1 0))

    -- Masked sums of the pairwise minima and of the first replicated tensor.
    sharedMass = sum (pairMin .* mask)
    ownMass    = sum (viaSecondAxis .* mask)
117
-- | Evaluates 'logDistributional'' with the Accelerate interpreter after
-- passing its result through @matMiniMax@ and then @diagNull@.
-- NOTE(review): @matMiniMax@ and @diagNull@ are defined outside this module;
-- presumably a min/max normalisation and a zeroing of the diagonal - confirm.
logDistributional :: Matrix Int -> Matrix Double
logDistributional m =
  run (diagNull size (matMiniMax (logDistributional' size m)))
  where
    -- Dimension of the input matrix, shared by both post-processing steps.
    size = dim m
125
-- | Accelerate expression behind 'logDistributional': a ratio of two masked
-- sums over a log-scaled matrix built from the cooccurrence matrix @m'@ of
-- dimension @n@.
logDistributional' :: Int -> Matrix Int -> Acc (Matrix Double)
logDistributional' n m' = termDivNan sharedInfo ownInfo
  where
    -- Cooccurrence counts converted from Int to Double inside Accelerate.
    cooc = map fromIntegral (use m')

    -- Scalar: sum of every cell of the cooccurrence matrix.
    grandTotal = the (sum (flatten cooc))

    -- The cooccurrence matrix multiplied cell by cell with the identity,
    -- i.e. only its diagonal is kept.
    diagOnly = cooc .* matrixIdentity n

    -- Size n vector s = [s_i]_i: sums of the matrix without its diagonal.
    margins = sum (cooc .- diagOnly)

    -- Matrix nxn [s_i * s_j]_{i,j}: the vector replicated along each of the
    -- two axes, then multiplied cell by cell (outer product with itself).
    marginProducts =
      replicate (constant (Z :. All :. n)) margins
        .* replicate (constant (Z :. n :. All)) margins

    -- Matrix nxn. For each cell, with x = n_{i,j} / (s_i * s_j):
    --   0                      when x = 0,
    --   log (x * grandTotal)   otherwise;
    -- the whole matrix is then multiplied cell by cell by @matrixEye n@.
    -- NOTE(review): matrixEye presumably zeroes the diagonal - confirm.
    mi = matrixEye n
           .* map (lift1 (\x -> cond (x == 0) 0 (log (x * grandTotal))))
                  (cooc ./ marginProducts)

    -- Tensor nxnxn. Matrix mi replicated along the 2nd axis.
    alongSecond = replicate (constant (Z :. All :. n :. All)) mi

    -- Tensor nxnxn. Matrix mi replicated along the 1st axis.
    alongFirst = replicate (constant (Z :. n :. All :. All)) mi

    -- Tensor nxnxn. Cell-wise minimum of the two replications.
    pairMin = zipWith min alongSecond alongFirst

    -- Index predicate, true exactly when k /= i AND k /= j.
    offPair = lift1 (\(Z :. i :. j :. k) -> (k /= i) && (k /= j))

    -- Matrix nxn. Sum of the pairwise minima, with 0 passed as the default.
    -- NOTE(review): condOrDefault presumably substitutes the default where
    -- the predicate fails - confirm in the Utils module.
    sharedInfo = sum (condOrDefault offPair 0 pairMin)

    -- Matrix nxn. Same masked sum over the tensor replicated along the 2nd axis.
    ownInfo = sum (condOrDefault offPair 0 alongSecond)
176
177
178 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
179 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
180 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{} MI_{ik}} \]
181 --
182 -- Mutual information
183 -- \[MI_{ij} = \log(\frac{C_{ij}}{E_{ij}})\]
184 --
185 -- Number of cooccurrences of @i@ and @j@ in the same context of text
186 -- \[C_{ij}\]
187 --
188 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @m@)
189 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
190 --
191 -- Total cooccurrences of term @i@ given a map list of size @m@
192 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
193 --
194 -- Total cooccurrences of terms given a map list of size @m@
195 -- \[N_{m} = \sum_{i}^{m} \sum_{j, j \neq i}^{m} S_{ij}\]
196 --
197
-- | Alternative distributional pipeline, evaluated with the Accelerate
-- interpreter. Reading the chain bottom-up: the input is pushed into
-- Accelerate, converted to Double, turned into a log ratio by @s_mi@, passed
-- through @filter' 0@ and @filterWith 0 100@, combined by 'rIJ', and finally
-- passed through @diagNull@.
-- NOTE(review): @filter'@, @filterWith@ and @diagNull@ are defined outside
-- this module; their exact semantics should be confirmed there.
distributional'' :: Matrix Int -> Matrix Double
distributional'' m = -- run {- $ matMiniMax -}
  run $ diagNull n
      $ rIJ n
      $ filterWith 0 100
      $ filter' 0
      $ s_mi
      $ map fromIntegral
        {- from Int to Double -}
      $ use m
        {- push matrix in Accelerate type -}
  where
    -- Dimension of the input matrix.
    n :: Dim
    n = dim m

    -- TODO fix NaN
    -- Quali TEST: OK
    -- Cell by cell: log (x / y), where x comes from @diagNull n m'@ and
    -- y is @crossProduct n m'@ divided by the replicated grand total.
    s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
    s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
            $ zipWith (/) (crossProduct n m') (total m')

    -- Sum of all cells (sum of the innermost-axis sums), replicated into an
    -- n x n matrix so it can be used cell by cell.
    total :: Acc (Matrix Double) -> Acc (Matrix Double)
    total = replicate (constant (Z :. n :. n)) . sum . sum
234
-- | Applies @matMiniMax@ to the result of @divide@ on @sumRowMin n m@
-- (first argument) and @sumColMin n m@ (second argument).
-- NOTE(review): all four helpers are defined outside this module; presumably
-- a cell-by-cell quotient followed by a min/max normalisation - confirm.
rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
    => Dim -> Acc (Matrix a) -> Acc (Matrix a)
rIJ n m = matMiniMax (divide (sumRowMin n m) (sumColMin n m))
241
-- * For Tests (to be removed)

-- | Performance probe: 'logDistributional' applied to @theMatrixInt n@.
-- TODO : add this in a benchmark folder
distriTest :: Int -> Matrix Double
distriTest = logDistributional . theMatrixInt
247
248