2 Module : Gargantext.Core.Methods.Similarities.Accelerate.Distributional
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 * Distributional Similarity metric
12 __Definition :__ The distributional metric is a relative metric that depends on
13 the selected list; it represents the structural equivalence of mutual information.
15 __Objective :__ We want to compute, using matrix operations, the similarity between term $i$ and term $j$:
16 distr(i,j)=$\frac{\Sigma_{k \neq i,j} min(\frac{n_{ik}^2}{n_{ii}n_{kk}},\frac{n_{jk}^2}{n_{jj}n_{kk}})}{\Sigma_{k \neq i}\frac{n_{ik}^2}{ n_{ii}n_{kk}}}$
18 where $n_{ij}$ is the cooccurrence between term $i$ and term $j$
20 * For a vector V=[$x_1$ ... $x_n$], we note $|V|_1=\Sigma_ix_i$
21 * operator : .* and ./ cell by cell multiplication and division of the matrix
22 * operator * is the matrix multiplication
23 * Matrix M=[$n_{ij}$]$_{i,j}$
24 * operator : Diag(M)=[$n_{ii}$]$_i$ (vector)
26 * O=[1]$_{i,j}$ (matrix of ones)
28 * O * D(M) =[$n_{jj}$]$_{i,j}$
29 * D(M) * O =[$n_{ii}$]$_{i,j}$
30 * $V_i=[0~0~0~1~0~0~0]'$ with the 1 at position i
31 * MI=(M ./ O * D(M)) .* (M ./ D(M) * O )
32 * distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$
34 [Specifications written by David Chavalarias on Garg v4 shared NodeWrite, team Pyremiel 2020]
38 {-# LANGUAGE TypeFamilies #-}
39 {-# LANGUAGE TypeOperators #-}
40 {-# LANGUAGE ScopedTypeVariables #-}
41 {-# LANGUAGE ViewPatterns #-}
42 {-# LANGUAGE GADTs #-}
44 module Gargantext.Core.Methods.Similarities.Accelerate.Distributional
47 -- import qualified Data.Foldable as P (foldl1)
48 -- import Debug.Trace (trace)
49 import Data.Array.Accelerate as A
50 -- import Data.Array.Accelerate.Interpreter (run)
51 import Data.Array.Accelerate.LLVM.Native (run) -- TODO: try runQ?
52 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
53 import qualified Gargantext.Prelude as P
56 import Prelude (show, mappend{- , String, (<>), fromIntegral, flip -})
58 import qualified Prelude
60 -- | `distributional m` returns the distributional distance between each
61 -- pair of terms as a matrix. The argument m is the matrix $[n_{ij}]_{i,j}$
62 -- where $n_{ij}$ is the cooccurrence between term $i$ and term $j$.
64 -- ## Basic example with Matrix of size 3:
67 -- Matrix (Z :. 3 :. 3)
72 -- >>> distributional $ theMatrixInt 3
73 -- Matrix (Z :. 3 :. 3)
74 -- [ 1.0, 0.0, 0.9843749999999999,
78 -- ## Basic example with Matrix of size 4:
81 -- Matrix (Z :. 4 :. 4)
87 -- >>> distributional $ theMatrixInt 4
88 -- Matrix (Z :. 4 :. 4)
89 -- [ 1.0, 0.0, 0.5714285714285715, 0.8421052631578947,
90 -- 0.0, 1.0, 1.0, 1.0,
91 -- 8.333333333333333e-2, 4.6875e-2, 1.0, 0.25,
92 -- 0.3333333333333333, 5.7692307692307696e-2, 1.0, 1.0]
-- Builds the Accelerate computation of the distributional similarity matrix,
-- following MI = (M ./ O*D(M)) .* (M ./ D(M)*O) from the module header.
-- NOTE(review): the `where` keyword and the bindings of `n` and `diag_m` sit on
-- lines that are not visible in this view; `diag_m` is presumably the diagonal
-- of `m` as a vector -- confirm against the full file.
94 distributional :: Matrix Int -> Acc (Matrix Double)
95 distributional m' = result
-- Embed the host matrix into the Accelerate program and convert every cell to Double.
97 m = map A.fromIntegral $ use m'
-- d_1 and d_2 expand the vector diag_m to two dimensions, once along each axis.
102 d_1 = replicate (constant (Z :. n :. All)) diag_m
103 d_2 = replicate (constant (Z :. All :. n)) diag_m
-- Cell-by-cell product of the two cell-by-cell quotients m ./ d_1 and m ./ d_2.
105 mi = (.*) ((./) m d_1) ((./) m d_2)
109 -- The matrix permutation is taken care of below by directly replicating
110 -- the matrix mi, making the matrix w unnecessary and saving one step.
111 w_1 = replicate (constant (Z :. All :. n :. All)) mi
112 w_2 = replicate (constant (Z :. n :. All :. All)) mi
-- Cell-by-cell minimum of the two rank-3 replications of mi.
113 w' = zipWith min w_1 w_2
115 -- The matrix ii = [r_{i,j,k}]_{i,j,k} has r_(i,j,k) = 0 if k = i OR k = j
116 -- and r_(i,j,k) = 1 otherwise (i.e. k /= i AND k /= j).
117 ii = generate (constant (Z :. n :. n :. n))
118 (lift1 (\(Z :. i :. j :. k) -> cond ((&&) ((/=) k i) ((/=) k j)) 1 0))
-- Mask with ii, then reduce the innermost axis (k) with `sum`, giving two n x n matrices.
120 z_1 = sum ((.*) w' ii)
121 z_2 = sum ((.*) w_1 ii)
-- NOTE(review): termDivNan is defined outside this view; presumably a
-- term-by-term division of z_1 by z_2 with special handling of NaN -- confirm.
123 result = termDivNan z_1 z_2
-- Evaluates the `logDistributional'` computation with `run` (imported from
-- Data.Array.Accelerate.LLVM.Native) and wraps the resulting host matrix in a
-- trace message that reports the dimension `n`.
-- NOTE(review): `n` is bound on a line not visible in this view, and `trace` is
-- not the Debug.Trace one (that import is commented out above); its origin is
-- outside this view -- confirm.
125 logDistributional :: Matrix Int -> Matrix Double
126 logDistributional m = trace ("logDistributional, dim=" `mappend` show n) . run
129 $ logDistributional' n m
-- Builds the Accelerate computation of the log-based distributional similarity
-- for an n x n cooccurrence matrix m': it derives the matrix `mi` (log mutual
-- information) and returns `termDivNan sumMin sumM`, where both operands are
-- produced from `mi` by `sumMin_go` and `sumM_go`.
-- NOTE(review): the `where` keyword and the bindings of `s` and `ss` sit on
-- lines that are not visible in this view.
133 logDistributional' :: Int -> Matrix Int -> Acc (Matrix Double)
134 logDistributional' n m' = trace ("logDistributional'") result
136 -- From Matrix Int to Matrix Double, i.e :
137 -- m :: Matrix Int -> Matrix Double
138 m = map A.fromIntegral $ use m'
140 -- Scalar. Sum of all elements of m.
141 to = the $ sum (flatten m)
143 -- Diagonal matrix with the diagonal of m.
144 d_m = (.*) m (matrixIdentity n)
146 -- Size n vector. s = [s_i]_i
149 -- Matrix nxn. Vector s replicated as rows.
150 s_1 = replicate (constant (Z :. All :. n)) s
151 -- Matrix nxn. Vector s replicated as columns.
152 s_2 = replicate (constant (Z :. n :. All)) s
154 -- Matrix nxn. ss = [s_i * s_j]_{i,j}. Outer product of s with itself.
157 -- Matrix nxn. mi = [m_{i,j}]_{i,j} where
158 -- m_{i,j} = 0 if n_{i,j} = 0 or i = j,
159 -- m_{i,j} = log(to * n_{i,j} / ss_{i,j}) otherwise.
-- The `cond (x == 0) 0 ...` guard avoids taking the log of a zero cell; the
-- multiplication by `matrixEye n` is what implements the "i = j" case above
-- (NOTE(review): matrixEye is defined outside this view -- confirm it has a zero diagonal).
160 mi = (.*) (matrixEye n)
161 (map (lift1 (\x -> cond (x == 0) 0 (log (x * to)))) ((./) m ss))
163 -- mi_nnz = flip indexArray Z . run $
164 -- foldAll (+) 0 $ map (\a -> ifThenElse (abs a < 10^(-6 :: Exp Int)) 0 1) mi
168 -- reportMat :: String -> Int -> Int -> String
169 -- reportMat name nnz tot = name <> ": " <> show nnz <> "nnz / " <> show tot <>
170 -- " | " <> show pc <> "%"
171 -- where pc = 100 * Prelude.fromIntegral nnz / Prelude.fromIntegral tot :: Double
173 -- Tensor nxnxn. Matrix mi replicated along the 2nd axis.
174 -- w_1 = trace (reportMat "mi" mi_nnz mi_total) $ replicate (constant (Z :. All :. n :. All)) mi
176 -- w1_nnz = flip indexArray Z . run $
177 -- foldAll (+) 0 $ map (\a -> ifThenElse (abs a < 10^(-6 :: Exp Int)) 0 1) w_1
180 -- Tensor nxnxn. Matrix mi replicated along the 1st axis.
181 -- w_2 = trace (reportMat "w1" w1_nnz w1_total) $ replicate (constant (Z :. n :. All :. All)) mi
184 -- w' = trace "w'" $ zipWith min w_1 w_2
186 -- A predicate that is true when the input (i, j, k) satisfy
188 -- k_diff_i_and_j = lift1 (\(Z :. i :. j :. k) -> ((&&) ((/=) k i) ((/=) k j)))
-- The rank-3 formulation kept in the comments above is replaced by the direct
-- n x n generators sumMin_go and sumM_go.
191 sumMin = trace "sumMin" $ sumMin_go n mi -- sum (condOrDefault k_diff_i_and_j 0 w')
193 -- Matrix nxn. All columns are the same.
194 sumM = trace "sumM" $ sumM_go n mi -- trace "sumM" $ sum (condOrDefault k_diff_i_and_j 0 w_1)
-- NOTE(review): termDivNan is defined outside this view; presumably a
-- term-by-term division with special handling of NaN -- confirm.
196 result = termDivNan sumMin sumM
199 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
200 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
201 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{} MI_{ik}} \]
203 -- Mutual information
204 -- \[S_{MI}({i},{j}) = \log(\frac{C_{ij}}{E_{ij}})\]
206 -- Number of cooccurrences of @i@ and @j@ in the same context of text
209 -- The expected value of the cooccurrences of @i@ and @j@ (given a map list of size @m@)
210 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
212 -- Total cooccurrences of term @i@ given a map list of size @m@
213 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
215 -- Total cooccurrences of terms given a map list of size @m@
216 -- \[N_{m} = \sum_{i}^{m} \sum_{j, j \neq i}^{m} S_{ij}\]
-- Variant that returns a plain `Matrix Double` rather than an `Acc` computation.
-- NOTE(review): the pipeline body, the `where` keyword and the binding of `n`
-- sit on lines that are not visible in this view; only the local helpers below
-- can be described from here.
219 distributional'' :: Matrix Int -> Matrix Double
220 distributional'' m = -- run {- $ matMiniMax -}
227 {- from Int to Double -}
229 {- push matrix in Accelerate type -}
-- _ri currently returns mat1 alone; the division by mat2 is commented out.
232 _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
233 _ri mat = mat1 -- zipWith (/) mat1 mat2
-- mat1: cell-by-cell minimum of `_myMin mat` and `_myMin` of the transposed,
-- diagNull'ed and filterWith'ed matrix, then passed to `matSumCol n`.
235 mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
-- _myMin: reduce the innermost axis with `minimum`, then replicate the
-- resulting vector n times along a new outer axis.
238 _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
239 _myMin = replicate (constant (Z :. n :. All)) . minimum
-- s_mi: cell-by-cell log (x / y), where x comes from `diagNull n m'` and y is
-- `crossProduct n m'` divided cell-by-cell by `total m'`. This has the shape of
-- log(C_ij / E_ij) with E_ij = S_i S_j / N_m from the comment preceding this
-- definition (NOTE(review): crossProduct is defined outside this view -- confirm).
244 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
245 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
246 $ zipWith (/) (crossProduct n m') (total m')
-- total: two successive `sum` reductions collapse the matrix to a scalar, which
-- is then replicated into an n x n matrix.
250 total :: Acc (Matrix Double) -> Acc (Matrix Double)
251 total = replicate (constant (Z :. n :. n)) . sum . sum
-- Applies `divide` to `a` and `b` and post-processes the quotient with `matMiniMax`.
-- NOTE(review): `a` and `b` are bound on lines that are not visible in this
-- view (presumably in a `where` clause using `n` and `m`) -- confirm.
256 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
257 => Dim -> Acc (Matrix a) -> Acc (Matrix a)
258 rIJ n m = matMiniMax $ divide a b
263 -- * For Tests (to be removed)
264 -- | Performance-test helper: runs 'logDistributional' on the square test
265 -- matrix @theMatrixInt n@. TODO : add this in a benchmark folder
266 distriTest :: Int -> Matrix Double
267 distriTest = logDistributional . theMatrixInt
272 -- compact repr of "extend along an axis" op?
273 -- general sparse repr ?
-- A shape `sh` extended with one additional innermost `Int` dimension.
275 type Extended sh = sh :. Int
-- A function from an embedded index expression to an embedded element expression.
287 type Delayed sh a = Exp sh -> Exp a
-- Pairs an extended shape (`extSh`) with the function computing the element at
-- each index of that shape (`extFun`).
-- NOTE(review): the closing brace of this record is on a line not visible in this view.
289 data ExtArr sh a = ExtArr
290 { extSh :: Extended sh
291 , extFun :: Delayed (Extended sh) a
295 w_1_{i, j, k} = mi_{i, k}
296 w_2_{i, j, k} = mi_{j, k}
298 w'_{i, j, k} = min w_1_{i, j, k} w_2_{i, j, k}
299 = min mi_{i, k} mi_{j, k}
301 w"_{i, j, k} = 0 if i = k or j = k
302 min mi_{i, k} mi_{j, k} otherwise
304 w_1'_{i, j, k} = 0 if i = k or j = k
307 sumMin_{i, j} = sum_k of w"_{i, j, k}
308 = sum_k (k /= i && k /= j) of min mi_{i, k} mi_{j, k}
310 sumM_{i, j} = sum_k of w_1'_{i, j, k}
311 = sum_k (k /= i && k /= j) of mi_{i, k}
-- Generates an n x n matrix; per the notes preceding this definition,
-- sumM_{i,j} = sum over k, with k /= i and k /= j, of mi_{i,k}.
-- NOTE(review): the enclosing sum, the alternative branch of `cond` and the
-- generator for k sit on lines that are not visible in this view.
315 sumM_go :: (Elt a, Num a) => Int -> Acc (Array DIM2 a) -> Acc (Array DIM2 a)
316 sumM_go n mi = generate (lift (Z :. n :. n)) $ \coord ->
317 let (Z :. i :. j) = unlift coord in
-- k is a host-level value embedded with `constant`; the cell mi_{i,k} is
-- selected only when k differs from both i and j.
319 [ cond (constant k /= i && constant k /= j)
320 (mi ! lift (constant Z :. i :. constant k))
325 sumMin_go :: (Elt a, Num a, Ord a) => Int -> Acc (Array DIM2 a) -> Acc (Array DIM2 a)
326 sumMin_go n mi = generate (constant (Z :. n :. n)) $ \coord ->
327 let (Z :. i :. j) = unlift coord in
329 [ cond (constant k /= i && constant k /= j)
331 (mi ! lift (constant Z :. i :. constant k))
332 (mi ! lift (constant Z :. j :. constant k))