src/Gargantext/Core/Methods/Distances/Accelerate/Distributional.hs

   1 {-|
   2 Module      : Gargantext.Core.Methods.Distances.Accelerate.Distributional
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10
  11 * Distributional Distance metric
  12 __Definition :__ Distributional metric is a relative metric which depends on the
  13 selected list, it represents structural equivalence of mutual information.
  14
  15 __Objective :__ We want to compute with matrices processing the similarity between term $i$ and term $j$ :
  16  distr(i,j)=$\frac{\Sigma_{k \neq i,j} min(\frac{n_{ik}^2}{n_{ii}n_{kk}},\frac{n_{jk}^2}{n_{jj}n_{kk}})}{\Sigma_{k \neq i}\frac{n_{ik}^2}{ n_{ii}n_{kk}}}$
  17
  18 where $n_{ij}$ is the cooccurrence between term $i$ and term $j$
  19
  20 * For a vector V=[$x_1$ ... $x_n$], we note $|V|_1=\Sigma_ix_i$
  21 * operator : .* and ./ cell by cell multiplication and division of the matrix
  22 * operator * is the matrix multiplication
  23 * Matrice M=[$n_{ij}$]$_{i,j}$
  24 * opérateur : Diag(M)=[$n_{ii}$]$_i$ (vecteur)
  25 * Id= identity matrix
  26 * O=[1]$_{i,j}$ (matrice one)
  27 * D(M)=Id .* M
  28 * O * D(M) =[$n_{jj}$]$_{i,j}$
  29 * D(M) * O =[$n_{ii}$]$_{i,j}$
  30 * $V_i=[0~0~0~1~0~0~0]'$ en i
  31 * MI=(M ./ O * D(M)) .* (M / D(M) * O )
  32 * distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$
  33
  34 [Specifications written by David Chavalarias on Garg v4 shared NodeWrite, team Pyremiel 2020]
  35
  36 -}
  37
  38 {-# LANGUAGE TypeFamilies        #-}
  39 {-# LANGUAGE TypeOperators       #-}
  40 {-# LANGUAGE ScopedTypeVariables #-}
  41 {-# LANGUAGE ViewPatterns        #-}
  42
  43 module Gargantext.Core.Methods.Distances.Accelerate.Distributional
  44   where
  45
  46 -- import qualified Data.Foldable as P (foldl1)
  47 -- import Debug.Trace (trace)
  48 import Data.Array.Accelerate
  49 import Data.Array.Accelerate.Interpreter (run)
  50 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
  51 import qualified Gargantext.Prelude as P
  52
  53 -- | `distributional m` returns the distributional distance between terms each
  54 -- pair of terms as a matrix.  The argument m is the matrix $[n_{ij}]_{i,j}$
  55 -- where $n_{ij}$ is the coocccurrence between term $i$ and term $j$.
  56 --
  57 -- ## Basic example with Matrix of size 3:
  58 --
  59 -- >>> theMatrixInt 3
  60 -- Matrix (Z :. 3 :. 3)
  61 --   [ 7, 4, 0,
  62 --     4, 5, 3,
  63 --     0, 3, 4]
  64 --
  65 -- >>> distributional $ theMatrixInt 3
  66 -- Matrix (Z :. 3 :. 3)
  67 --   [ 1.0, 0.0, 0.9843749999999999,
  68 --     0.0, 1.0,                0.0,
  69 --     1.0, 0.0,                1.0]
  70 --
  71 -- ## Basic example with Matrix of size 4:
  72 --
  73 -- >>> theMatrixInt 4
  74 -- Matrix (Z :. 4 :. 4)
  75 --   [ 4, 1, 2, 1,
  76 --     1, 4, 0, 0,
  77 --     2, 0, 3, 3,
  78 --     1, 0, 3, 3]
  79 --
  80 -- >>> distributional $ theMatrixInt 4
  81 -- Matrix (Z :. 4 :. 4)
  82 --   [                  1.0,                   0.0, 0.5714285714285715, 0.8421052631578947,
  83 --                      0.0,                   1.0,                1.0,                1.0,
  84 --     8.333333333333333e-2,             4.6875e-2,                1.0,               0.25,
  85 --       0.3333333333333333, 5.7692307692307696e-2,                1.0,                1.0]
  86 --
  87 distributional :: Matrix Int -> Matrix Double
  88 distributional m' = run result
  89  where
  90     m = map fromIntegral $ use m'
  91     n = dim m'
  92
  93     diag_m = diag m
  94
  95     d_1 = replicate (constant (Z :. n :. All)) diag_m
  96     d_2 = replicate (constant (Z :. All :. n)) diag_m
  97
  98     mi    = (.*) ((./) m d_1) ((./) m d_2)
  99
 100     -- w = (.-) mi d_mi
 101
 102     -- The matrix permutations is taken care of below by directly replicating
 103     -- the matrix mi, making the matrix w unneccessary and saving one step.
 104     w_1 = replicate (constant (Z :. All :. n :. All)) mi
 105     w_2 = replicate (constant (Z :. n :. All :. All)) mi
 106     w' = zipWith min w_1 w_2
 107
 108     -- The matrix ii = [r_{i,j,k}]_{i,j,k} has r_(i,j,k) = 0 if k = i OR k = j
 109     -- and r_(i,j,k) = 1 otherwise (i.e. k /= i AND k /= j).
 110     ii = generate (constant (Z :. n :. n :. n))
 111         (lift1 (\(Z :. i :. j :. k) -> cond ((&&) ((/=) k i) ((/=) k j)) 1 0))
 112
 113     z_1 = sum ((.*) w' ii)
 114     z_2 = sum ((.*) w_1 ii)
 115
 116     result = termDivNan z_1 z_2
 117
 118
 119 --
 120 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
 121 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 122 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
 123 --
 124 -- Mutual information
 125 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 126 --
 127 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 128 --                        \[C{ij}\]
 129 --
 130 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 131 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 132 --
 133 -- Total cooccurrences of term @i@ given a map list of size @m@
 134 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 135 --
 136 -- Total cooccurrences of terms given a map list of size @m@
 137 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 138 --
 139
 140 distributional'' :: Matrix Int -> Matrix Double
 141 distributional'' m = -- run {- $ matMiniMax -}
 142                    run  $ diagNull n
 143                        $ rIJ n
 144                        $ filterWith 0 100
 145                        $ filter' 0
 146                        $ s_mi
 147                        $ map fromIntegral
 148                           {- from Int to Double -}
 149                        $ use m
 150                           {- push matrix in Accelerate type -}
 151   where
 152
 153     _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 154     _ri mat = mat1 -- zipWith (/) mat1 mat2
 155       where
 156         mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
 157         _mat2 = total mat
 158
 159     _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
 160     _myMin = replicate (constant (Z :. n :. All)) . minimum
 161
 162
 163     -- TODO fix NaN
 164     -- Quali TEST: OK
 165     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 166     s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
 167             $ zipWith (/) (crossProduct n m') (total m')
 168             -- crossProduct n m'
 169
 170
 171     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 172     total = replicate (constant (Z :. n :. n)) . sum . sum
 173
 174     n :: Dim
 175     n = dim m
 176
 177 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
 178     => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 179 rIJ n m = matMiniMax $ divide a b
 180   where
 181     a = sumRowMin n m
 182     b = sumColMin n m
 183
 184 -- * For Tests (to be removed)
 185 -- | Test perfermance with this matrix
 186 -- TODO : add this in a benchmark folder
 187 distriTest :: Int -> Matrix Double
 188 distriTest n = distributional (theMatrixInt n)
 189
 190