src/Gargantext/Core/Methods/Distances/Accelerate/Distributional.hs

   1 {-|
   2 Module      : Gargantext.Core.Methods.Distances.Accelerate.Distributional
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10
  11 * Distributional Distance metric
  12 __Definition :__ Distributional metric is a relative metric which depends on the
  13 selected list, it represents structural equivalence of mutual information.
  14
  15 __Objective :__ We want to compute with matrices processing the similarity between term $i$ and term $j$ :
  16  distr(i,j)=$\frac{\Sigma_{k \neq i,j} min(\frac{n_{ik}^2}{n_{ii}n_{kk}},\frac{n_{jk}^2}{n_{jj}n_{kk}})}{\Sigma_{k \neq i}\frac{n_{ik}^2}{ n_{ii}n_{kk}}}$
  17
  18 where $n_{ij}$ is the cooccurrence between term $i$ and term $j$
  19
  20 * For a vector V=[$x_1$ ... $x_n$], we note $|V|_1=\Sigma_ix_i$
  21 * operator : .* and ./ cell by cell multiplication and division of the matrix
  22 * operator * is the matrix multiplication
  23 * Matrice M=[$n_{ij}$]$_{i,j}$
  24 * opérateur : Diag(M)=[$n_{ii}$]$_i$ (vecteur)
  25 * Id= identity matrix
  26 * O=[1]$_{i,j}$ (matrice one)
  27 * D(M)=Id .* M
  28 * O * D(M) =[$n_{jj}$]$_{i,j}$
  29 * D(M) * O =[$n_{ii}$]$_{i,j}$
  30 * $V_i=[0~0~0~1~0~0~0]'$ en i
  31 * MI=(M ./ O * D(M)) .* (M / D(M) * O )
  32 * distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$
  33
  34 [Specifications written by David Chavalarias on Garg v4 shared NodeWrite, team Pyremiel 2020]
  35
  36 -}
  37
  38 {-# LANGUAGE TypeFamilies        #-}
  39 {-# LANGUAGE TypeOperators       #-}
  40 {-# LANGUAGE ScopedTypeVariables #-}
  41 {-# LANGUAGE ViewPatterns        #-}
  42
  43 module Gargantext.Core.Methods.Distances.Accelerate.Distributional
  44   where
  45
  46 -- import qualified Data.Foldable as P (foldl1)
  47 -- import Debug.Trace (trace)
  48 import Data.Array.Accelerate
  49 import Data.Array.Accelerate.Interpreter (run)
  50 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
  51 import qualified Gargantext.Prelude as P
  52 -- import Data.Array.Accelerate.LinearAlgebra (identity) TODO
  53 -----------------------------------------------------------------------
  54
  55 -- * Distributional Distance
  56 distributional :: Matrix Int -> Matrix Double
  57 distributional m' = run z
  58  where
  59     m = map fromIntegral $ use m'
  60     n = dim m'
  61
  62     d_m   = (.*) (matrixIdentity n) m
  63
  64     o_d_m = (#*#) (matrixOne n) d_m
  65     d_m_o = transpose o_d_m
  66
  67     mi    = (.*) ((./) m o_d_m) ((./) m d_m_o)
  68     d_mi  = (.*) (matrixIdentity n) mi
  69
  70     w     = (.-) mi d_mi
  71
  72     z     = (#*#) w (matrixOne n)
  73     z'    = transpose z
  74
  75     min_z_z' = zipWith min z z'
  76
  77     result = (./) min_z_z' z
  78
  79
  80
  81
  82 --
  83 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
  84 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
  85 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
  86 --
  87 -- Mutual information
  88 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
  89 --
  90 -- Number of cooccurrences of @i@ and @j@ in the same context of text
  91 --                        \[C{ij}\]
  92 --
  93 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
  94 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
  95 --
  96 -- Total cooccurrences of term @i@ given a map list of size @m@
  97 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
  98 --
  99 -- Total cooccurrences of terms given a map list of size @m@
 100 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 101 --
 102
 103 distributional'' :: Matrix Int -> Matrix Double
 104 distributional'' m = -- run {- $ matMiniMax -}
 105                    run  $ diagNull n
 106                        $ rIJ n
 107                        $ filterWith 0 100
 108                        $ filter' 0
 109                        $ s_mi
 110                        $ map fromIntegral
 111                           {- from Int to Double -}
 112                        $ use m
 113                           {- push matrix in Accelerate type -}
 114   where
 115
 116     _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 117     _ri mat = mat1 -- zipWith (/) mat1 mat2
 118       where
 119         mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
 120         _mat2 = total mat
 121
 122     _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
 123     _myMin = replicate (constant (Z :. n :. All)) . minimum
 124
 125
 126     -- TODO fix NaN
 127     -- Quali TEST: OK
 128     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 129     s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
 130             $ zipWith (/) (crossProduct n m') (total m')
 131             -- crossProduct n m'
 132
 133
 134     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 135     total = replicate (constant (Z :. n :. n)) . sum . sum
 136
 137     n :: Dim
 138     n = dim m
 139
 140 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
 141     => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 142 rIJ n m = matMiniMax $ divide a b
 143   where
 144     a = sumRowMin n m
 145     b = sumColMin n m
 146
 147 -- * For Tests (to be removed)
 148 -- | Test perfermance with this matrix
 149 -- TODO : add this in a benchmark folder
 150 distriTest :: Int -> Matrix Double
 151 distriTest n = distributional (theMatrixInt n)
 152
 153