src/Gargantext/Core/Methods/Similarities/Accelerate/Distributional.hs

   1 {-|
   2 Module      : Gargantext.Core.Methods.Similarities.Accelerate.Distributional
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10
  11 * Distributional Similarity metric
  12 __Definition :__ Distributional metric is a relative metric which depends on the
  13 selected list, it represents structural equivalence of mutual information.
  14
  15 __Objective :__ We want to compute with matrices processing the similarity between term $i$ and term $j$ :
  16  distr(i,j)=$\frac{\Sigma_{k \neq i,j} min(\frac{n_{ik}^2}{n_{ii}n_{kk}},\frac{n_{jk}^2}{n_{jj}n_{kk}})}{\Sigma_{k \neq i}\frac{n_{ik}^2}{ n_{ii}n_{kk}}}$
  17
  18 where $n_{ij}$ is the cooccurrence between term $i$ and term $j$
  19
  20 * For a vector V=[$x_1$ ... $x_n$], we note $|V|_1=\Sigma_ix_i$
  21 * operator : .* and ./ cell by cell multiplication and division of the matrix
  22 * operator * is the matrix multiplication
  23 * Matrice M=[$n_{ij}$]$_{i,j}$
  24 * opérateur : Diag(M)=[$n_{ii}$]$_i$ (vecteur)
  25 * Id= identity matrix
  26 * O=[1]$_{i,j}$ (matrice one)
  27 * D(M)=Id .* M
  28 * O * D(M) =[$n_{jj}$]$_{i,j}$
  29 * D(M) * O =[$n_{ii}$]$_{i,j}$
  30 * $V_i=[0~0~0~1~0~0~0]'$ en i
  31 * MI=(M ./ O * D(M)) .* (M / D(M) * O )
  32 * distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$
  33
  34 [Specifications written by David Chavalarias on Garg v4 shared NodeWrite, team Pyremiel 2020]
  35
  36 -}
  37
  38 {-# LANGUAGE TypeFamilies        #-}
  39 {-# LANGUAGE TypeOperators       #-}
  40 {-# LANGUAGE ScopedTypeVariables #-}
  41 {-# LANGUAGE ViewPatterns        #-}
  42 {-# LANGUAGE GADTs               #-}
  43
  44 module Gargantext.Core.Methods.Similarities.Accelerate.Distributional
  45   where
  46
  47 -- import qualified Data.Foldable as P (foldl1)
  48 -- import Debug.Trace (trace)
  49 import Data.Array.Accelerate as A
  50 -- import Data.Array.Accelerate.Interpreter (run)
  51 import Data.Array.Accelerate.LLVM.Native (run) -- TODO: try runQ?
  52 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
  53 import qualified Gargantext.Prelude as P
  54
  55 import Debug.Trace
  56 import Prelude (show, mappend{- , String, (<>), fromIntegral, flip -})
  57
  58 import qualified Prelude
  59
  60 -- | `distributional m` returns the distributional distance between terms each
  61 -- pair of terms as a matrix.  The argument m is the matrix $[n_{ij}]_{i,j}$
  62 -- where $n_{ij}$ is the coocccurrence between term $i$ and term $j$.
  63 --
  64 -- ## Basic example with Matrix of size 3:
  65 --
  66 -- >>> theMatrixInt 3
  67 -- Matrix (Z :. 3 :. 3)
  68 --   [ 7, 4, 0,
  69 --     4, 5, 3,
  70 --     0, 3, 4]
  71 --
  72 -- >>> distributional $ theMatrixInt 3
  73 -- Matrix (Z :. 3 :. 3)
  74 --   [ 1.0, 0.0, 0.9843749999999999,
  75 --     0.0, 1.0,                0.0,
  76 --     1.0, 0.0,                1.0]
  77 --
  78 -- ## Basic example with Matrix of size 4:
  79 --
  80 -- >>> theMatrixInt 4
  81 -- Matrix (Z :. 4 :. 4)
  82 --   [ 4, 1, 2, 1,
  83 --     1, 4, 0, 0,
  84 --     2, 0, 3, 3,
  85 --     1, 0, 3, 3]
  86 --
  87 -- >>> distributional $ theMatrixInt 4
  88 -- Matrix (Z :. 4 :. 4)
  89 --   [                  1.0,                   0.0, 0.5714285714285715, 0.8421052631578947,
  90 --                      0.0,                   1.0,                1.0,                1.0,
  91 --     8.333333333333333e-2,             4.6875e-2,                1.0,               0.25,
  92 --       0.3333333333333333, 5.7692307692307696e-2,                1.0,                1.0]
  93 --
  94 distributional :: Matrix Int -> Acc (Matrix Double)
  95 distributional m' = result
  96  where
  97     m = map A.fromIntegral $ use m'
  98     n = dim m'
  99
 100     diag_m = diag m
 101
 102     d_1 = replicate (constant (Z :. n :. All)) diag_m
 103     d_2 = replicate (constant (Z :. All :. n)) diag_m
 104
 105     mi    = (.*) ((./) m d_1) ((./) m d_2)
 106
 107     -- w = (.-) mi d_mi
 108
 109     -- The matrix permutations is taken care of below by directly replicating
 110     -- the matrix mi, making the matrix w unneccessary and saving one step.
 111     w_1 = replicate (constant (Z :. All :. n :. All)) mi
 112     w_2 = replicate (constant (Z :. n :. All :. All)) mi
 113     w' = zipWith min w_1 w_2
 114
 115     -- The matrix ii = [r_{i,j,k}]_{i,j,k} has r_(i,j,k) = 0 if k = i OR k = j
 116     -- and r_(i,j,k) = 1 otherwise (i.e. k /= i AND k /= j).
 117     ii = generate (constant (Z :. n :. n :. n))
 118         (lift1 (\(Z :. i :. j :. k) -> cond ((&&) ((/=) k i) ((/=) k j)) 1 0))
 119
 120     z_1 = sum ((.*) w' ii)
 121     z_2 = sum ((.*) w_1 ii)
 122
 123     result = termDivNan z_1 z_2
 124
 125 logDistributional :: Matrix Int -> Matrix Double
 126 logDistributional m = trace ("logDistributional, dim=" `mappend` show n) . run
 127                     $ diagNull n
 128                     $ matMiniMax
 129                     $ logDistributional' n m
 130   where
 131     n = dim m
 132
 133 logDistributional' :: Int -> Matrix Int -> Acc (Matrix Double)
 134 logDistributional' n m' = trace ("logDistributional'") result
 135  where
 136     -- From Matrix Int to Matrix Double, i.e :
 137     -- m :: Matrix Int -> Matrix Double
 138     m = map A.fromIntegral $ use m'
 139
 140     -- Scalar. Sum of all elements of m.
 141     to = the $ sum (flatten m)
 142
 143     -- Diagonal matrix with the diagonal of m.
 144     d_m = (.*) m (matrixIdentity n)
 145
 146     -- Size n vector. s = [s_i]_i
 147     s = sum ((.-) m d_m)
 148
 149     -- Matrix nxn. Vector s replicated as rows.
 150     s_1 = replicate (constant (Z :. All :. n)) s
 151     -- Matrix nxn. Vector s replicated as columns.
 152     s_2 = replicate (constant (Z :. n :. All)) s
 153
 154     -- Matrix nxn. ss = [s_i * s_j]_{i,j}. Outer product of s with itself.
 155     ss = (.*) s_1 s_2
 156
 157     -- Matrix nxn. mi = [m_{i,j}]_{i,j} where
 158     -- m_{i,j} = 0 if n_{i,j} = 0 or i = j,
 159     -- m_{i,j} = log(to * n_{i,j} / s_{i,j}) otherwise.
 160     mi = (.*) (matrixEye n)
 161         (map (lift1 (\x -> cond (x == 0) 0 (log (x * to)))) ((./) m ss))
 162     -- mi_nnz :: Int
 163     -- mi_nnz = flip indexArray Z . run $
 164     --   foldAll (+) 0 $ map (\a -> ifThenElse (abs a < 10^(-6 :: Exp Int)) 0 1) mi
 165
 166     -- mi_total = n*n
 167
 168     -- reportMat :: String -> Int -> Int -> String
 169     -- reportMat name nnz tot = name <> ": " <> show nnz <> "nnz / " <> show tot <>
 170     --                          " | " <> show pc <> "%"
 171     --   where pc = 100 * Prelude.fromIntegral nnz / Prelude.fromIntegral tot :: Double
 172
 173     -- Tensor nxnxn. Matrix mi replicated along the 2nd axis.
 174     -- w_1 = trace (reportMat "mi" mi_nnz mi_total) $ replicate (constant (Z :. All :. n :. All)) mi
 175     -- w1_nnz :: Int
 176     -- w1_nnz = flip indexArray Z . run $
 177     --   foldAll (+) 0 $ map (\a -> ifThenElse (abs a < 10^(-6 :: Exp Int)) 0 1) w_1
 178     -- w1_total = n*n*n
 179
 180     -- Tensor nxnxn. Matrix mi replicated along the 1st axis.
 181     -- w_2 = trace (reportMat "w1" w1_nnz w1_total) $ replicate (constant (Z :. n :. All :. All)) mi
 182
 183     -- Tensor nxnxn.
 184     -- w' = trace "w'" $ zipWith min w_1 w_2
 185
 186     -- A predicate that is true when the input (i, j, k) satisfy
 187     -- k /= i AND k /= j
 188     -- k_diff_i_and_j = lift1 (\(Z :. i :. j :. k) -> ((&&) ((/=) k i) ((/=) k j)))
 189
 190     -- Matrix nxn.
 191     sumMin = trace "sumMin" $ sumMin_go n mi -- sum (condOrDefault k_diff_i_and_j 0 w')
 192
 193     -- Matrix nxn. All columns are the same.
 194     sumM = trace "sumM" $ sumM_go n mi -- trace "sumM" $ sum (condOrDefault k_diff_i_and_j 0 w_1)
 195
 196     result = termDivNan sumMin sumM
 197
 198
 199 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
 200 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 201 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
 202 --
 203 -- Mutual information
 204 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 205 --
 206 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 207 --                        \[C{ij}\]
 208 --
 209 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 210 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 211 --
 212 -- Total cooccurrences of term @i@ given a map list of size @m@
 213 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 214 --
 215 -- Total cooccurrences of terms given a map list of size @m@
 216 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 217 --
 218
 219 distributional'' :: Matrix Int -> Matrix Double
 220 distributional'' m = -- run {- $ matMiniMax -}
 221                    run  $ diagNull n
 222                        $ rIJ n
 223                        $ filterWith 0 100
 224                        $ filter' 0
 225                        $ s_mi
 226                        $ map A.fromIntegral
 227                           {- from Int to Double -}
 228                        $ use m
 229                           {- push matrix in Accelerate type -}
 230   where
 231
 232     _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 233     _ri mat = mat1 -- zipWith (/) mat1 mat2
 234       where
 235         mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
 236         _mat2 = total mat
 237
 238     _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
 239     _myMin = replicate (constant (Z :. n :. All)) . minimum
 240
 241
 242     -- TODO fix NaN
 243     -- Quali TEST: OK
 244     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 245     s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
 246             $ zipWith (/) (crossProduct n m') (total m')
 247             -- crossProduct n m'
 248
 249
 250     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 251     total = replicate (constant (Z :. n :. n)) . sum . sum
 252
 253     n :: Dim
 254     n = dim m
 255
 256 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
 257     => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 258 rIJ n m = matMiniMax $ divide a b
 259   where
 260     a = sumRowMin n m
 261     b = sumColMin n m
 262
 263 -- * For Tests (to be removed)
 264 -- | Test perfermance with this matrix
 265 -- TODO : add this in a benchmark folder
 266 distriTest :: Int -> Matrix Double
 267 distriTest n = logDistributional (theMatrixInt n)
 268
 269
 270 -- * sparse utils
 271
 272 -- compact repr of "extend along an axis" op?
 273 -- general sparse repr ?
 274
 275 type Extended sh = sh :. Int
 276
 277 data Ext where
 278   Along1 :: Int -> Ext
 279   Along2 :: Int -> Ext
 280
 281 along1 :: Int -> Ext
 282 along1 = Along1
 283
 284 along2 :: Int -> Ext
 285 along2 = Along2
 286
 287 type Delayed sh a = Exp sh -> Exp a
 288
 289 data ExtArr sh a = ExtArr
 290   { extSh  :: Extended sh
 291   , extFun :: Delayed (Extended sh) a
 292   }
 293
 294 {-
 295 w_1_{i, j, k} = mi_{i, k}
 296 w_2_{i, j, k} = mi_{j, k}
 297
 298 w'_{i, j, k} = min  w_1_{i, j, k}  w_2_{i, j, k}
 299              = min  mi_{i, k}  mi_{j, k}
 300
 301 w"_{i, j, k} = 0                          if i = k or j = k
 302                min  mi_{i, k}  mi_{j, k}  otherwise
 303
 304 w_1'_{i, j, k} = 0          if i = k or j = k
 305                  mi_{i, k}  otherwise
 306
 307 sumMin_{i, j} = sum_k of w"_{i, j, k}
 308               = sum_k (k /= i && k /= j) of  min  mi_{i, k}  mi_{j, k}
 309
 310 sumM_{i, j} = sum_k of w_1'_{i, j, k}
 311             = sum_k (k /= i && k /= j) of mi_{i, k}
 312
 313 -}
 314
 315 sumM_go :: (Elt a, Num a) => Int -> Acc (Array DIM2 a) -> Acc (Array DIM2 a)
 316 sumM_go n mi = generate (lift (Z :. n :. n)) $ \coord ->
 317   let (Z :. i :. j) = unlift coord in
 318     Prelude.sum
 319       [ cond (constant k /= i && constant k /= j)
 320              (mi ! lift (constant Z :. i :. constant k))
 321              0
 322       | k <- [0 .. n-1]
 323       ]
 324
 325 sumMin_go :: (Elt a, Num a, Ord a) => Int -> Acc (Array DIM2 a) -> Acc (Array DIM2 a)
 326 sumMin_go n mi = generate (constant (Z :. n :. n)) $ \coord ->
 327   let (Z :. i :. j) = unlift coord in
 328     Prelude.sum
 329       [ cond (constant k /= i && constant k /= j)
 330              (min
 331                (mi ! lift (constant Z :. i :. constant k))
 332                (mi ! lift (constant Z :. j :. constant k))
 333              )
 334              0
 335       | k <- [0 .. n-1]
 336       ]