src/Gargantext/Core/Methods/Similarities/Accelerate/Distributional.hs

   1 {-|
   2 Module      : Gargantext.Core.Methods.Similarities.Accelerate.Distributional
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10
  11 * Distributional Similarity metric
  12 __Definition :__ Distributional metric is a relative metric which depends on the
  13 selected list, it represents structural equivalence of mutual information.
  14
  15 __Objective :__ We want to compute with matrices processing the similarity between term $i$ and term $j$ :
  16  distr(i,j)=$\frac{\Sigma_{k \neq i,j} min(\frac{n_{ik}^2}{n_{ii}n_{kk}},\frac{n_{jk}^2}{n_{jj}n_{kk}})}{\Sigma_{k \neq i}\frac{n_{ik}^2}{ n_{ii}n_{kk}}}$
  17
  18 where $n_{ij}$ is the cooccurrence between term $i$ and term $j$
  19
  20 * For a vector V=[$x_1$ ... $x_n$], we note $|V|_1=\Sigma_ix_i$
  21 * operator : .* and ./ cell by cell multiplication and division of the matrix
  22 * operator * is the matrix multiplication
  23 * Matrice M=[$n_{ij}$]$_{i,j}$
  24 * opérateur : Diag(M)=[$n_{ii}$]$_i$ (vecteur)
  25 * Id= identity matrix
  26 * O=[1]$_{i,j}$ (matrice one)
  27 * D(M)=Id .* M
  28 * O * D(M) =[$n_{jj}$]$_{i,j}$
  29 * D(M) * O =[$n_{ii}$]$_{i,j}$
  30 * $V_i=[0~0~0~1~0~0~0]'$ en i
  31 * MI=(M ./ O * D(M)) .* (M / D(M) * O )
  32 * distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$
  33
  34 [Finally, we have used as convention the Distributional metric used in Legacy GarganText](https://gitlab.iscpif.fr/gargantext/haskell-gargantext/issues/50)
  35
  36        mi = defaultdict(lambda : defaultdict(int))
  37         total_cooc = x.sum().sum()
  38
  39         for i in matrix.keys():
  40             si = sum([matrix[i][j] for j in matrix[i].keys() if i != j])
  41             for j in matrix[i].keys():
  42                 sj = sum([matrix[j][k] for k in matrix[j].keys() if j != k])
  43                 if i!=j :
  44                     mi[i][j] = log( matrix[i][j] / ((si * sj) / total_cooc) )
  45
  46         r = defaultdict(lambda : defaultdict(int))
  47
  48         for i in matrix.keys():
  49             for j in matrix.keys():
  50                 sumMin = sum(
  51                                 [
  52                                 min(mi[i][k], mi[j][k])
  53                                     for k in matrix.keys()
  54                                     if i != j and k != i and k != j and mi[i][k] > 0
  55                                 ]
  56                             )
  57
  58                 sumMi  = sum(
  59                                 [
  60                                 mi[i][k]
  61                                     for k in matrix.keys()
  62                                     if k != i and k != j and mi[i][k] > 0
  63                                 ]
  64                             )
  65
  66                 try:
  67                     r[i][j] = sumMin / sumMi
  68                 except Exception as error:
  69                     r[i][j] = 0
  70
  71         # Need to filter the weak links, automatic threshold here
  72         minmax = min([ max([ r[i][j] for i in r.keys()]) for j in r.keys()])
  73
  74         G = nx.DiGraph()
  75         G.add_edges_from(
  76                           [
  77                             (i, j, {'weight': r[i][j]})
  78                                 for i in r.keys() for j in r.keys()
  79                                 if i != j and r[i][j] > minmax and r[i][j] > r[j][i]
  80                           ]
  81                         )
  82 -}
  83
  84 {-# LANGUAGE TypeFamilies        #-}
  85 {-# LANGUAGE TypeOperators       #-}
  86 {-# LANGUAGE ScopedTypeVariables #-}
  87 {-# LANGUAGE ViewPatterns        #-}
  88 {-# LANGUAGE GADTs               #-}
  89
  90 module Gargantext.Core.Methods.Similarities.Accelerate.Distributional
  91   where
  92
  93 -- import qualified Data.Foldable as P (foldl1)
  94 -- import Debug.Trace (trace)
  95 import Data.Array.Accelerate as A
  96 -- import Data.Array.Accelerate.Interpreter (run)
  97 import Data.Array.Accelerate.LLVM.Native (run) -- TODO: try runQ?
  98 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
  99 import qualified Gargantext.Prelude as P
 100
 101 import Debug.Trace
 102 import Prelude (show, mappend{- , String, (<>), fromIntegral, flip -})
 103
 104 import qualified Prelude
 105
 106 -- | `distributional m` returns the distributional distance between terms each
 107 -- pair of terms as a matrix.  The argument m is the matrix $[n_{ij}]_{i,j}$
 108 -- where $n_{ij}$ is the coocccurrence between term $i$ and term $j$.
 109 --
 110 -- ## Basic example with Matrix of size 3:
 111 --
 112 -- >>> theMatrixInt 3
 113 -- Matrix (Z :. 3 :. 3)
 114 --   [ 7, 4, 0,
 115 --     4, 5, 3,
 116 --     0, 3, 4]
 117 --
 118 -- >>> distributional $ theMatrixInt 3
 119 -- Matrix (Z :. 3 :. 3)
 120 --   [ 1.0, 0.0, 0.9843749999999999,
 121 --     0.0, 1.0,                0.0,
 122 --     1.0, 0.0,                1.0]
 123 --
 124 -- ## Basic example with Matrix of size 4:
 125 --
 126 -- >>> theMatrixInt 4
 127 -- Matrix (Z :. 4 :. 4)
 128 --   [ 4, 1, 2, 1,
 129 --     1, 4, 0, 0,
 130 --     2, 0, 3, 3,
 131 --     1, 0, 3, 3]
 132 --
 133 -- >>> distributional $ theMatrixInt 4
 134 -- Matrix (Z :. 4 :. 4)
 135 --   [                  1.0,                   0.0, 0.5714285714285715, 0.8421052631578947,
 136 --                      0.0,                   1.0,                1.0,                1.0,
 137 --     8.333333333333333e-2,             4.6875e-2,                1.0,               0.25,
 138 --       0.3333333333333333, 5.7692307692307696e-2,                1.0,                1.0]
 139 --
 140 distributional :: Matrix Int -> Matrix Double
 141 distributional m' = run $ result
 142  where
 143     m = map A.fromIntegral $ use m'
 144     n = dim m'
 145
 146     diag_m = diag m
 147
 148     d_1 = replicate (constant (Z :. n :. All)) diag_m
 149     d_2 = replicate (constant (Z :. All :. n)) diag_m
 150
 151     mi    = (.*) ((./) m d_1) ((./) m d_2)
 152
 153     -- w = (.-) mi d_mi
 154
 155     -- The matrix permutations is taken care of below by directly replicating
 156     -- the matrix mi, making the matrix w unneccessary and saving one step.
 157     w_1 = replicate (constant (Z :. All :. n :. All)) mi
 158     w_2 = replicate (constant (Z :. n :. All :. All)) mi
 159     w' = zipWith min w_1 w_2
 160
 161     -- The matrix ii = [r_{i,j,k}]_{i,j,k} has r_(i,j,k) = 0 if k = i OR k = j
 162     -- and r_(i,j,k) = 1 otherwise (i.e. k /= i AND k /= j).
 163     ii = generate (constant (Z :. n :. n :. n))
 164         (lift1 (\(Z :. i :. j :. k) -> cond ((&&) ((/=) k i) ((/=) k j)) 1 0))
 165
 166     z_1 = sum ((.*) w' ii)
 167     z_2 = sum ((.*) w_1 ii)
 168
 169     result = termDivNan z_1 z_2
 170
 171 logDistributional2 :: Matrix Int -> Matrix Double
 172 logDistributional2 m = trace ("logDistributional2, dim=" `mappend` show n) . run
 173                     $ diagNull n
 174                     $ matMaxMini
 175                     $ logDistributional' n m
 176   where
 177     n = dim m
 178
 179 logDistributional' :: Int -> Matrix Int -> Acc (Matrix Double)
 180 logDistributional' n m' = trace ("logDistributional'") result
 181  where
 182     -- From Matrix Int to Matrix Double, i.e :
 183     -- m :: Matrix Int -> Matrix Double
 184     m = map A.fromIntegral $ use m'
 185
 186     -- Scalar. Sum of all elements of m.
 187     to = the $ sum (flatten m)
 188
 189     -- Diagonal matrix with the diagonal of m.
 190     d_m = (.*) m (matrixIdentity n)
 191
 192     -- Size n vector. s = [s_i]_i
 193     s = sum ((.-) m d_m)
 194
 195     -- Matrix nxn. Vector s replicated as rows.
 196     s_1 = replicate (constant (Z :. All :. n)) s
 197     -- Matrix nxn. Vector s replicated as columns.
 198     s_2 = replicate (constant (Z :. n :. All)) s
 199
 200     -- Matrix nxn. ss = [s_i * s_j]_{i,j}. Outer product of s with itself.
 201     ss = (.*) s_1 s_2
 202
 203     -- Matrix nxn. mi = [m_{i,j}]_{i,j} where
 204     -- m_{i,j} = 0 if n_{i,j} = 0 or i = j,
 205     -- m_{i,j} = log(to * n_{i,j} / s_{i,j}) otherwise.
 206     mi = (.*) (matrixEye n)
 207         (map (lift1 (\x -> cond (x == 0) 0 (log (1 + x * to)))) ((./) m ss))
 208     -- mi_nnz :: Int
 209     -- mi_nnz = flip indexArray Z . run $
 210     --   foldAll (+) 0 $ map (\a -> ifThenElse (abs a < 10^(-6 :: Exp Int)) 0 1) mi
 211
 212     -- mi_total = n*n
 213
 214     -- reportMat :: String -> Int -> Int -> String
 215     -- reportMat name nnz tot = name <> ": " <> show nnz <> "nnz / " <> show tot <>
 216     --                          " | " <> show pc <> "%"
 217     --   where pc = 100 * Prelude.fromIntegral nnz / Prelude.fromIntegral tot :: Double
 218
 219     -- Tensor nxnxn. Matrix mi replicated along the 2nd axis.
 220     -- w_1 = trace (reportMat "mi" mi_nnz mi_total) $ replicate (constant (Z :. All :. n :. All)) mi
 221     -- w1_nnz :: Int
 222     -- w1_nnz = flip indexArray Z . run $
 223     --   foldAll (+) 0 $ map (\a -> ifThenElse (abs a < 10^(-6 :: Exp Int)) 0 1) w_1
 224     -- w1_total = n*n*n
 225
 226     -- Tensor nxnxn. Matrix mi replicated along the 1st axis.
 227     -- w_2 = trace (reportMat "w1" w1_nnz w1_total) $ replicate (constant (Z :. n :. All :. All)) mi
 228
 229     -- Tensor nxnxn.
 230     -- w' = trace "w'" $ zipWith min w_1 w_2
 231
 232     -- A predicate that is true when the input (i, j, k) satisfy
 233     -- k /= i AND k /= j
 234     -- k_diff_i_and_j = lift1 (\(Z :. i :. j :. k) -> ((&&) ((/=) k i) ((/=) k j)))
 235
 236     -- Matrix nxn.
 237     sumMin = trace "sumMin" $ sumMin_go n mi -- sum (condOrDefault k_diff_i_and_j 0 w')
 238
 239     -- Matrix nxn. All columns are the same.
 240     sumM = trace "sumM" $ sumM_go n mi -- trace "sumM" $ sum (condOrDefault k_diff_i_and_j 0 w_1)
 241
 242     result = termDivNan sumMin sumM
 243
 244
 245 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
 246 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 247 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
 248 --
 249 -- Mutual information
 250 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 251 --
 252 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 253 --                        \[C{ij}\]
 254 --
 255 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 256 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 257 --
 258 -- Total cooccurrences of term @i@ given a map list of size @m@
 259 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 260 --
 261 -- Total cooccurrences of terms given a map list of size @m@
 262 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 263 --
 264
 265 logDistributional :: Matrix Int -> Matrix Double
 266 logDistributional m' = run $ diagNull n $ result
 267  where
 268     m = map fromIntegral $ use m'
 269     n = dim m'
 270
 271     -- Scalar. Sum of all elements of m.
 272     to = the $ sum (flatten m)
 273
 274     -- Diagonal matrix with the diagonal of m.
 275     d_m = (.*) m (matrixIdentity n)
 276
 277     -- Size n vector. s = [s_i]_i
 278     s = sum ((.-) m d_m)
 279
 280     -- Matrix nxn. Vector s replicated as rows.
 281     s_1 = replicate (constant (Z :. All :. n)) s
 282     -- Matrix nxn. Vector s replicated as columns.
 283     s_2 = replicate (constant (Z :. n :. All)) s
 284
 285     -- Matrix nxn. ss = [s_i * s_j]_{i,j}. Outer product of s with itself.
 286     ss = (.*) s_1 s_2
 287
 288     -- Matrix nxn. mi = [m_{i,j}]_{i,j} where
 289     -- m_{i,j} = 0 if n_{i,j} = 0 or i = j,
 290     -- m_{i,j} = log(to * n_{i,j} / s_{i,j}) otherwise.
 291     mi = (.*) (matrixEye n)
 292         (map (lift1 (\x -> cond (x == 0) 0 (log (x * to)))) ((./) m ss))
 293
 294     -- Tensor nxnxn. Matrix mi replicated along the 2nd axis.
 295     w_1 = replicate (constant (Z :. All :. n :. All)) mi
 296
 297     -- Tensor nxnxn. Matrix mi replicated along the 1st axis.
 298     w_2 = replicate (constant (Z :. n :. All :. All)) mi
 299
 300     -- Tensor nxnxn.
 301     w' = zipWith min w_1 w_2
 302
 303     -- A predicate that is true when the input (i, j, k) satisfy
 304     -- k /= i AND k /= j
 305     k_diff_i_and_j = lift1 (\(Z :. i :. j :. k) -> ((&&) ((/=) k i) ((/=) k j)))
 306
 307     -- Matrix nxn.
 308     sumMin = sum (condOrDefault k_diff_i_and_j 0 w')
 309
 310     -- Matrix nxn. All columns are the same.
 311     sumM = sum (condOrDefault k_diff_i_and_j 0 w_1)
 312
 313     result = termDivNan sumMin sumM
 314
 315
 316
 317
 318 distributional'' :: Matrix Int -> Matrix Double
 319 distributional'' m = -- run {- $ matMaxMini -}
 320                    run  $ diagNull n
 321                        $ rIJ n
 322                        $ filterWith 0 100
 323                        $ filter' 0
 324                        $ s_mi
 325                        $ map A.fromIntegral
 326                           {- from Int to Double -}
 327                        $ use m
 328                           {- push matrix in Accelerate type -}
 329   where
 330
 331     _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 332     _ri mat = mat1 -- zipWith (/) mat1 mat2
 333       where
 334         mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
 335         _mat2 = total mat
 336
 337     _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
 338     _myMin = replicate (constant (Z :. n :. All)) . minimum
 339
 340
 341     -- TODO fix NaN
 342     -- Quali TEST: OK
 343     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 344     s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
 345             $ zipWith (/) (crossProduct n m') (total m')
 346             -- crossProduct n m'
 347
 348
 349     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 350     total = replicate (constant (Z :. n :. n)) . sum . sum
 351
 352     n :: Dim
 353     n = dim m
 354
 355 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
 356     => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 357 rIJ n m = matMaxMini $ divide a b
 358   where
 359     a = sumRowMin n m
 360     b = sumColMin n m
 361
 362 -- * For Tests (to be removed)
 363 -- | Test perfermance with this matrix
 364 -- TODO : add this in a benchmark folder
 365 {-
 366 distriTest :: Int -> Bool
 367 distriTest n = logDistributional m == distributional m
 368   where
 369     m = theMatrixInt n
 370 -}
 371
 372 -- * sparse utils
 373
 374 -- compact repr of "extend along an axis" op?
 375 -- general sparse repr ?
 376
 377 type Extended sh = sh :. Int
 378
 379 data Ext where
 380   Along1 :: Int -> Ext
 381   Along2 :: Int -> Ext
 382
 383 along1 :: Int -> Ext
 384 along1 = Along1
 385
 386 along2 :: Int -> Ext
 387 along2 = Along2
 388
 389 type Delayed sh a = Exp sh -> Exp a
 390
 391 data ExtArr sh a = ExtArr
 392   { extSh  :: Extended sh
 393   , extFun :: Delayed (Extended sh) a
 394   }
 395
 396 {-
 397 w_1_{i, j, k} = mi_{i, k}
 398 w_2_{i, j, k} = mi_{j, k}
 399
 400 w'_{i, j, k} = min  w_1_{i, j, k}  w_2_{i, j, k}
 401              = min  mi_{i, k}  mi_{j, k}
 402
 403 w"_{i, j, k} = 0                          if i = k or j = k
 404                min  mi_{i, k}  mi_{j, k}  otherwise
 405
 406 w_1'_{i, j, k} = 0          if i = k or j = k
 407                  mi_{i, k}  otherwise
 408
 409 sumMin_{i, j} = sum_k of w"_{i, j, k}
 410               = sum_k (k /= i && k /= j) of  min  mi_{i, k}  mi_{j, k}
 411
 412 sumM_{i, j} = sum_k of w_1'_{i, j, k}
 413             = sum_k (k /= i && k /= j) of mi_{i, k}
 414
 415 -}
 416
 417 sumM_go :: (Elt a, Num a) => Int -> Acc (Array DIM2 a) -> Acc (Array DIM2 a)
 418 sumM_go n mi = generate (lift (Z :. n :. n)) $ \coord ->
 419   let (Z :. i :. j) = unlift coord in
 420     Prelude.sum
 421       [ cond (constant k /= i && constant k /= j)
 422              (mi ! lift (constant Z :. i :. constant k))
 423              0
 424       | k <- [0 .. n-1]
 425       ]
 426
 427 sumMin_go :: (Elt a, Num a, Ord a) => Int -> Acc (Array DIM2 a) -> Acc (Array DIM2 a)
 428 sumMin_go n mi = generate (constant (Z :. n :. n)) $ \coord ->
 429   let (Z :. i :. j) = unlift coord in
 430     Prelude.sum
 431       [ cond (constant k /= i && constant k /= j)
 432              (min
 433                (mi ! lift (constant Z :. i :. constant k))
 434                (mi ! lift (constant Z :. j :. constant k))
 435              )
 436              0
 437       | k <- [0 .. n-1]
 438       ]