src/Gargantext/Viz/Graph/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Graph.Distances.Matrix
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13
  14 Implementation use Accelerate library which enables GPU and CPU computation:
  15
  16   * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
  17     [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
  18     In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
  19
  20   * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
  21     [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
  22     In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
  23
  24 -}
  25
  26 {-# LANGUAGE TypeFamilies        #-}
  27 {-# LANGUAGE TypeOperators       #-}
  28 {-# LANGUAGE ScopedTypeVariables #-}
  29 {-# LANGUAGE ViewPatterns        #-}
  30
  31 module Gargantext.Viz.Graph.Distances.Matrice
  32   where
  33
  34 import Debug.Trace (trace)
  35 import Data.Array.Accelerate
  36 import Data.Array.Accelerate.Interpreter (run)
  37 import qualified Gargantext.Prelude as P
  38
  39
  40 -----------------------------------------------------------------------
  41 -- | Define a vector
  42 --
  43 -- >>> vector 3
  44 -- Vector (Z :. 3) [0,1,2]
  45 vector :: Elt c => Int -> [c] -> (Array (Z :. Int) c)
  46 vector n l = fromList (Z :. n) l
  47
  48 -- | Define a matrix
  49 --
  50 -- >>> matrix 3 ([1..] :: [Double])
  51 -- Matrix (Z :. 3 :. 3)
  52 --   [ 1.0, 2.0, 3.0,
  53 --     4.0, 5.0, 6.0,
  54 --     7.0, 8.0, 9.0]
  55 matrix :: Elt c => Int -> [c] -> Matrix c
  56 matrix n l = fromList (Z :. n :. n) l
  57
  58 -- | Two ways to get the rank (as documentation)
  59 --
  60 -- >>> rank (matrix 3 ([1..] :: [Int]))
  61 -- 2
  62 rank :: (Matrix a) -> Int
  63 rank m = arrayRank $ arrayShape m
  64
  65 -----------------------------------------------------------------------
  66 -- | Dimension of a square Matrix
  67 -- How to force use with SquareMatrix ?
  68 type Dim = Int
  69
  70 -- | Get Dimension of a square Matrix
  71 --
  72 -- >>> dim (matrix 3 ([1..] :: [Int]))
  73 -- 3
  74 dim :: Matrix a -> Dim
  75 dim m = n
  76   where
  77     Z :. _ :. n = arrayShape m
  78     -- indexTail (arrayShape m)
  79
  80 -----------------------------------------------------------------------
  81 -- TODO move to Utils
  82 runExp :: Elt e => Exp e -> e
  83 runExp e = indexArray (run (unit e)) Z
  84 -----------------------------------------------------------------------
  85
  86 -- | Sum of a Matrix by Column
  87 --
  88 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
  89 -- Matrix (Z :. 3 :. 3)
  90 --   [ 12.0, 15.0, 18.0,
  91 --     12.0, 15.0, 18.0,
  92 --     12.0, 15.0, 18.0]
  93 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  94 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
  95
  96 matSumCol' :: Matrix Double -> Matrix Double
  97 matSumCol' m = run $ matSumCol n m'
  98   where
  99     n  = dim m
 100     m' = use m
 101
 102
 103 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
 104 -- if you need get the probability on the lines, just transpose it
 105 --
 106 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
 107 -- Matrix (Z :. 3 :. 3)
 108 --   [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
 109 --       0.3333333333333333,  0.3333333333333333,  0.3333333333333333,
 110 --       0.5833333333333334,  0.5333333333333333,                 0.5]
 111 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 112 matProba r mat = zipWith (/) mat (matSumCol r mat)
 113
 114 -- | Diagonal of the matrix
 115 --
 116 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
 117 -- Vector (Z :. 3) [1,5,9]
 118 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
 119 diag m = backpermute (indexTail (shape m))
 120                      (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
 121                      m
 122
 123 -- | Divide by the Diagonal of the matrix
 124 --
 125 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
 126 -- Matrix (Z :. 3 :. 3)
 127 --   [ 1.0, 0.4, 0.3333333333333333,
 128 --     4.0, 1.0, 0.6666666666666666,
 129 --     7.0, 1.6,                1.0]
 130 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 131 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
 132
 133 -----------------------------------------------------------------------
 134 -- | Filters the matrix with the minimum of maximums
 135 --
 136 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
 137 -- Matrix (Z :. 3 :. 3)
 138 --   [ 0.0, 4.0, 7.0,
 139 --     0.0, 5.0, 8.0,
 140 --     0.0, 6.0, 9.0]
 141 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
 142 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
 143   where
 144     miniMax' = (the $ minimum $ maximum m)
 145
 146 -- | Filters the matrix with a constant
 147 --
 148 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
 149 -- Matrix (Z :. 3 :. 3)
 150 --   [ 0.0, 0.0, 7.0,
 151 --     0.0, 0.0, 8.0,
 152 --     0.0, 6.0, 9.0]
 153 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
 154 filter' t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
 155
 156 -----------------------------------------------------------------------
 157 -- * Measures of proximity
 158 -----------------------------------------------------------------------
 159 -- ** Conditional distance
 160
 161 -- *** Conditional distance (basic)
 162
 163 -- | Conditional distance (basic version)
 164 --
 165 -- 2 main measures are actually implemented in order to compute the
 166 -- proximity of two terms: conditional and distributional
 167 --
 168 -- Conditional measure is an absolute measure which reflects
 169 -- interactions of 2 terms in the corpus.
 170 measureConditional :: Matrix Int -> Matrix Double
 171 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
 172 measureConditional m = run $ matProba (dim m)
 173                            $ map fromIntegral
 174                            $ use m
 175
 176
 177 -- *** Conditional distance (advanced)
 178
 179 -- | Conditional distance (advanced version)
 180 --
 181 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
 182 -- "confidence" , is the maximum probability between @i@ and @j@ to see
 183 -- @i@ in the same context of @j@ knowing @j@.
 184 --
 185 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
 186 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
 187 --
 188 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
 189 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 190 conditional' m = ( run $ ie $ map fromIntegral $ use m
 191                  , run $ sg $ map fromIntegral $ use m
 192                  )
 193   where
 194     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
 195     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
 196     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
 197     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
 198
 199     n :: Exp Double
 200     n = P.fromIntegral r
 201
 202     r :: Dim
 203     r = dim m
 204
 205     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
 206     xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
 207     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
 208     ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
 209
 210 -----------------------------------------------------------------------
 211 -- ** Distributional Distance
 212
 213 -- | Distributional Distance Measure
 214 --
 215 -- Distributional measure is a relative measure which depends on the
 216 -- selected list, it represents structural equivalence of mutual information.
 217 --
 218 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
 219 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 220 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
 221 --
 222 -- Mutual information
 223 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 224 --
 225 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 226 --                        \[C{ij}\]
 227 --
 228 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 229 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 230 --
 231 -- Total cooccurrences of term @i@ given a map list of size @m@
 232 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 233 --
 234 -- Total cooccurrences of terms given a map list of size @m@
 235 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 236 --
 237 distributional :: Matrix Int -> Matrix Double
 238 distributional m = run -- $ matMiniMax
 239                        -- $ ri
 240                        -- $ myMin
 241                        $ filter' 0
 242                        $ s_mi
 243                        $ map fromIntegral  -- ^ from Int to Double
 244                        $ use m             -- ^ push matrix in Accelerate type
 245   where
 246     -- filter  m = zipWith (\a b -> max a b) m (transpose m)
 247
 248     ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 249     ri mat = mat1 -- zipWith (/) mat1 mat2
 250       where
 251         mat1 = matSumCol n $ zipWith min'  (myMin mat) (myMin $ transpose mat)
 252         mat2 = total mat
 253
 254     myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
 255     myMin = replicate (constant (Z :. n :. All)) . minimum
 256
 257
 258     -- TODO fix NaN
 259     -- Quali TEST: OK
 260     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 261     s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
 262             $ zipWith (/) (crossProduct n m') (total m')
 263             -- crossProduct n m'
 264
 265
 266     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 267     total = replicate (constant (Z :. n :. n)) . sum . sum
 268
 269     n :: Dim
 270     n = dim m
 271
 272 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
 273 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
 274 identityMatrix n =
 275         let zeros = fill (index2 n n) 0
 276             ones  = fill (index1 n)   1
 277         in
 278         permute const zeros (\(unindex1 -> i) -> index2 i i) ones
 279
 280
 281 eyeMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 282 eyeMatrix n' m =
 283         let ones = fill (index2 n n) 1
 284             zeros  = fill (index1 n)   0
 285             n = constant n'
 286         in
 287         permute const ones (\(unindex1 -> i) -> index2 i i) zeros
 288
 289
 290 selfMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 291 selfMatrix n' m =
 292         let zeros = fill (index2 n n) 0
 293             ones  = fill (index2 n n) 1
 294             n = constant n'
 295         in
 296         permute const ones ( lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 297                                                 -> -- ifThenElse (i /= j)
 298                                                      --         (Z :. i :. j)
 299                                                               (Z :. i :. i)
 300                                     )) zeros
 301
 302
 303 selfMatrix' m' = run $ selfMatrix n m
 304   where
 305     n = dim m'
 306     m = use m'
 307
 308 -------------------------------------------------
 309 diagNull :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 310 diagNull n m = zipWith (*) m eye
 311   where
 312     eye = eyeMatrix n m
 313
 314
 315 -------------------------------------------------
 316 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 317 crossProduct n m = trace (P.show (run m',run m'')) $ zipWith (*) m' m''
 318   where
 319     m'  = cross n m
 320     m'' = cross n (transpose m)
 321
 322 crossT :: Matrix Double -> Matrix Double
 323 crossT  = run . transpose . use
 324
 325 crossProduct' :: Matrix Double -> Matrix Double
 326 crossProduct' m = run $ crossProduct n m'
 327   where
 328     n  = dim m
 329     m' = use m
 330
 331 runWith :: (Arrays c, Elt a1)
 332         => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
 333         -> Matrix a1
 334         -> a2
 335         -> c
 336 runWith f m = run . f (dim m) (use m)
 337
 338 -- | cross
 339 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 340 cross n mat = diagNull n (matSumCol n $ diagNull n mat)
 341
 342 cross' :: Matrix Double -> Matrix Double
 343 cross' mat = run $ cross n mat'
 344   where
 345     mat' = use mat
 346     n = dim mat
 347
 348
 349 -----------------------------------------------------------------------
 350 -----------------------------------------------------------------------
 351 -- * Specificity and Genericity
 352
 353 {- | Metric Specificity and genericity: select terms
 354
 355 - let N termes and occurrences of i \[N{i}\]
 356
 357 - Cooccurrences of i and j \[N{ij}\]
 358 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
 359
 360 - Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
 361 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
 362
 363 - \[Inclusion (i) = Gen(i) + Spec(i)\)
 364 - \[GenericityScore = Gen(i)- Spec(i)\]
 365
 366 - References: Science mapping with asymmetrical paradigmatic proximity
 367 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
 368 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
 369 arXiv:0803.2315 [cs.OH]
 370 -}
 371 type InclusionExclusion    = Double
 372 type SpecificityGenericity = Double
 373
 374 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 375 type SymetricMatrix    = Matrix
 376 type NonSymetricMatrix = Matrix
 377
 378
 379 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
 380 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 381   where
 382     run' fun mat = run $ fun $ map fromIntegral $ use mat
 383
 384     -- | Inclusion (i) = Gen(i)+Spec(i)
 385     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 386     inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
 387
 388     -- | Genericity score = Gen(i)- Spec(i)
 389     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 390     specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
 391
 392     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 393     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 394     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 395
 396     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 397     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 398     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 399
 400     cardN :: Exp Double
 401     cardN = constant (P.fromIntegral (dim m) :: Double)
 402
 403
 404 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 405 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 406 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
 407 p_ij m = zipWith (/) m (n_jj m)
 408   where
 409     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 410     n_jj myMat' = backpermute (shape m)
 411                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 412                                    -> (Z :. j :. j)
 413                                 )
 414                          ) myMat'
 415
 416 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 417 -- to test
 418 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 419 p_ji = transpose . p_ij
 420
 421
 422 -- | Step to ckeck the result in visual/qualitative tests
 423 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 424 incExcSpeGen_proba m = run' pro m
 425   where
 426     run' fun mat = run $ fun $ map fromIntegral $ use mat
 427
 428     pro mat = p_ji mat
 429
 430 {-
 431 -- | Hypothesis to test maybe later (or not)
 432 -- TODO ask accelerate for instances to ease such writtings:
 433 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 434 p_ m = zipWith (/) m (n_ m)
 435   where
 436     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 437     n_ m = backpermute (shape m)
 438                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 439                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 440                                 )
 441                          ) m
 442 -}
 443
 444 -- * For Tests (to be removed)
 445 -- | Test perfermance with this matrix
 446 -- TODO : add this in a benchmark folder
 447 distriTest :: Matrix Double
 448 distriTest = distributional $ matrix 100 [1..]
 449 -----------------------------------------------------------------------
 450