src/Gargantext/Viz/Graph/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Graph.Distances.Matrix
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13
  14 Implementation use Accelerate library which enables GPU and CPU computation:
  15
  16   * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
  17     [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
  18     In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
  19
  20   * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
  21     [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
  22     In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
  23
  24 -}
  25
  26 {-# LANGUAGE TypeFamilies        #-}
  27 {-# LANGUAGE TypeOperators       #-}
  28 {-# LANGUAGE ScopedTypeVariables #-}
  29 {-# LANGUAGE ViewPatterns        #-}
  30
  31 module Gargantext.Viz.Graph.Distances.Matrice
  32   where
  33
  34 import Debug.Trace (trace)
  35 import Data.Array.Accelerate
  36 import Data.Array.Accelerate.Interpreter (run)
  37 import qualified Gargantext.Prelude as P
  38
  39
  40 -----------------------------------------------------------------------
  41 -- | Define a vector
  42 --
  43 -- >>> vector 3
  44 -- Vector (Z :. 3) [0,1,2]
  45 vector :: Elt c => Int -> [c] -> (Array (Z :. Int) c)
  46 vector n l = fromList (Z :. n) l
  47
  48 -- | Define a matrix
  49 --
  50 -- >>> matrix 3 ([1..] :: [Double])
  51 -- Matrix (Z :. 3 :. 3)
  52 --   [ 1.0, 2.0, 3.0,
  53 --     4.0, 5.0, 6.0,
  54 --     7.0, 8.0, 9.0]
  55 matrix :: Elt c => Int -> [c] -> Matrix c
  56 matrix n l = fromList (Z :. n :. n) l
  57
  58 -- | Two ways to get the rank (as documentation)
  59 --
  60 -- >>> rank (matrix 3 ([1..] :: [Int]))
  61 -- 2
  62 rank :: (Matrix a) -> Int
  63 rank m = arrayRank $ arrayShape m
  64
  65 -----------------------------------------------------------------------
  66 -- | Dimension of a square Matrix
  67 -- How to force use with SquareMatrix ?
  68 type Dim = Int
  69
  70 -- | Get Dimension of a square Matrix
  71 --
  72 -- >>> dim (matrix 3 ([1..] :: [Int]))
  73 -- 3
  74 dim :: Matrix a -> Dim
  75 dim m = n
  76   where
  77     Z :. _ :. n = arrayShape m
  78     -- indexTail (arrayShape m)
  79
  80 -----------------------------------------------------------------------
  81 -- TODO move to Utils
  82 runExp :: Elt e => Exp e -> e
  83 runExp e = indexArray (run (unit e)) Z
  84 -----------------------------------------------------------------------
  85
  86 -- | Sum of a Matrix by Column
  87 --
  88 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
  89 -- Matrix (Z :. 3 :. 3)
  90 --   [ 12.0, 15.0, 18.0,
  91 --     12.0, 15.0, 18.0,
  92 --     12.0, 15.0, 18.0]
  93 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  94 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
  95
  96 matSumCol' :: Matrix Double -> Matrix Double
  97 matSumCol' m = run $ matSumCol n m'
  98   where
  99     n  = dim m
 100     m' = use m
 101
 102
 103 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
 104 -- if you need get the probability on the lines, just transpose it
 105 --
 106 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
 107 -- Matrix (Z :. 3 :. 3)
 108 --   [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
 109 --       0.3333333333333333,  0.3333333333333333,  0.3333333333333333,
 110 --       0.5833333333333334,  0.5333333333333333,                 0.5]
 111 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 112 matProba r mat = zipWith (/) mat (matSumCol r mat)
 113
 114 -- | Diagonal of the matrix
 115 --
 116 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
 117 -- Vector (Z :. 3) [1,5,9]
 118 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
 119 diag m = backpermute (indexTail (shape m))
 120                      (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
 121                      m
 122
 123 -- | Divide by the Diagonal of the matrix
 124 --
 125 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
 126 -- Matrix (Z :. 3 :. 3)
 127 --   [ 1.0, 0.4, 0.3333333333333333,
 128 --     4.0, 1.0, 0.6666666666666666,
 129 --     7.0, 1.6,                1.0]
 130 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 131 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
 132
 133 -----------------------------------------------------------------------
 134 -- | Filters the matrix with the minimum of maximums
 135 --
 136 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
 137 -- Matrix (Z :. 3 :. 3)
 138 --   [ 0.0, 4.0, 7.0,
 139 --     0.0, 5.0, 8.0,
 140 --     0.0, 6.0, 9.0]
 141 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
 142 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
 143   where
 144     miniMax' = (the $ minimum $ maximum m)
 145
 146 -- | Filters the matrix with a constant
 147 --
 148 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
 149 -- Matrix (Z :. 3 :. 3)
 150 --   [ 0.0, 0.0, 7.0,
 151 --     0.0, 0.0, 8.0,
 152 --     0.0, 6.0, 9.0]
 153 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
 154 filter' t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
 155
 156 -----------------------------------------------------------------------
 157 -- * Measures of proximity
 158 -----------------------------------------------------------------------
 159 -- ** Conditional distance
 160
 161 -- *** Conditional distance (basic)
 162
 163 -- | Conditional distance (basic version)
 164 --
 165 -- 2 main measures are actually implemented in order to compute the
 166 -- proximity of two terms: conditional and distributional
 167 --
 168 -- Conditional measure is an absolute measure which reflects
 169 -- interactions of 2 terms in the corpus.
 170 measureConditional :: Matrix Int -> Matrix Double
 171 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
 172 measureConditional m = run $ matProba (dim m)
 173                            $ map fromIntegral
 174                            $ use m
 175
 176
 177 -- *** Conditional distance (advanced)
 178
 179 -- | Conditional distance (advanced version)
 180 --
 181 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
 182 -- "confidence" , is the maximum probability between @i@ and @j@ to see
 183 -- @i@ in the same context of @j@ knowing @j@.
 184 --
 185 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
 186 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
 187 --
 188 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
 189 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 190 conditional' m = ( run $ ie $ map fromIntegral $ use m
 191                  , run $ sg $ map fromIntegral $ use m
 192                  )
 193   where
 194     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
 195     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
 196     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
 197     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
 198
 199     n :: Exp Double
 200     n = P.fromIntegral r
 201
 202     r :: Dim
 203     r = dim m
 204
 205     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
 206     xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
 207     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
 208     ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
 209
 210 -----------------------------------------------------------------------
 211 -- ** Distributional Distance
 212
 213 -- | Distributional Distance Measure
 214 --
 215 -- Distributional measure is a relative measure which depends on the
 216 -- selected list, it represents structural equivalence of mutual information.
 217 --
 218 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
 219 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 220 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
 221 --
 222 -- Mutual information
 223 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 224 --
 225 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 226 --                        \[C{ij}\]
 227 --
 228 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 229 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 230 --
 231 -- Total cooccurrences of term @i@ given a map list of size @m@
 232 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 233 --
 234 -- Total cooccurrences of terms given a map list of size @m@
 235 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 236 --
 237 distributional :: Matrix Int -> Matrix Double
 238 distributional m = run -- $ matMiniMax
 239                        -- $ ri
 240                        -- $ myMin
 241                        $ filter' 0
 242                        $ s_mi
 243                        $ diag2null n
 244                        $ map fromIntegral  -- ^ from Int to Double
 245                        $ use m             -- ^ push matrix in Accelerate type
 246   where
 247     -- filter  m = zipWith (\a b -> max a b) m (transpose m)
 248
 249     ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 250     ri mat = mat1 -- zipWith (/) mat1 mat2
 251       where
 252         mat1 = matSumCol n $ zipWith min'  (myMin mat) (myMin $ transpose mat)
 253         mat2 = total mat
 254
 255     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 256     s_mi m' = zipWith (\a b -> log (a/b))  m'
 257             $ zipWith (/) (crossProduct n m') (total m')
 258
 259     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 260     total = replicate (constant (Z :. n :. n)) . sum . sum
 261
 262     min' x y
 263       | runExp (x > y && x /= 0) = x
 264       | P.otherwise              = y
 265
 266     myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
 267     myMin = replicate (constant (Z :. n :. All)) . minimum
 268
 269     n :: Dim
 270     n = dim m
 271
 272 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
 273 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
 274 identityMatrix n =
 275         let zeros = fill (index2 n n) 0
 276             ones  = fill (index1 n)   1
 277         in
 278         permute const zeros (\(unindex1 -> i) -> index2 i i) ones
 279
 280
 281 eyeMatrix :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 282 eyeMatrix n' m =
 283         let zeros = fill (index2 n n) 1
 284             ones  = fill (index1 n)   0
 285             n = constant n'
 286         in
 287         permute const zeros (\(unindex1 -> i) -> index2 i i) ones
 288
 289
 290 diag2null :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 291 diag2null n m = zipWith (*) m eye
 292   where
 293     eye = eyeMatrix n m
 294
 295
 296 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 297 crossProduct n m = trace (P.show (run m',run m'')) $ zipWith (*) m' m''
 298   where
 299     m'  = cross n m
 300     m'' = cross n (transpose m)
 301
 302 crossT :: Matrix Double -> Matrix Double
 303 crossT  = run . transpose . use
 304
 305 crossProduct' :: Matrix Double -> Matrix Double
 306 crossProduct' m = run $ crossProduct n m'
 307   where
 308     n  = dim m
 309     m' = use m
 310
 311 runWith :: (Arrays c, Elt a1)
 312         => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
 313         -> Matrix a1
 314         -> a2
 315         -> c
 316 runWith f m = run . f (dim m) (use m)
 317
 318 -- | cross
 319 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 320 cross n mat = zipWith (-) (matSumCol n mat) (mat)
 321
 322 cross' :: Matrix Double -> Matrix Double
 323 cross' mat = run $ cross n mat'
 324   where
 325     mat' = use mat
 326     n = dim mat
 327
 328
 329 -----------------------------------------------------------------------
 330 -----------------------------------------------------------------------
 331 -- * Specificity and Genericity
 332
 333 {- | Metric Specificity and genericity: select terms
 334
 335 - let N termes and occurrences of i \[N{i}\]
 336
 337 - Cooccurrences of i and j \[N{ij}\]
 338 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
 339
 340 - Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
 341 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
 342
 343 - \[Inclusion (i) = Gen(i) + Spec(i)\)
 344 - \[GenericityScore = Gen(i)- Spec(i)\]
 345
 346 - References: Science mapping with asymmetrical paradigmatic proximity
 347 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
 348 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
 349 arXiv:0803.2315 [cs.OH]
 350 -}
 351 type InclusionExclusion    = Double
 352 type SpecificityGenericity = Double
 353
 354 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 355 type SymetricMatrix    = Matrix
 356 type NonSymetricMatrix = Matrix
 357
 358
 359 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
 360 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 361   where
 362     run' fun mat = run $ fun $ map fromIntegral $ use mat
 363
 364     -- | Inclusion (i) = Gen(i)+Spec(i)
 365     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 366     inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
 367
 368     -- | Genericity score = Gen(i)- Spec(i)
 369     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 370     specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
 371
 372     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 373     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 374     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 375
 376     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 377     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 378     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 379
 380     cardN :: Exp Double
 381     cardN = constant (P.fromIntegral (dim m) :: Double)
 382
 383
 384 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 385 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 386 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
 387 p_ij m = zipWith (/) m (n_jj m)
 388   where
 389     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 390     n_jj myMat' = backpermute (shape m)
 391                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 392                                    -> (Z :. j :. j)
 393                                 )
 394                          ) myMat'
 395
 396 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 397 -- to test
 398 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 399 p_ji = transpose . p_ij
 400
 401
 402 -- | Step to ckeck the result in visual/qualitative tests
 403 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 404 incExcSpeGen_proba m = run' pro m
 405   where
 406     run' fun mat = run $ fun $ map fromIntegral $ use mat
 407
 408     pro mat = p_ji mat
 409
 410 {-
 411 -- | Hypothesis to test maybe later (or not)
 412 -- TODO ask accelerate for instances to ease such writtings:
 413 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 414 p_ m = zipWith (/) m (n_ m)
 415   where
 416     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 417     n_ m = backpermute (shape m)
 418                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 419                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 420                                 )
 421                          ) m
 422 -}
 423
 424 -- * For Tests (to be removed)
 425 -- | Test perfermance with this matrix
 426 -- TODO : add this in a benchmark folder
 427 distriTest :: Matrix Double
 428 distriTest = distributional $ matrix 100 [1..]
 429 -----------------------------------------------------------------------
 430