src/Gargantext/Viz/Graph/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Graph.Distances.Matrix
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13
  14 Implementation use Accelerate library which enables GPU and CPU computation:
  15
  16   * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
  17     [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
  18     In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
  19
  20   * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
  21     [Optimising Purely Functional GPU Programs][MCKL13].
  22     In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
  23
  24   * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
  25     [Embedding Foreign Code][CMCK14].
  26     In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
  27
  28   * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
  29     [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
  30     In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
  31
  32 -}
  33
  34 {-# LANGUAGE TypeFamilies        #-}
  35 {-# LANGUAGE TypeOperators       #-}
  36 {-# LANGUAGE ScopedTypeVariables #-}
  37 {-# LANGUAGE ViewPatterns        #-}
  38
  39 module Gargantext.Viz.Graph.Distances.Matrice
  40   where
  41
  42 import Debug.Trace (trace)
  43 import Data.Array.Accelerate
  44 import Data.Array.Accelerate.Interpreter (run)
  45 import qualified Gargantext.Prelude as P
  46
  47
  48 -----------------------------------------------------------------------
  49 -- | Define a vector
  50 --
  51 -- >>> vector 3
  52 -- Vector (Z :. 3) [0,1,2]
  53 vector :: Int -> (Array (Z :. Int) Int)
  54 vector n = fromList (Z :. n) [0..n]
  55
  56 -- | Define a matrix
  57 --
  58 -- >>> matrix 3 ([1..] :: [Double])
  59 -- Matrix (Z :. 3 :. 3)
  60 --   [ 1.0, 2.0, 3.0,
  61 --     4.0, 5.0, 6.0,
  62 --     7.0, 8.0, 9.0]
  63 matrix :: Elt c => Int -> [c] -> Matrix c
  64 matrix n l = fromList (Z :. n :. n) l
  65
  66 -- | Two ways to get the rank (as documentation)
  67 --
  68 -- >>> rank (matrix 3 ([1..] :: [Int]))
  69 -- 2
  70 rank :: (Matrix a) -> Int
  71 rank m = arrayRank $ arrayShape m
  72
  73 -----------------------------------------------------------------------
  74 -- | Dimension of a square Matrix
  75 -- How to force use with SquareMatrix ?
  76 type Dim = Int
  77
  78 -- | Get Dimension of a square Matrix
  79 --
  80 -- >>> dim (matrix 3 ([1..] :: [Int]))
  81 -- 3
  82 dim :: Matrix a -> Dim
  83 dim m = n
  84   where
  85     Z :. _ :. n = arrayShape m
  86     -- indexTail (arrayShape m)
  87
  88 -----------------------------------------------------------------------
  89 -- TODO move to Utils
  90 runExp :: Elt e => Exp e -> e
  91 runExp e = indexArray (run (unit e)) Z
  92 -----------------------------------------------------------------------
  93
  94 -- | Sum of a Matrix by Column
  95 --
  96 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
  97 -- Matrix (Z :. 3 :. 3)
  98 --   [ 12.0, 15.0, 18.0,
  99 --     12.0, 15.0, 18.0,
 100 --     12.0, 15.0, 18.0]
 101 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 102 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
 103
 104 matSumCol' :: Matrix Double -> Matrix Double
 105 matSumCol' m = run $ matSumCol n m'
 106   where
 107     n  = dim m
 108     m' = use m
 109
 110
 111 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
 112 -- if you need get the probability on the lines, just transpose it
 113 --
 114 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
 115 -- Matrix (Z :. 3 :. 3)
 116 --   [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
 117 --       0.3333333333333333,  0.3333333333333333,  0.3333333333333333,
 118 --       0.5833333333333334,  0.5333333333333333,                 0.5]
 119 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 120 matProba r mat = zipWith (/) mat (matSumCol r mat)
 121
 122 -- | Diagonal of the matrix
 123 --
 124 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
 125 -- Vector (Z :. 3) [1,5,9]
 126 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
 127 diag m = backpermute (indexTail (shape m))
 128                      (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
 129                      m
 130
 131 -- | Divide by the Diagonal of the matrix
 132 --
 133 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
 134 -- Matrix (Z :. 3 :. 3)
 135 --   [ 1.0, 0.4, 0.3333333333333333,
 136 --     4.0, 1.0, 0.6666666666666666,
 137 --     7.0, 1.6,                1.0]
 138 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 139 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
 140
 141 -----------------------------------------------------------------------
 142 -- | Filters the matrix with the minimum of maximums
 143 --
 144 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
 145 -- Matrix (Z :. 3 :. 3)
 146 --   [ 0.0, 4.0, 7.0,
 147 --     0.0, 5.0, 8.0,
 148 --     0.0, 6.0, 9.0]
 149 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
 150 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
 151   where
 152     miniMax' = (the $ minimum $ maximum m)
 153
 154 -- | Filters the matrix with a constant
 155 --
 156 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
 157 -- Matrix (Z :. 3 :. 3)
 158 --   [ 0.0, 0.0, 7.0,
 159 --     0.0, 0.0, 8.0,
 160 --     0.0, 6.0, 9.0]
 161 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
 162 filter' t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
 163
 164 -----------------------------------------------------------------------
 165 -- * Measures of proximity
 166 -----------------------------------------------------------------------
 167 -- ** Conditional distance
 168
 169 -- *** Conditional distance (basic)
 170
 171 -- | Conditional distance (basic version)
 172 --
 173 -- 2 main measures are actually implemented in order to compute the
 174 -- proximity of two terms: conditional and distributional
 175 --
 176 -- Conditional measure is an absolute measure which reflects
 177 -- interactions of 2 terms in the corpus.
 178 measureConditional :: Matrix Int -> Matrix Double
 179 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
 180 measureConditional m = run $ matProba (dim m)
 181                            $ map fromIntegral
 182                            $ use m
 183
 184
 185 -- *** Conditional distance (advanced)
 186
 187 -- | Conditional distance (advanced version)
 188 --
 189 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
 190 -- "confidence" , is the maximum probability between @i@ and @j@ to see
 191 -- @i@ in the same context of @j@ knowing @j@.
 192 --
 193 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
 194 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
 195 --
 196 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
 197 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 198 conditional' m = ( run $ ie $ map fromIntegral $ use m
 199                  , run $ sg $ map fromIntegral $ use m
 200                  )
 201   where
 202     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
 203     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
 204     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
 205     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
 206
 207     n :: Exp Double
 208     n = P.fromIntegral r
 209
 210     r :: Dim
 211     r = dim m
 212
 213     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
 214     xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
 215     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
 216     ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
 217
 218 -----------------------------------------------------------------------
 219 -- ** Distributional Distance
 220
 221 -- | Distributional Distance Measure
 222 --
 223 -- Distributional measure is a relative measure which depends on the
 224 -- selected list, it represents structural equivalence of mutual information.
 225 --
 226 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
 227 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 228 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
 229 --
 230 -- Mutual information
 231 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 232 --
 233 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 234 --                        \[C{ij}\]
 235 --
 236 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 237 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 238 --
 239 -- Total cooccurrences of term @i@ given a map list of size @m@
 240 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 241 --
 242 -- Total cooccurrences of terms given a map list of size @m@
 243 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 244 --
 245 distributional :: Matrix Int -> Matrix Double
 246 distributional m = run -- $ matMiniMax
 247                        -- $ ri
 248                        -- $ myMin
 249                        $ filter' 0
 250                        $ s_mi
 251 --                       $ diag2null
 252                        $ map fromIntegral  -- ^ from Int to Double
 253                        $ use m             -- ^ push matrix in Accelerate type
 254   where
 255     -- filter  m = zipWith (\a b -> max a b) m (transpose m)
 256
 257     ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 258     ri mat = mat1 -- zipWith (/) mat1 mat2
 259       where
 260         mat1 = matSumCol n $ zipWith min'  (myMin mat) (myMin $ transpose mat)
 261         mat2 = total mat
 262
 263     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 264     s_mi m' = zipWith (\a b -> log (a/b))  m'
 265             $ zipWith (/) (crossProduct n m') (total m')
 266
 267     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 268     total = replicate (constant (Z :. n :. n)) . sum . sum
 269
 270     min' x y
 271       | runExp (x > y && x /= 0) = x
 272       | P.otherwise              = y
 273
 274     myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
 275     myMin = replicate (constant (Z :. n :. All)) . minimum
 276
 277     n :: Dim
 278     n = dim m
 279
 280 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
 281 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
 282 identityMatrix n =
 283         let zeros = fill (index2 n n) 0
 284             ones  = fill (index1 n)   1
 285         in
 286         permute const zeros (\(unindex1 -> i) -> index2 i i) ones
 287
 288
 289 eyeMatrix :: Num a => (Matrix a) -> Acc (Matrix a)
 290 eyeMatrix m =
 291         let zeros = fill (index2 n n) 1
 292             ones  = fill (index1 n)   0
 293             n     = constant $ dim m
 294         in
 295         permute const zeros (\(unindex1 -> i) -> index2 i i) ones
 296
 297 diag2null :: Num a => (Matrix a) -> Acc (Matrix a)
 298 diag2null m' = zipWith (*) m eye
 299   where
 300     m   = use m'
 301     eye = eyeMatrix m'
 302
 303
 304
 305 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 306 crossProduct n m = trace (P.show (run m',run m'')) $ zipWith (*) m' m''
 307   where
 308     m'  = cross n m
 309     m'' = cross n (transpose m)
 310
 311 crossT :: Matrix Double -> Matrix Double
 312 crossT  = run . transpose . use
 313
 314 crossProduct' :: Matrix Double -> Matrix Double
 315 crossProduct' m = run $ crossProduct n m'
 316   where
 317     n  = dim m
 318     m' = use m
 319
 320 runWith :: (Arrays c, Elt a1)
 321         => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
 322         -> Matrix a1
 323         -> a2
 324         -> c
 325 runWith f m = run . f (dim m) (use m)
 326
 327 -- | cross
 328 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 329 cross n mat = zipWith (-) (matSumCol n mat) (mat)
 330
 331 cross' :: Matrix Double -> Matrix Double
 332 cross' mat = run $ cross n mat'
 333   where
 334     mat' = use mat
 335     n = dim mat
 336
 337
 338 -----------------------------------------------------------------------
 339 -----------------------------------------------------------------------
 340 -- * Specificity and Genericity
 341
 342 {- | Metric Specificity and genericity: select terms
 343
 344 - let N termes and occurrences of i \[N{i}\]
 345
 346 - Cooccurrences of i and j \[N{ij}\]
 347 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
 348
 349 - Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
 350 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
 351
 352 - \[Inclusion (i) = Gen(i) + Spec(i)\)
 353 - \[GenericityScore = Gen(i)- Spec(i)\]
 354
 355 - References: Science mapping with asymmetrical paradigmatic proximity
 356 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
 357 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
 358 arXiv:0803.2315 [cs.OH]
 359 -}
 360 type InclusionExclusion    = Double
 361 type SpecificityGenericity = Double
 362
 363 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 364 type SymetricMatrix    = Matrix
 365 type NonSymetricMatrix = Matrix
 366
 367
 368 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
 369 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 370   where
 371     run' fun mat = run $ fun $ map fromIntegral $ use mat
 372
 373     -- | Inclusion (i) = Gen(i)+Spec(i)
 374     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 375     inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
 376
 377     -- | Genericity score = Gen(i)- Spec(i)
 378     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 379     specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
 380
 381     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 382     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 383     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 384
 385     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 386     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 387     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 388
 389     cardN :: Exp Double
 390     cardN = constant (P.fromIntegral (dim m) :: Double)
 391
 392
 393 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 394 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 395 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
 396 p_ij m = zipWith (/) m (n_jj m)
 397   where
 398     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 399     n_jj myMat' = backpermute (shape m)
 400                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 401                                    -> (Z :. j :. j)
 402                                 )
 403                          ) myMat'
 404
 405 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 406 -- to test
 407 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 408 p_ji = transpose . p_ij
 409
 410
 411 -- | Step to ckeck the result in visual/qualitative tests
 412 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 413 incExcSpeGen_proba m = run' pro m
 414   where
 415     run' fun mat = run $ fun $ map fromIntegral $ use mat
 416
 417     pro mat = p_ji mat
 418
 419 {-
 420 -- | Hypothesis to test maybe later (or not)
 421 -- TODO ask accelerate for instances to ease such writtings:
 422 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 423 p_ m = zipWith (/) m (n_ m)
 424   where
 425     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 426     n_ m = backpermute (shape m)
 427                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 428                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 429                                 )
 430                          ) m
 431 -}
 432
 433 -- * For Tests (to be removed)
 434 -- | Test perfermance with this matrix
 435 -- TODO : add this in a benchmark folder
 436 distriTest :: Matrix Double
 437 distriTest = distributional $ matrix 100 [1..]
 438 -----------------------------------------------------------------------
 439