src/Gargantext/Viz/Graph/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Graph.Distances.Matrix
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13 Implementation use Accelerate library which enables GPU and CPU computation:
  14
  15   * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
  16     [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
  17     In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
  18
  19   * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
  20     [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
  21     In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
  22
  23 -}
  24
  25 {-# LANGUAGE TypeFamilies        #-}
  26 {-# LANGUAGE TypeOperators       #-}
  27 {-# LANGUAGE ScopedTypeVariables #-}
  28 {-# LANGUAGE ViewPatterns        #-}
  29
  30 module Gargantext.Viz.Graph.Distances.Matrice
  31   where
  32
  33 import Debug.Trace (trace)
  34 import Data.Array.Accelerate
  35 import Data.Array.Accelerate.Interpreter (run)
  36 import qualified Gargantext.Prelude as P
  37
  38
  39 -----------------------------------------------------------------------
  40 -- | Define a vector
  41 --
  42 -- >>> vector 3
  43 -- Vector (Z :. 3) [0,1,2]
  44 vector :: Elt c => Int -> [c] -> (Array (Z :. Int) c)
  45 vector n l = fromList (Z :. n) l
  46
  47 -- | Define a matrix
  48 --
  49 -- >>> matrix 3 ([1..] :: [Double])
  50 -- Matrix (Z :. 3 :. 3)
  51 --   [ 1.0, 2.0, 3.0,
  52 --     4.0, 5.0, 6.0,
  53 --     7.0, 8.0, 9.0]
  54 matrix :: Elt c => Int -> [c] -> Matrix c
  55 matrix n l = fromList (Z :. n :. n) l
  56
  57 -- | Two ways to get the rank (as documentation)
  58 --
  59 -- >>> rank (matrix 3 ([1..] :: [Int]))
  60 -- 2
  61 rank :: (Matrix a) -> Int
  62 rank m = arrayRank $ arrayShape m
  63
  64 -----------------------------------------------------------------------
  65 -- | Dimension of a square Matrix
  66 -- How to force use with SquareMatrix ?
  67 type Dim = Int
  68
  69 -- | Get Dimension of a square Matrix
  70 --
  71 -- >>> dim (matrix 3 ([1..] :: [Int]))
  72 -- 3
  73 dim :: Matrix a -> Dim
  74 dim m = n
  75   where
  76     Z :. _ :. n = arrayShape m
  77     -- indexTail (arrayShape m)
  78
  79 -----------------------------------------------------------------------
  80 -- TODO move to Utils
  81 runExp :: Elt e => Exp e -> e
  82 runExp e = indexArray (run (unit e)) Z
  83 -----------------------------------------------------------------------
  84
  85 -- | Sum of a Matrix by Column
  86 --
  87 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
  88 -- Matrix (Z :. 3 :. 3)
  89 --   [ 12.0, 15.0, 18.0,
  90 --     12.0, 15.0, 18.0,
  91 --     12.0, 15.0, 18.0]
  92 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  93 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
  94
  95 matSumCol' :: Matrix Double -> Matrix Double
  96 matSumCol' m = run $ matSumCol n m'
  97   where
  98     n  = dim m
  99     m' = use m
 100
 101
 102 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
 103 -- if you need get the probability on the lines, just transpose it
 104 --
 105 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
 106 -- Matrix (Z :. 3 :. 3)
 107 --   [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
 108 --       0.3333333333333333,  0.3333333333333333,  0.3333333333333333,
 109 --       0.5833333333333334,  0.5333333333333333,                 0.5]
 110 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 111 matProba r mat = zipWith (/) mat (matSumCol r mat)
 112
 113 -- | Diagonal of the matrix
 114 --
 115 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
 116 -- Vector (Z :. 3) [1,5,9]
 117 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
 118 diag m = backpermute (indexTail (shape m))
 119                      (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int))))
 120                      m
 121
 122 -- | Divide by the Diagonal of the matrix
 123 --
 124 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
 125 -- Matrix (Z :. 3 :. 3)
 126 --   [ 1.0, 0.4, 0.3333333333333333,
 127 --     4.0, 1.0, 0.6666666666666666,
 128 --     7.0, 1.6,                1.0]
 129 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 130 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
 131
 132 -----------------------------------------------------------------------
 133 -- | Filters the matrix with the minimum of maximums
 134 --
 135 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
 136 -- Matrix (Z :. 3 :. 3)
 137 --   [ 0.0, 4.0, 7.0,
 138 --     0.0, 5.0, 8.0,
 139 --     0.0, 6.0, 9.0]
 140 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
 141 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
 142   where
 143     miniMax' = (the $ minimum $ maximum m)
 144
 145
 146
 147 -- | Filters the matrix with a constant
 148 --
 149 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
 150 -- Matrix (Z :. 3 :. 3)
 151 --   [ 0.0, 0.0, 7.0,
 152 --     0.0, 0.0, 8.0,
 153 --     0.0, 6.0, 9.0]
 154 filter' :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
 155 filter' t m = filterWith t 0 m
 156
 157 filterWith :: Double -> Double -> Acc (Matrix Double) -> Acc (Matrix Double)
 158 filterWith t v m = map (\x -> ifThenElse (x > (constant t)) x (constant v)) (transpose m)
 159
 160
 161
 162 -----------------------------------------------------------------------
 163 -- * Measures of proximity
 164 -----------------------------------------------------------------------
 165 -- ** Conditional distance
 166
 167 -- *** Conditional distance (basic)
 168
 169 -- | Conditional distance (basic version)
 170 --
 171 -- 2 main measures are actually implemented in order to compute the
 172 -- proximity of two terms: conditional and distributional
 173 --
 174 -- Conditional measure is an absolute measure which reflects
 175 -- interactions of 2 terms in the corpus.
 176 measureConditional :: Matrix Int -> Matrix Double
 177 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
 178 measureConditional m = run $ matProba (dim m)
 179                            $ map fromIntegral
 180                            $ use m
 181
 182
 183 -- *** Conditional distance (advanced)
 184
 185 -- | Conditional distance (advanced version)
 186 --
 187 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
 188 -- "confidence" , is the maximum probability between @i@ and @j@ to see
 189 -- @i@ in the same context of @j@ knowing @j@.
 190 --
 191 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
 192 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
 193 --
 194 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
 195 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 196 conditional' m = ( run $ ie $ map fromIntegral $ use m
 197                  , run $ sg $ map fromIntegral $ use m
 198                  )
 199   where
 200     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
 201     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
 202     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
 203     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
 204
 205     n :: Exp Double
 206     n = P.fromIntegral r
 207
 208     r :: Dim
 209     r = dim m
 210
 211     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
 212     xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
 213     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
 214     ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
 215
 216 -----------------------------------------------------------------------
 217 -- ** Distributional Distance
 218
 219 -- | Distributional Distance Measure
 220 --
 221 -- Distributional measure is a relative measure which depends on the
 222 -- selected list, it represents structural equivalence of mutual information.
 223 --
 224 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
 225 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 226 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
 227 --
 228 -- Mutual information
 229 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 230 --
 231 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 232 --                        \[C{ij}\]
 233 --
 234 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 235 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 236 --
 237 -- Total cooccurrences of term @i@ given a map list of size @m@
 238 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 239 --
 240 -- Total cooccurrences of terms given a map list of size @m@
 241 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 242 --
 243 distributional :: Matrix Int -> Matrix Double
 244 distributional m = run -- $ matMiniMax
 245                       --  $ diagNull n
 246                        $ ri
 247                        $ filterWith 0 100
 248                        $ filter' 0
 249                        $ s_mi
 250                        $ map fromIntegral
 251                           {- from Int to Double -}
 252                        $ use m
 253                           {- push matrix in Accelerate type -}
 254   where
 255
 256     ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 257     ri mat = mat1 -- zipWith (/) mat1 mat2
 258       where
 259         mat1 = matSumCol n $ zipWith min (myMin mat) (myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
 260         mat2 = total mat
 261
 262     myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
 263     myMin = replicate (constant (Z :. n :. All)) . minimum
 264
 265
 266     -- TODO fix NaN
 267     -- Quali TEST: OK
 268     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 269     s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
 270             $ zipWith (/) (crossProduct n m') (total m')
 271             -- crossProduct n m'
 272
 273
 274     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 275     total = replicate (constant (Z :. n :. n)) . sum . sum
 276
 277     n :: Dim
 278     n = dim m
 279
 280 -- run $ (identityMatrix (DAA.constant (10::Int)) :: DAA.Acc (DAA.Matrix Int)) Matrix (Z :. 10 :. 10)
 281 identityMatrix :: Num a => Exp Int -> Acc (Matrix a)
 282 identityMatrix n =
 283         let zeros = fill (index2 n n) 0
 284             ones  = fill (index1 n)   1
 285         in
 286         permute const zeros (\(unindex1 -> i) -> index2 i i) ones
 287
 288
 289 eyeMatrix :: Num a => Dim -> Acc (Matrix a)
 290 eyeMatrix n' =
 291         let ones   = fill (index2 n n) 1
 292             zeros  = fill (index1 n)   0
 293             n = constant n'
 294         in
 295         permute const ones (\(unindex1 -> i) -> index2 i i) zeros
 296
 297 -- | TODO use Lenses
 298 data Direction = MatCol (Exp Int) | MatRow (Exp Int) | Diag
 299
 300 nullOf :: Num a => Dim -> Direction -> Acc (Matrix a)
 301 nullOf n' dir =
 302         let ones   = fill (index2 n n) 1
 303             zeros  = fill (index2 n n) 0
 304             n = constant n'
 305         in
 306         permute const ones -- (\(unindex2 -> i) -> let Exp (x,y) = i in index2 x y)
 307
 308                            ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
 309                                                 -> case dir of
 310                                                      MatCol m -> (Z :. i :. m)
 311                                                      MatRow m -> (Z :. m :. i)
 312                                                      Diag     -> (Z :. i :. i)
 313
 314                                     ))
 315                            zeros
 316
 317 nullOfWithDiag :: Num a => Dim -> Direction -> Acc (Matrix a)
 318 nullOfWithDiag n dir = zipWith (*) (nullOf n dir) (nullOf n Diag)
 319
 320
 321
 322 {- | WIP fun with indexes
 323 selfMatrix :: Num a => Dim -> Acc (Matrix a)
 324 selfMatrix n' =
 325         let zeros = fill (index2 n n) 0
 326             ones  = fill (index2 n n) 1
 327             n = constant n'
 328         in
 329         permute const ones ( lift1 ( \(Z :. (i :: Exp Int) :. (_j:: Exp Int))
 330                                                 -> -- ifThenElse (i /= j)
 331                                                      --         (Z :. i :. j)
 332                                                               (Z :. i :. i)
 333                                     )) zeros
 334
 335 selfMatrix' :: (Elt a, P.Num (Exp a)) => Array DIM2 a -> Matrix a
 336 selfMatrix' m' = run $ selfMatrix n
 337   where
 338     n = dim m'
 339     m = use m'
 340 -}
 341 -------------------------------------------------
 342 diagNull :: Num a => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 343 diagNull n m = zipWith (*) m eye
 344   where
 345     eye = eyeMatrix n
 346
 347 -------------------------------------------------
 348 crossProduct :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 349 crossProduct n m = {-trace (P.show (run m',run m'')) $-} zipWith (*) m' m''
 350   where
 351     m'  = cross n m
 352     m'' = transpose $ cross n m
 353
 354
 355 crossT :: Matrix Double -> Matrix Double
 356 crossT  = run . transpose . use
 357
 358 crossProduct' :: Matrix Double -> Matrix Double
 359 crossProduct' m = run $ crossProduct n m'
 360   where
 361     n  = dim m
 362     m' = use m
 363
 364 runWith :: (Arrays c, Elt a1)
 365         => (Dim -> Acc (Matrix a1) -> a2 -> Acc c)
 366         -> Matrix a1
 367         -> a2
 368         -> c
 369 runWith f m = run . f (dim m) (use m)
 370
 371 -- | cross
 372 cross :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 373 cross n mat = diagNull n (matSumCol n $ diagNull n mat)
 374
 375 cross' :: Matrix Double -> Matrix Double
 376 cross' mat = run $ cross n mat'
 377   where
 378     mat' = use mat
 379     n = dim mat
 380
 381
 382 -----------------------------------------------------------------------
 383 -----------------------------------------------------------------------
 384 -- * Specificity and Genericity
 385
 386 {- | Metric Specificity and genericity: select terms
 387
 388 - let N termes and occurrences of i \[N{i}\]
 389
 390 - Cooccurrences of i and j \[N{ij}\]
 391 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
 392
 393 - Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
 394 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
 395
 396 - \[Inclusion (i) = Gen(i) + Spec(i)\)
 397 - \[GenericityScore = Gen(i)- Spec(i)\]
 398
 399 - References: Science mapping with asymmetrical paradigmatic proximity
 400 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
 401 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
 402 arXiv:0803.2315 [cs.OH]
 403 -}
 404 type InclusionExclusion    = Double
 405 type SpecificityGenericity = Double
 406
 407 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 408 type SymetricMatrix    = Matrix
 409 type NonSymetricMatrix = Matrix
 410
 411
 412 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
 413 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 414   where
 415     run' fun mat = run $ fun $ map fromIntegral $ use mat
 416
 417     -- | Inclusion (i) = Gen(i)+Spec(i)
 418     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 419     inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
 420
 421     -- | Genericity score = Gen(i)- Spec(i)
 422     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 423     specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
 424
 425     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 426     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 427     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 428
 429     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 430     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 431     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 432
 433     cardN :: Exp Double
 434     cardN = constant (P.fromIntegral (dim m) :: Double)
 435
 436
 437 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 438 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 439 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
 440 p_ij m = zipWith (/) m (n_jj m)
 441   where
 442     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 443     n_jj myMat' = backpermute (shape m)
 444                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 445                                    -> (Z :. j :. j)
 446                                 )
 447                          ) myMat'
 448
 449 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 450 -- to test
 451 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 452 p_ji = transpose . p_ij
 453
 454
 455 -- | Step to ckeck the result in visual/qualitative tests
 456 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 457 incExcSpeGen_proba m = run' pro m
 458   where
 459     run' fun mat = run $ fun $ map fromIntegral $ use mat
 460
 461     pro mat = p_ji mat
 462
 463 {-
 464 -- | Hypothesis to test maybe later (or not)
 465 -- TODO ask accelerate for instances to ease such writtings:
 466 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 467 p_ m = zipWith (/) m (n_ m)
 468   where
 469     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 470     n_ m = backpermute (shape m)
 471                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 472                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 473                                 )
 474                          ) m
 475 -}
 476
 477 -- * For Tests (to be removed)
 478 -- | Test perfermance with this matrix
 479 -- TODO : add this in a benchmark folder
 480 distriTest :: Int -> Matrix Double
 481 distriTest n = distributional (matrix n theMatrix)
 482   where
 483     theMatrix | (P.==) n 2 = [ 1, 1
 484                              , 1, 2
 485                              ]
 486
 487               | (P.==) n 3 =  [ 1, 1, 2
 488                               , 1, 2, 3
 489                               , 2, 3, 4
 490                               ]
 491               | (P.==) n 4 =  [ 1, 1, 2, 3
 492                               , 1, 2, 3, 4
 493                               , 2, 3, 4, 5
 494                               , 3, 4, 5, 6
 495                               ]
 496               | P.otherwise = P.undefined
 497
 498     theResult | (P.==) n 2 = let r = 1.6094379124341003 in [ 0, r, r, 0]
 499               | P.otherwise = [ 1, 1 ]
 500
 501
 502
 503
 504 -----------------------------------------------------------------------
 505