src/Gargantext/Viz/Graph/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Graph.Distances.Matrix
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13
  14 Implementation use Accelerate library which enables GPU and CPU computation:
  15
  16   * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
  17     [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
  18     In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
  19
  20   * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
  21     [Optimising Purely Functional GPU Programs][MCKL13].
  22     In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
  23
  24   * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
  25     [Embedding Foreign Code][CMCK14].
  26     In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
  27
  28   * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
  29     [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
  30     In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
  31
  32 -}
  33
  34 {-# LANGUAGE TypeFamilies        #-}
  35 {-# LANGUAGE TypeOperators       #-}
  36 {-# LANGUAGE ScopedTypeVariables #-}
  37
  38 module Gargantext.Viz.Graph.Distances.Matrice
  39   where
  40
  41 import Data.Array.Accelerate
  42 import Data.Array.Accelerate.Interpreter (run)
  43
  44 import qualified Gargantext.Prelude as P
  45
  46
  47 -----------------------------------------------------------------------
  48 -- | Define a vector
  49 --
  50 -- >>> vector 3
  51 -- Vector (Z :. 3) [0,1,2]
  52 vector :: Int -> (Array (Z :. Int) Int)
  53 vector n = fromList (Z :. n) [0..n]
  54
  55 -- | Define a matrix
  56 --
  57 -- >>> matrix 3 ([1..] :: [Double])
  58 -- Matrix (Z :. 3 :. 3)
  59 --   [ 1.0, 2.0, 3.0,
  60 --     4.0, 5.0, 6.0,
  61 --     7.0, 8.0, 9.0]
  62 matrix :: Elt c => Int -> [c] -> Matrix c
  63 matrix n l = fromList (Z :. n :. n) l
  64
  65 -- | Two ways to get the rank (as documentation)
  66 --
  67 -- >>> rank (matrix 3 ([1..] :: [Int]))
  68 -- 2
  69 rank :: (Matrix a) -> Int
  70 rank m = arrayRank $ arrayShape m
  71
  72 -----------------------------------------------------------------------
  73 -- | Dimension of a square Matrix
  74 -- How to force use with SquareMatrix ?
  75 type Dim = Int
  76
  77 -- | Get Dimension of a square Matrix
  78 --
  79 -- >>> dim (matrix 3 ([1..] :: [Int]))
  80 -- 3
  81 dim :: Matrix a -> Dim
  82 dim m = n
  83   where
  84     Z :. _ :. n = arrayShape m
  85     -- indexTail (arrayShape m)
  86
  87 -----------------------------------------------------------------------
  88
  89 -- | Sum of a Matrix by Column
  90 --
  91 -- >>> run $ matSumCol 3 (use $ matrix 3 [1..])
  92 -- Matrix (Z :. 3 :. 3)
  93 --   [ 12.0, 15.0, 18.0,
  94 --     12.0, 15.0, 18.0,
  95 --     12.0, 15.0, 18.0]
  96 matSumCol :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  97 matSumCol r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
  98
  99 matSumCol' :: Matrix Double -> Matrix Double
 100 matSumCol' m = run $ matSumCol n m'
 101   where
 102     n  = dim m
 103     m' = use m
 104
 105
 106 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
 107 -- if you need get the probability on the lines, just transpose it
 108 --
 109 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
 110 -- Matrix (Z :. 3 :. 3)
 111 --   [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
 112 --       0.3333333333333333,  0.3333333333333333,  0.3333333333333333,
 113 --       0.5833333333333334,  0.5333333333333333,                 0.5]
 114 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 115 matProba r mat = zipWith (/) mat (matSumCol r mat)
 116
 117 -- | Diagonal of the matrix
 118 --
 119 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
 120 -- Vector (Z :. 3) [1,5,9]
 121 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
 122 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
 123
 124 -- | Divide by the Diagonal of the matrix
 125 --
 126 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
 127 -- Matrix (Z :. 3 :. 3)
 128 --   [ 1.0, 0.4, 0.3333333333333333,
 129 --     4.0, 1.0, 0.6666666666666666,
 130 --     7.0, 1.6,                1.0]
 131 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 132 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
 133
 134 -----------------------------------------------------------------------
 135 -- | Filters the matrix with the minimum of maximums
 136 --
 137 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
 138 -- Matrix (Z :. 3 :. 3)
 139 --   [ 0.0, 4.0, 7.0,
 140 --     0.0, 5.0, 8.0,
 141 --     0.0, 6.0, 9.0]
 142 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
 143 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
 144   where
 145     miniMax' = (the $ minimum $ maximum m)
 146
 147 -- | Filters the matrix with a constant
 148 --
 149 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
 150 -- Matrix (Z :. 3 :. 3)
 151 --   [ 0.0, 0.0, 7.0,
 152 --     0.0, 0.0, 8.0,
 153 --     0.0, 6.0, 9.0]
 154 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
 155 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
 156
 157 -----------------------------------------------------------------------
 158 -- * Measures of proximity
 159 -----------------------------------------------------------------------
 160 -- ** Conditional distance
 161
 162 -- *** Conditional distance (basic)
 163
 164 -- | Conditional distance (basic version)
 165 --
 166 -- 2 main measures are actually implemented in order to compute the
 167 -- proximity of two terms: conditional and distributional
 168 --
 169 -- Conditional measure is an absolute measure which reflects
 170 -- interactions of 2 terms in the corpus.
 171 measureConditional :: Matrix Int -> Matrix Double
 172 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
 173 measureConditional m = run $ matProba (dim m)
 174                            $ map fromIntegral
 175                            $ use m
 176
 177
 178 -- *** Conditional distance (advanced)
 179
 180 -- | Conditional distance (advanced version)
 181 --
 182 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
 183 -- "confidence" , is the maximum probability between @i@ and @j@ to see
 184 -- @i@ in the same context of @j@ knowing @j@.
 185 --
 186 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
 187 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
 188 --
 189 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
 190 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 191 conditional' m = ( run $ ie $ map fromIntegral $ use m
 192                  , run $ sg $ map fromIntegral $ use m
 193                  )
 194   where
 195     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
 196     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
 197     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
 198     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
 199
 200     n :: Exp Double
 201     n = P.fromIntegral r
 202
 203     r :: Dim
 204     r = dim m
 205
 206     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
 207     xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
 208     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
 209     ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
 210
 211 -----------------------------------------------------------------------
 212 -- ** Distributional Distance
 213
 214 -- | Distributional Distance Measure
 215 --
 216 -- Distributional measure is a relative measure which depends on the
 217 -- selected list, it represents structural equivalence of mutual information.
 218 --
 219 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
 220 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 221 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
 222 --
 223 -- Mutual information
 224 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 225 --
 226 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 227 --                        \[C{ij}\]
 228 --
 229 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 230 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 231 --
 232 -- Total cooccurrences of term @i@ given a map list of size @m@
 233 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 234 --
 235 -- Total cooccurrences of terms given a map list of size @m@
 236 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 237 --
 238 distributional :: Matrix Int -> Matrix Double
 239 distributional m = run $ matMiniMax
 240                        $ ri
 241                        $ map fromIntegral  -- ^ from Int to Double
 242                        $ use m             -- ^ push matrix in Accelerate type
 243   where
 244     -- filter  m = zipWith (\a b -> max a b) m (transpose m)
 245
 246     ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 247     ri mat = zipWith (/) mat1 mat2
 248       where
 249         mat1 = matSumCol n $ zipWith min (s_mi mat) (s_mi $ transpose mat)
 250         mat2 = matSumCol n mat
 251
 252     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 253     s_mi m' = zipWith (\a b -> log (a/b))  m'
 254             $ zipWith (/) (crossProduct m') (total m')
 255
 256     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 257     total = replicate (constant (Z :. n :. n)) . sum . sum
 258
 259     n :: Dim
 260     n = dim m
 261
 262     crossProduct :: Acc (Matrix Double) -> Acc (Matrix Double)
 263     crossProduct m''' = zipWith (*) (cross m'''  ) (cross (transpose m'''))
 264     cross :: Acc (Matrix Double) -> Acc (Matrix Double)
 265     cross mat         = zipWith (-) (matSumCol n mat) (mat)
 266
 267 -- | cross
 268 {-
 269 cross :: Matrix Double -> Matrix Double
 270 cross mat = run $ zipWith (-) (matSumCol n mat') (mat')
 271   where
 272     mat' = use mat
 273     n = dim mat
 274 -}
 275
 276
 277 -----------------------------------------------------------------------
 278 -----------------------------------------------------------------------
 279 -- * Specificity and Genericity
 280
 281 {- | Metric Specificity and genericity: select terms
 282
 283 - let N termes and occurrences of i \[N{i}\]
 284
 285 - Cooccurrences of i and j \[N{ij}\]
 286 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
 287
 288 - Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
 289 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
 290
 291 - \[Inclusion (i) = Gen(i) + Spec(i)\)
 292 - \[GenericityScore = Gen(i)- Spec(i)\]
 293
 294 - References: Science mapping with asymmetrical paradigmatic proximity
 295 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
 296 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
 297 arXiv:0803.2315 [cs.OH]
 298 -}
 299 type InclusionExclusion    = Double
 300 type SpecificityGenericity = Double
 301
 302 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 303 type SymetricMatrix    = Matrix
 304 type NonSymetricMatrix = Matrix
 305
 306
 307 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
 308 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 309   where
 310     run' fun mat = run $ fun $ map fromIntegral $ use mat
 311
 312     -- | Inclusion (i) = Gen(i)+Spec(i)
 313     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 314     inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
 315
 316     -- | Genericity score = Gen(i)- Spec(i)
 317     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 318     specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
 319
 320     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 321     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 322     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 323
 324     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 325     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 326     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 327
 328     cardN :: Exp Double
 329     cardN = constant (P.fromIntegral (dim m) :: Double)
 330
 331
 332 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 333 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 334 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
 335 p_ij m = zipWith (/) m (n_jj m)
 336   where
 337     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 338     n_jj myMat' = backpermute (shape m)
 339                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 340                                    -> (Z :. j :. j)
 341                                 )
 342                          ) myMat'
 343
 344 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 345 -- to test
 346 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 347 p_ji = transpose . p_ij
 348
 349
 350 -- | Step to ckeck the result in visual/qualitative tests
 351 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 352 incExcSpeGen_proba m = run' pro m
 353   where
 354     run' fun mat = run $ fun $ map fromIntegral $ use mat
 355
 356     pro mat = p_ji mat
 357
 358 {-
 359 -- | Hypothesis to test maybe later (or not)
 360 -- TODO ask accelerate for instances to ease such writtings:
 361 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 362 p_ m = zipWith (/) m (n_ m)
 363   where
 364     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 365     n_ m = backpermute (shape m)
 366                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 367                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 368                                 )
 369                          ) m
 370 -}
 371
 372 -- * For Tests (to be removed)
 373 -- | Test perfermance with this matrix
 374 -- TODO : add this in a benchmark folder
 375 distriTest :: Matrix Double
 376 distriTest = distributional $ matrix 100 [1..]
 377 -----------------------------------------------------------------------
 378