src/Gargantext/Viz/Graph/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Graph.Distances.Matrix
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13
  14 Implementation use Accelerate library which enables GPU and CPU computation:
  15
  16   * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
  17     [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
  18     In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
  19
  20   * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
  21     [Optimising Purely Functional GPU Programs][MCKL13].
  22     In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
  23
  24   * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
  25     [Embedding Foreign Code][CMCK14].
  26     In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
  27
  28   * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
  29     [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
  30     In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
  31
  32 -}
  33
  34 {-# LANGUAGE NoImplicitPrelude   #-}
  35 {-# LANGUAGE FlexibleContexts    #-}
  36 {-# LANGUAGE TypeFamilies        #-}
  37 {-# LANGUAGE TypeOperators       #-}
  38 {-# LANGUAGE ScopedTypeVariables #-}
  39
  40 module Gargantext.Viz.Graph.Distances.Matrice
  41   where
  42
  43 import Data.Array.Accelerate
  44 import Data.Array.Accelerate.Interpreter (run)
  45
  46 import qualified Gargantext.Prelude as P
  47
  48
  49 -----------------------------------------------------------------------
  50 -- | Define a vector
  51 --
  52 -- >>> vector 3
  53 -- Vector (Z :. 3) [0,1,2]
  54 vector :: Int -> (Array (Z :. Int) Int)
  55 vector n = fromList (Z :. n) [0..n]
  56
  57 -- | Define a matrix
  58 --
  59 -- >>> matrix 3 ([1..] :: [Double])
  60 -- Matrix (Z :. 3 :. 3)
  61 --   [ 1.0, 2.0, 3.0,
  62 --     4.0, 5.0, 6.0,
  63 --     7.0, 8.0, 9.0]
  64 matrix :: Elt c => Int -> [c] -> Matrix c
  65 matrix n l = fromList (Z :. n :. n) l
  66
  67 -- | Two ways to get the rank (as documentation)
  68 --
  69 -- >>> rank (matrix 3 ([1..] :: [Int]))
  70 -- 2
  71 rank :: (Matrix a) -> Int
  72 rank m = arrayRank $ arrayShape m
  73
  74 -----------------------------------------------------------------------
  75 -- | Dimension of a square Matrix
  76 -- How to force use with SquareMatrix ?
  77 type Dim = Int
  78
  79 -- | Get Dimension of a square Matrix
  80 --
  81 -- >>> dim (matrix 3 ([1..] :: [Int]))
  82 -- 3
  83 dim :: Matrix a -> Dim
  84 dim m = n
  85   where
  86     Z :. _ :. n = arrayShape m
  87     -- indexTail (arrayShape m)
  88
  89 -----------------------------------------------------------------------
  90
  91 -- | Sum of a Matrix by Column
  92 --
  93 -- >>> run $ matSum 3 (use $ matrix 3 [1..])
  94 -- Matrix (Z :. 3 :. 3)
  95 --   [ 12.0, 15.0, 18.0,
  96 --     12.0, 15.0, 18.0,
  97 --     12.0, 15.0, 18.0]
  98 matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  99 matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
 100
 101
 102 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
 103 -- if you need get the probability on the lines, just transpose it
 104 --
 105 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
 106 -- Matrix (Z :. 3 :. 3)
 107 --   [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
 108 --       0.3333333333333333,  0.3333333333333333,  0.3333333333333333,
 109 --       0.5833333333333334,  0.5333333333333333,                 0.5]
 110 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 111 matProba r mat = zipWith (/) mat (matSum r mat)
 112
 113 -- | Diagonal of the matrix
 114 --
 115 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
 116 -- Vector (Z :. 3) [1,5,9]
 117 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
 118 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
 119
 120
 121 -- | Divide by the Diagonal of the matrix
 122 --
 123 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
 124 -- Matrix (Z :. 3 :. 3)
 125 --   [ 1.0, 0.4, 0.3333333333333333,
 126 --     4.0, 1.0, 0.6666666666666666,
 127 --     7.0, 1.6,                1.0]
 128 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 129 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
 130
 131 -----------------------------------------------------------------------
 132 -- | Filters the matrix with the minimum of maximums
 133 --
 134 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
 135 -- Matrix (Z :. 3 :. 3)
 136 --   [ 0.0, 4.0, 7.0,
 137 --     0.0, 5.0, 8.0,
 138 --     0.0, 6.0, 9.0]
 139 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
 140 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
 141   where
 142     miniMax' = (the $ minimum $ maximum m)
 143
 144 -- | Filters the matrix with a constant
 145 --
 146 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
 147 -- Matrix (Z :. 3 :. 3)
 148 --   [ 0.0, 0.0, 7.0,
 149 --     0.0, 0.0, 8.0,
 150 --     0.0, 6.0, 9.0]
 151 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
 152 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
 153
 154 -----------------------------------------------------------------------
 155 -- * Measures of proximity
 156 -----------------------------------------------------------------------
 157 -- ** Conditional distance
 158
 159 -- *** Conditional distance (basic)
 160
 161 -- | Conditional distance (basic version)
 162 --
 163 -- 2 main measures are actually implemented in order to compute the
 164 -- proximity of two terms: conditional and distributional
 165 --
 166 -- Conditional measure is an absolute measure which reflects
 167 -- interactions of 2 terms in the corpus.
 168 measureConditional :: Matrix Int -> Matrix Double
 169 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
 170 measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
 171
 172
 173 -- *** Conditional distance (advanced)
 174
 175 -- | Conditional distance (advanced version)
 176 --
 177 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
 178 -- "confidence" , is the maximum probability between @i@ and @j@ to see
 179 -- @i@ in the same context of @j@ knowing @j@.
 180 --
 181 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
 182 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
 183 --
 184 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
 185 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 186 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
 187   where
 188     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
 189     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
 190     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
 191     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
 192
 193     n :: Exp Double
 194     n = P.fromIntegral r
 195
 196     r :: Dim
 197     r = dim m
 198
 199     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
 200     xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
 201     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
 202     ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
 203
 204 -----------------------------------------------------------------------
 205 -- ** Distributional Distance
 206
 207 -- | Distributional Distance Measure
 208 --
 209 -- Distributional measure is a relative measure which depends on the
 210 -- selected list, it represents structural equivalence.
 211 --
 212 -- The distributional measure \[P_c\] of @i@ and @j@ terms is: \[
 213 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 214 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
 215 --
 216 -- Mutual information
 217 -- \[S{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 218 --
 219 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 220 --                        \[C{ij}\]
 221 --
 222 -- The expected value of the cooccurrences
 223 --            \[E_{ij} = \frac {S_{i} S_{j}} {N}\]
 224 --
 225 -- Total cooccurrences of @i@ term
 226 --            \[N_{i} = \sum_{i}^{} S_{i}\]
 227 distributional :: Matrix Int -> Matrix Double
 228 distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
 229   where
 230
 231     -- filter  m = zipWith (\a b -> max a b) m (transpose m)
 232
 233     ri mat = zipWith (/) mat1 mat2
 234       where
 235         mat1 = matSum n $ zipWith min (mi mat) (mi $ transpose mat)
 236         mat2 = matSum n mat
 237
 238     mi    m'  = zipWith (\a b -> max (log $ a/b) 0)  m'
 239               $ zipWith (/) (crossProduct m') (total m')
 240
 241     total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
 242     n    = dim m
 243
 244     crossProduct m''' = zipWith (*) (cross m'''  ) (cross (transpose m'''))
 245     cross mat      = zipWith (-) (matSum n mat) (mat)
 246
 247 -----------------------------------------------------------------------
 248 -----------------------------------------------------------------------
 249 -- * Specificity and Genericity
 250
 251 {- | Metric Specificity and genericity: select terms
 252
 253 - let N termes and occurrences of i \[N{i}\]
 254
 255 - Cooccurrences of i and j \[N{ij}\]
 256 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
 257
 258 - Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
 259 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
 260
 261 - \[Inclusion (i) = Gen(i) + Spec(i)\)
 262 - \[GenericityScore = Gen(i)- Spec(i)\]
 263
 264 - References: Science mapping with asymmetrical paradigmatic proximity
 265 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
 266 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
 267 arXiv:0803.2315 [cs.OH]
 268 -}
 269 type InclusionExclusion    = Double
 270 type SpecificityGenericity = Double
 271
 272 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 273 type SymetricMatrix    = Matrix
 274 type NonSymetricMatrix = Matrix
 275
 276
 277 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
 278 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 279   where
 280     run' fun mat = run $ fun $ map fromIntegral $ use mat
 281
 282     -- | Inclusion (i) = Gen(i)+Spec(i)
 283     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 284     inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
 285
 286     -- | Genericity score = Gen(i)- Spec(i)
 287     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 288     specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
 289
 290     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 291     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 292     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 293
 294     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 295     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 296     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 297
 298     cardN :: Exp Double
 299     cardN = constant (P.fromIntegral (dim m) :: Double)
 300
 301
 302 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 303 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 304 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
 305 p_ij m = zipWith (/) m (n_jj m)
 306   where
 307     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 308     n_jj myMat' = backpermute (shape m)
 309                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 310                                    -> (Z :. j :. j)
 311                                 )
 312                          ) myMat'
 313
 314 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 315 -- to test
 316 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 317 p_ji = transpose . p_ij
 318
 319
 320 -- | Step to ckeck the result in visual/qualitative tests
 321 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 322 incExcSpeGen_proba m = run' pro m
 323   where
 324     run' fun mat = run $ fun $ map fromIntegral $ use mat
 325
 326     pro mat = p_ji mat
 327
 328 {-
 329 -- | Hypothesis to test maybe later (or not)
 330 -- TODO ask accelerate for instances to ease such writtings:
 331 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 332 p_ m = zipWith (/) m (n_ m)
 333   where
 334     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 335     n_ m = backpermute (shape m)
 336                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 337                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 338                                 )
 339                          ) m
 340 -}
 341
 342 -- * For Tests (to be removed)
 343 -- | Test perfermance with this matrix
 344 -- TODO : add this in a benchmark folder
 345 distriTest :: Matrix Double
 346 distriTest = distributional $ matrix 100 [1..]
 347 -----------------------------------------------------------------------
 348