src/Gargantext/Viz/Graph/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Graph.Distances.Matrix
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13
  14 Implementation use Accelerate library which enables GPU and CPU computation:
  15
  16   * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
  17     [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
  18     In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
  19
  20   * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
  21     [Optimising Purely Functional GPU Programs][MCKL13].
  22     In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
  23
  24   * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
  25     [Embedding Foreign Code][CMCK14].
  26     In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
  27
  28   * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
  29     [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
  30     In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
  31
  32 -}
  33
  34 {-# LANGUAGE TypeFamilies        #-}
  35 {-# LANGUAGE TypeOperators       #-}
  36 {-# LANGUAGE ScopedTypeVariables #-}
  37
  38 module Gargantext.Viz.Graph.Distances.Matrice
  39   where
  40
  41 import Data.Array.Accelerate
  42 import Data.Array.Accelerate.Interpreter (run)
  43
  44 import qualified Gargantext.Prelude as P
  45
  46
  47 -----------------------------------------------------------------------
  48 -- | Define a vector
  49 --
  50 -- >>> vector 3
  51 -- Vector (Z :. 3) [0,1,2]
  52 vector :: Int -> (Array (Z :. Int) Int)
  53 vector n = fromList (Z :. n) [0..n]
  54
  55 -- | Define a matrix
  56 --
  57 -- >>> matrix 3 ([1..] :: [Double])
  58 -- Matrix (Z :. 3 :. 3)
  59 --   [ 1.0, 2.0, 3.0,
  60 --     4.0, 5.0, 6.0,
  61 --     7.0, 8.0, 9.0]
  62 matrix :: Elt c => Int -> [c] -> Matrix c
  63 matrix n l = fromList (Z :. n :. n) l
  64
  65 -- | Two ways to get the rank (as documentation)
  66 --
  67 -- >>> rank (matrix 3 ([1..] :: [Int]))
  68 -- 2
  69 rank :: (Matrix a) -> Int
  70 rank m = arrayRank $ arrayShape m
  71
  72 -----------------------------------------------------------------------
  73 -- | Dimension of a square Matrix
  74 -- How to force use with SquareMatrix ?
  75 type Dim = Int
  76
  77 -- | Get Dimension of a square Matrix
  78 --
  79 -- >>> dim (matrix 3 ([1..] :: [Int]))
  80 -- 3
  81 dim :: Matrix a -> Dim
  82 dim m = n
  83   where
  84     Z :. _ :. n = arrayShape m
  85     -- indexTail (arrayShape m)
  86
  87 -----------------------------------------------------------------------
  88
  89 -- | Sum of a Matrix by Column
  90 --
  91 -- >>> run $ matSum 3 (use $ matrix 3 [1..])
  92 -- Matrix (Z :. 3 :. 3)
  93 --   [ 12.0, 15.0, 18.0,
  94 --     12.0, 15.0, 18.0,
  95 --     12.0, 15.0, 18.0]
  96 matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  97 matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
  98
  99
 100 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
 101 -- if you need get the probability on the lines, just transpose it
 102 --
 103 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
 104 -- Matrix (Z :. 3 :. 3)
 105 --   [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
 106 --       0.3333333333333333,  0.3333333333333333,  0.3333333333333333,
 107 --       0.5833333333333334,  0.5333333333333333,                 0.5]
 108 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 109 matProba r mat = zipWith (/) mat (matSum r mat)
 110
 111 -- | Diagonal of the matrix
 112 --
 113 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
 114 -- Vector (Z :. 3) [1,5,9]
 115 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
 116 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
 117
 118 -- | Divide by the Diagonal of the matrix
 119 --
 120 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
 121 -- Matrix (Z :. 3 :. 3)
 122 --   [ 1.0, 0.4, 0.3333333333333333,
 123 --     4.0, 1.0, 0.6666666666666666,
 124 --     7.0, 1.6,                1.0]
 125 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 126 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
 127
 128 -----------------------------------------------------------------------
 129 -- | Filters the matrix with the minimum of maximums
 130 --
 131 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
 132 -- Matrix (Z :. 3 :. 3)
 133 --   [ 0.0, 4.0, 7.0,
 134 --     0.0, 5.0, 8.0,
 135 --     0.0, 6.0, 9.0]
 136 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
 137 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
 138   where
 139     miniMax' = (the $ minimum $ maximum m)
 140
 141 -- | Filters the matrix with a constant
 142 --
 143 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
 144 -- Matrix (Z :. 3 :. 3)
 145 --   [ 0.0, 0.0, 7.0,
 146 --     0.0, 0.0, 8.0,
 147 --     0.0, 6.0, 9.0]
 148 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
 149 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
 150
 151 -----------------------------------------------------------------------
 152 -- * Measures of proximity
 153 -----------------------------------------------------------------------
 154 -- ** Conditional distance
 155
 156 -- *** Conditional distance (basic)
 157
 158 -- | Conditional distance (basic version)
 159 --
 160 -- 2 main measures are actually implemented in order to compute the
 161 -- proximity of two terms: conditional and distributional
 162 --
 163 -- Conditional measure is an absolute measure which reflects
 164 -- interactions of 2 terms in the corpus.
 165 measureConditional :: Matrix Int -> Matrix Double
 166 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
 167 measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
 168
 169
 170 -- *** Conditional distance (advanced)
 171
 172 -- | Conditional distance (advanced version)
 173 --
 174 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
 175 -- "confidence" , is the maximum probability between @i@ and @j@ to see
 176 -- @i@ in the same context of @j@ knowing @j@.
 177 --
 178 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
 179 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
 180 --
 181 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
 182 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 183 conditional' m = ( run $ ie $ map fromIntegral $ use m
 184                  , run $ sg $ map fromIntegral $ use m
 185                  )
 186   where
 187     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
 188     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
 189     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
 190     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
 191
 192     n :: Exp Double
 193     n = P.fromIntegral r
 194
 195     r :: Dim
 196     r = dim m
 197
 198     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
 199     xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
 200     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
 201     ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
 202
 203 -----------------------------------------------------------------------
 204 -- ** Distributional Distance
 205
 206 -- | Distributional Distance Measure
 207 --
 208 -- Distributional measure is a relative measure which depends on the
 209 -- selected list, it represents structural equivalence.
 210 --
 211 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
 212 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 213 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
 214 --
 215 -- Mutual information
 216 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 217 --
 218 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 219 --                        \[C{ij}\]
 220 --
 221 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 222 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 223 --
 224 -- Total cooccurrences of term @i@ given a map list of size @m@
 225 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 226 --
 227 -- Total cooccurrences of terms given a map list of size @m@
 228 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 229 --
 230 distributional :: Matrix Int -> Matrix Double
 231 distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
 232   where
 233
 234     -- filter  m = zipWith (\a b -> max a b) m (transpose m)
 235
 236     ri mat = zipWith (/) mat1 mat2
 237       where
 238         mat1 = matSum n $ zipWith min (s_mi mat) (s_mi $ transpose mat)
 239         mat2 = matSum n mat
 240
 241     s_mi    m'  = zipWith (\a b -> log (a/b))  m'
 242               $ zipWith (/) (crossProduct m') (total m')
 243
 244     total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
 245
 246     n :: Dim
 247     n = dim m
 248
 249     crossProduct m''' = zipWith (*) (cross m'''  ) (cross (transpose m'''))
 250     cross mat         = zipWith (-) (matSum n mat) (mat)
 251
 252 -----------------------------------------------------------------------
 253 -----------------------------------------------------------------------
 254 -- * Specificity and Genericity
 255
 256 {- | Metric Specificity and genericity: select terms
 257
 258 - let N termes and occurrences of i \[N{i}\]
 259
 260 - Cooccurrences of i and j \[N{ij}\]
 261 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
 262
 263 - Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
 264 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
 265
 266 - \[Inclusion (i) = Gen(i) + Spec(i)\)
 267 - \[GenericityScore = Gen(i)- Spec(i)\]
 268
 269 - References: Science mapping with asymmetrical paradigmatic proximity
 270 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
 271 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
 272 arXiv:0803.2315 [cs.OH]
 273 -}
 274 type InclusionExclusion    = Double
 275 type SpecificityGenericity = Double
 276
 277 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 278 type SymetricMatrix    = Matrix
 279 type NonSymetricMatrix = Matrix
 280
 281
 282 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
 283 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 284   where
 285     run' fun mat = run $ fun $ map fromIntegral $ use mat
 286
 287     -- | Inclusion (i) = Gen(i)+Spec(i)
 288     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 289     inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
 290
 291     -- | Genericity score = Gen(i)- Spec(i)
 292     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 293     specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
 294
 295     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 296     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 297     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 298
 299     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 300     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 301     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 302
 303     cardN :: Exp Double
 304     cardN = constant (P.fromIntegral (dim m) :: Double)
 305
 306
 307 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 308 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 309 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
 310 p_ij m = zipWith (/) m (n_jj m)
 311   where
 312     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 313     n_jj myMat' = backpermute (shape m)
 314                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 315                                    -> (Z :. j :. j)
 316                                 )
 317                          ) myMat'
 318
 319 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 320 -- to test
 321 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 322 p_ji = transpose . p_ij
 323
 324
 325 -- | Step to ckeck the result in visual/qualitative tests
 326 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 327 incExcSpeGen_proba m = run' pro m
 328   where
 329     run' fun mat = run $ fun $ map fromIntegral $ use mat
 330
 331     pro mat = p_ji mat
 332
 333 {-
 334 -- | Hypothesis to test maybe later (or not)
 335 -- TODO ask accelerate for instances to ease such writtings:
 336 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 337 p_ m = zipWith (/) m (n_ m)
 338   where
 339     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 340     n_ m = backpermute (shape m)
 341                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 342                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 343                                 )
 344                          ) m
 345 -}
 346
 347 -- * For Tests (to be removed)
 348 -- | Test perfermance with this matrix
 349 -- TODO : add this in a benchmark folder
 350 distriTest :: Matrix Double
 351 distriTest = distributional $ matrix 100 [1..]
 352 -----------------------------------------------------------------------
 353