src/Gargantext/Viz/Graph/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Graph.Distances.Matrix
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13
  14 Implementation use Accelerate library which enables GPU and CPU computation:
  15
  16   * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
  17     [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
  18     In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
  19
  20   * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
  21     [Optimising Purely Functional GPU Programs][MCKL13].
  22     In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
  23
  24   * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
  25     [Embedding Foreign Code][CMCK14].
  26     In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
  27
  28   * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
  29     [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
  30     In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
  31
  32 -}
  33
  34 {-# LANGUAGE NoImplicitPrelude   #-}
  35 {-# LANGUAGE FlexibleContexts    #-}
  36 {-# LANGUAGE TypeFamilies        #-}
  37 {-# LANGUAGE TypeOperators       #-}
  38 {-# LANGUAGE ScopedTypeVariables #-}
  39
  40 module Gargantext.Viz.Graph.Distances.Matrice
  41   where
  42
  43 import Data.Array.Accelerate
  44 import Data.Array.Accelerate.Interpreter (run)
  45
  46 import qualified Gargantext.Prelude as P
  47
  48
  49 -----------------------------------------------------------------------
  50 -- | Define a vector
  51 --
  52 -- >>> vector 3
  53 -- Vector (Z :. 3) [0,1,2]
  54 vector :: Int -> (Array (Z :. Int) Int)
  55 vector n = fromList (Z :. n) [0..n]
  56
  57 -- | Define a matrix
  58 --
  59 -- >>> matrix 3 ([1..] :: [Double])
  60 -- Matrix (Z :. 3 :. 3)
  61 --   [ 1.0, 2.0, 3.0,
  62 --     4.0, 5.0, 6.0,
  63 --     7.0, 8.0, 9.0]
  64 matrix :: Elt c => Int -> [c] -> Matrix c
  65 matrix n l = fromList (Z :. n :. n) l
  66
  67 -- | Two ways to get the rank (as documentation)
  68 --
  69 -- >>> rank (matrix 3 ([1..] :: [Int]))
  70 -- 2
  71 rank :: (Matrix a) -> Int
  72 rank m = arrayRank $ arrayShape m
  73
  74 -----------------------------------------------------------------------
  75 -- | Dimension of a square Matrix
  76 -- How to force use with SquareMatrix ?
  77 type Dim = Int
  78
  79 -- | Get Dimension of a square Matrix
  80 --
  81 -- >>> dim (matrix 3 ([1..] :: [Int]))
  82 -- 3
  83 dim :: Matrix a -> Dim
  84 dim m = n
  85   where
  86     Z :. _ :. n = arrayShape m
  87     -- indexTail (arrayShape m)
  88
  89 -----------------------------------------------------------------------
  90
  91 -- | Sum of a Matrix by Column
  92 --
  93 -- >>> run $ matSum 3 (use $ matrix 3 [1..])
  94 -- Matrix (Z :. 3 :. 3)
  95 --   [ 12.0, 15.0, 18.0,
  96 --     12.0, 15.0, 18.0,
  97 --     12.0, 15.0, 18.0]
  98 matSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  99 matSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum $ transpose mat
 100
 101
 102 -- | Proba computes de probability matrix: all cells divided by thee sum of its column
 103 -- if you need get the probability on the lines, just transpose it
 104 --
 105 -- >>> run $ matProba 3 (use $ matrix 3 [1..])
 106 -- Matrix (Z :. 3 :. 3)
 107 --   [ 8.333333333333333e-2, 0.13333333333333333, 0.16666666666666666,
 108 --       0.3333333333333333,  0.3333333333333333,  0.3333333333333333,
 109 --       0.5833333333333334,  0.5333333333333333,                 0.5]
 110 matProba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 111 matProba r mat = zipWith (/) mat (matSum r mat)
 112
 113 -- | Diagonal of the matrix
 114 --
 115 -- >>> run $ diag (use $ matrix 3 ([1..] :: [Int]))
 116 -- Vector (Z :. 3) [1,5,9]
 117 diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
 118 diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
 119
 120 -- | Divide by the Diagonal of the matrix
 121 --
 122 -- >>> run $ divByDiag 3 (use $ matrix 3 ([1..] :: [Double]))
 123 -- Matrix (Z :. 3 :. 3)
 124 --   [ 1.0, 0.4, 0.3333333333333333,
 125 --     4.0, 1.0, 0.6666666666666666,
 126 --     7.0, 1.6,                1.0]
 127 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
 128 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
 129
 130 -----------------------------------------------------------------------
 131 -- | Filters the matrix with the minimum of maximums
 132 --
 133 -- >>> run $ matMiniMax $ use $ matrix 3 [1..]
 134 -- Matrix (Z :. 3 :. 3)
 135 --   [ 0.0, 4.0, 7.0,
 136 --     0.0, 5.0, 8.0,
 137 --     0.0, 6.0, 9.0]
 138 matMiniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
 139 matMiniMax m = map (\x -> ifThenElse (x > miniMax') x 0) (transpose m)
 140   where
 141     miniMax' = (the $ minimum $ maximum m)
 142
 143 -- | Filters the matrix with a constant
 144 --
 145 -- >>> run $ matFilter 5 $ use $ matrix 3 [1..]
 146 -- Matrix (Z :. 3 :. 3)
 147 --   [ 0.0, 0.0, 7.0,
 148 --     0.0, 0.0, 8.0,
 149 --     0.0, 6.0, 9.0]
 150 matFilter :: Double -> Acc (Matrix Double) -> Acc (Matrix Double)
 151 matFilter t m = map (\x -> ifThenElse (x > (constant t)) x 0) (transpose m)
 152
 153 -----------------------------------------------------------------------
 154 -- * Measures of proximity
 155 -----------------------------------------------------------------------
 156 -- ** Conditional distance
 157
 158 -- *** Conditional distance (basic)
 159
 160 -- | Conditional distance (basic version)
 161 --
 162 -- 2 main measures are actually implemented in order to compute the
 163 -- proximity of two terms: conditional and distributional
 164 --
 165 -- Conditional measure is an absolute measure which reflects
 166 -- interactions of 2 terms in the corpus.
 167 measureConditional :: Matrix Int -> Matrix Double
 168 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
 169 measureConditional m = run (matProba (dim m) $ map fromIntegral $ use m)
 170
 171
 172 -- *** Conditional distance (advanced)
 173
 174 -- | Conditional distance (advanced version)
 175 --
 176 -- The conditional measure P(i|j) of 2 terms @i@ and @j@, also called
 177 -- "confidence" , is the maximum probability between @i@ and @j@ to see
 178 -- @i@ in the same context of @j@ knowing @j@.
 179 --
 180 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
 181 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
 182 --
 183 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
 184 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 185 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
 186   where
 187     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
 188     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
 189     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
 190     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
 191
 192     n :: Exp Double
 193     n = P.fromIntegral r
 194
 195     r :: Dim
 196     r = dim m
 197
 198     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
 199     xs mat = zipWith (-) (matSum r $ matProba r mat) (matProba r mat)
 200     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
 201     ys mat = zipWith (-) (matSum r $ transpose $ matProba r mat) (matProba r mat)
 202
 203 -----------------------------------------------------------------------
 204 -- ** Distributional Distance
 205
 206 -- | Distributional Distance Measure
 207 --
 208 -- Distributional measure is a relative measure which depends on the
 209 -- selected list, it represents structural equivalence.
 210 --
 211 -- The distributional measure P(c) of @i@ and @j@ terms is: \[
 212 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
 213 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}} \]
 214 --
 215 -- Mutual information
 216 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 217 --
 218 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 219 --                        \[C{ij}\]
 220 --
 221 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 222 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 223 --
 224 -- Total cooccurrences of term @i@ given a map list of size @m@
 225 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 226 --
 227 -- Total cooccurrences of terms given a map list of size @m@
 228 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 229 --
 230 distributional :: Matrix Int -> Matrix Double
 231 distributional m = run $ matMiniMax $ ri (map fromIntegral $ use m)
 232   where
 233
 234     -- filter  m = zipWith (\a b -> max a b) m (transpose m)
 235
 236     ri mat = zipWith (/) mat1 mat2
 237       where
 238         mat1 = matSum n $ zipWith min (s_mi mat) (s_mi $ transpose mat)
 239         mat2 = matSum n mat
 240
 241     s_mi    m'  = zipWith (\a b -> log (a/b))  m'
 242               $ zipWith (/) (crossProduct m') (total m')
 243
 244     total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
 245     n         = dim m
 246
 247     crossProduct m''' = zipWith (*) (cross m'''  ) (cross (transpose m'''))
 248     cross mat         = zipWith (-) (matSum n mat) (mat)
 249
 250 -----------------------------------------------------------------------
 251 -----------------------------------------------------------------------
 252 -- * Specificity and Genericity
 253
 254 {- | Metric Specificity and genericity: select terms
 255
 256 - let N termes and occurrences of i \[N{i}\]
 257
 258 - Cooccurrences of i and j \[N{ij}\]
 259 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
 260
 261 - Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
 262 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
 263
 264 - \[Inclusion (i) = Gen(i) + Spec(i)\)
 265 - \[GenericityScore = Gen(i)- Spec(i)\]
 266
 267 - References: Science mapping with asymmetrical paradigmatic proximity
 268 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
 269 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
 270 arXiv:0803.2315 [cs.OH]
 271 -}
 272 type InclusionExclusion    = Double
 273 type SpecificityGenericity = Double
 274
 275 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 276 type SymetricMatrix    = Matrix
 277 type NonSymetricMatrix = Matrix
 278
 279
 280 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
 281 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 282   where
 283     run' fun mat = run $ fun $ map fromIntegral $ use mat
 284
 285     -- | Inclusion (i) = Gen(i)+Spec(i)
 286     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 287     inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
 288
 289     -- | Genericity score = Gen(i)- Spec(i)
 290     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 291     specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
 292
 293     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 294     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 295     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 296
 297     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 298     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 299     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 300
 301     cardN :: Exp Double
 302     cardN = constant (P.fromIntegral (dim m) :: Double)
 303
 304
 305 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 306 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 307 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
 308 p_ij m = zipWith (/) m (n_jj m)
 309   where
 310     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 311     n_jj myMat' = backpermute (shape m)
 312                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 313                                    -> (Z :. j :. j)
 314                                 )
 315                          ) myMat'
 316
 317 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 318 -- to test
 319 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 320 p_ji = transpose . p_ij
 321
 322
 323 -- | Step to ckeck the result in visual/qualitative tests
 324 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 325 incExcSpeGen_proba m = run' pro m
 326   where
 327     run' fun mat = run $ fun $ map fromIntegral $ use mat
 328
 329     pro mat = p_ji mat
 330
 331 {-
 332 -- | Hypothesis to test maybe later (or not)
 333 -- TODO ask accelerate for instances to ease such writtings:
 334 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 335 p_ m = zipWith (/) m (n_ m)
 336   where
 337     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 338     n_ m = backpermute (shape m)
 339                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 340                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 341                                 )
 342                          ) m
 343 -}
 344
 345 -- * For Tests (to be removed)
 346 -- | Test perfermance with this matrix
 347 -- TODO : add this in a benchmark folder
 348 distriTest :: Matrix Double
 349 distriTest = distributional $ matrix 100 [1..]
 350 -----------------------------------------------------------------------
 351