src/Gargantext/Viz/Graph/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Graph.Distances.Matrix
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10
  11 2 main measures are actually implemented in order to compute the proximity of two terms.
  12 - Conditional measure is an absolute measure which reflects interactions of 2 terms in the corpus.
  13 - Distributional measure is a relative measure which depends on the selected list, it represents structural equivalence.
  14
  15 Motivation and definition of the @Conditional@ distance.
  16
  17 Implementation use Accelerate library :
  18   * Manuel M. T. Chakravarty, Gabriele Keller, Sean Lee, Trevor L. McDonell, and Vinod Grover.
  19     [Accelerating Haskell Array Codes with Multicore GPUs][CKLM+11].
  20     In _DAMP '11: Declarative Aspects of Multicore Programming_, ACM, 2011.
  21
  22   * Trevor L. McDonell, Manuel M. T. Chakravarty, Gabriele Keller, and Ben Lippmeier.
  23     [Optimising Purely Functional GPU Programs][MCKL13].
  24     In _ICFP '13: The 18th ACM SIGPLAN International Conference on Functional Programming_, ACM, 2013.
  25
  26   * Robert Clifton-Everest, Trevor L. McDonell, Manuel M. T. Chakravarty, and Gabriele Keller.
  27     [Embedding Foreign Code][CMCK14].
  28     In _PADL '14: The 16th International Symposium on Practical Aspects of Declarative Languages_, Springer-Verlag, LNCS, 2014.
  29
  30   * Trevor L. McDonell, Manuel M. T. Chakravarty, Vinod Grover, and Ryan R. Newton.
  31     [Type-safe Runtime Code Generation: Accelerate to LLVM][MCGN15].
  32     In _Haskell '15: The 8th ACM SIGPLAN Symposium on Haskell_, ACM, 2015.
  33
  34 -}
  35
  36 {-# LANGUAGE NoImplicitPrelude   #-}
  37 {-# LANGUAGE FlexibleContexts    #-}
  38 {-# LANGUAGE TypeFamilies        #-}
  39 {-# LANGUAGE TypeOperators       #-}
  40 {-# LANGUAGE ScopedTypeVariables #-}
  41
  42 module Gargantext.Viz.Graph.Distances.Matrice
  43   where
  44
  45 import Data.Array.Accelerate
  46 import Data.Array.Accelerate.Interpreter (run)
  47
  48 import qualified Gargantext.Prelude as P
  49
  50
  51 -----------------------------------------------------------------------
  52 -- | Test perf.
  53 distriTest :: Matrix Double
  54 distriTest = distributional $ myMat 100
  55 -----------------------------------------------------------------------
  56
  57 vector :: Int -> (Array (Z :. Int) Int)
  58 vector n = fromList (Z :. n) [0..n]
  59
  60 matrix :: Elt c => Int -> [c] -> Matrix c
  61 matrix n l = fromList (Z :. n :. n) l
  62
  63 myMat :: Int -> Matrix Int
  64 myMat n = matrix n [1..]
  65
  66 -- | Two ways to get the rank (as documentation)
  67 rank :: (Matrix a) -> Int
  68 rank m = arrayRank $ arrayShape m
  69
  70 -----------------------------------------------------------------------
  71 -- | Dimension of a square Matrix
  72 -- How to force use with SquareMatrix ?
  73 type Dim = Int
  74
  75 dim :: Matrix a -> Dim
  76 dim m = n
  77   where
  78     Z :. _ :. n = arrayShape m
  79     -- indexTail (arrayShape m)
  80
  81 -----------------------------------------------------------------------
  82 proba :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  83 proba r mat = zipWith (/) mat (mkSum r mat)
  84
  85 mkSum :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  86 mkSum r mat = replicate (constant (Z :. (r :: Int) :. All)) $ sum mat
  87
  88 -- | divByDiag
  89 divByDiag :: Dim -> Acc (Matrix Double) -> Acc (Matrix Double)
  90 divByDiag d mat = zipWith (/) mat (replicate (constant (Z :. (d :: Int) :. All)) $ diag mat)
  91   where
  92     diag :: Elt e => Acc (Matrix e) -> Acc (Vector e)
  93     diag m = backpermute (indexTail (shape m)) (lift1 (\(Z :. x) -> (Z :. x :. (x :: Exp Int)))) m
  94 -----------------------------------------------------------------------
  95
  96 miniMax :: Acc (Matrix Double) -> Acc (Matrix Double)
  97 miniMax m = map (\x -> ifThenElse (x > miniMax') x 0) m
  98   where
  99     miniMax' = (the $ minimum $ maximum m)
 100
 101 -- | Conditional distance (basic version)
 102
 103 conditional :: Matrix Int -> Matrix Double
 104 conditional m = run (miniMax $ proba (dim m) $ map fromIntegral $ use m)
 105
 106
 107 -- | Conditional distance (advanced version)
 108 -- The conditional measure \[P_c\] of 2 terms @i@ and @j@, also called "confidence"
 109 -- , is the maximum probability between @i@ and @j@. If \[n_i\] (resp.
 110 -- \[n_j\]) is the number of occurrences of @i@ (resp. @j@) in the corpus and _[n_{ij}\] the number of its occurrences we get:
 111 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
 112 conditional' :: Matrix Int -> (Matrix InclusionExclusion, Matrix SpecificityGenericity)
 113 conditional' m = (run $ ie $ map fromIntegral $ use m, run $ sg $ map fromIntegral $ use m)
 114   where
 115     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
 116     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
 117     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
 118     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
 119
 120     n :: Exp Double
 121     n = P.fromIntegral r
 122
 123     r :: Dim
 124     r = dim m
 125
 126     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
 127     xs mat = zipWith (-) (proba r mat) (mkSum r $ proba r mat)
 128     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
 129     ys mat = zipWith (-) (proba r mat) (mkSum r $ transpose $ proba r mat)
 130
 131 -----------------------------------------------------------------------
 132
 133 -- | Distributional Distance
 134 -- The distributional measure \[P_c\] of @i@ and @j@ terms is:
 135 -- \[ S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik}, MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}}^{}}
 136 -- \]
 137 -- \[S{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\] is mutual information
 138 -- \[C{ij}\] is number of cooccurrences of @i@ and @j@ in the same context of text
 139 -- \[E_{ij} = \frac {S_{i} S_{j}} {N}\] is the expected value of the cooccurrences
 140 -- \[N_{i} = \sum_{i}^{} S_{i}\] is the total cooccurrences of @i@ term
 141 distributional :: Matrix Int -> Matrix Double
 142 distributional m = run $ miniMax $ ri (map fromIntegral $ use m)
 143   where
 144     n    = dim m
 145
 146     -- filter  m = zipWith (\a b -> max a b) m (transpose m)
 147
 148     ri mat = zipWith (/) mat1 mat2
 149       where
 150         mat1 = mkSum n $ zipWith min (mi mat) (mi $ transpose mat)
 151         mat2 = mkSum n mat
 152
 153     mi    m'  = zipWith (\a b -> max (log $ a/b) 0)  m'
 154               $ zipWith (/) (crossProduct m') (total m')
 155
 156     total m'' = replicate (constant (Z :. n :. n)) $ fold (+) 0 $ fold (+) 0 m''
 157
 158     crossProduct m''' = zipWith (*) (cross m'''  ) (cross (transpose m'''))
 159     cross mat      = zipWith (-) (mkSum n mat) (mat)
 160
 161 -----------------------------------------------------------------------
 162 -----------------------------------------------------------------------
 163
 164 {-
 165 Metric Specificity and genericity: select terms
 166
 167 let N termes
 168 Ni      : occ de i
 169 Nij     : cooc i et j
 170 Probability to get i given j : P(i|j)=Nij/Nj
 171 Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 172 Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 173 Inclusion (i) = Gen(i)+Spec(i)
 174 Genericity score = Gen(i)- Spec(i)
 175
 176
 177 References:
 178  *  Science mapping with asymmetrical paradigmatic proximity Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276, arXiv:0803.2315 [cs.OH]
 179
 180 -}
 181
 182 type InclusionExclusion    = Double
 183 type SpecificityGenericity = Double
 184
 185 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 186 type SymetricMatrix    = Matrix
 187 type NonSymetricMatrix = Matrix
 188
 189
 190 incExcSpeGen :: Matrix Int -> (Vector InclusionExclusion, Vector SpecificityGenericity)
 191 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 192   where
 193     run' fun mat = run $ fun $ map fromIntegral $ use mat
 194
 195     -- | Inclusion (i) = Gen(i)+Spec(i)
 196     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 197     inclusionExclusion mat = zipWith (+) (pV mat) (pH mat)
 198 --
 199     -- | Genericity score = Gen(i)- Spec(i)
 200     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 201     specificityGenericity mat = zipWith (-) (pV mat) (pH mat)
 202
 203     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 204     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 205     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 206
 207     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 208     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 209     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 210
 211     cardN :: Exp Double
 212     cardN = constant (P.fromIntegral (dim m) :: Double)
 213
 214
 215 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 216 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 217 p_ij m = zipWith (/) m (n_jj m)
 218   where
 219     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 220     n_jj myMat' = backpermute (shape m)
 221                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 222                                    -> (Z :. j :. j)
 223                                 )
 224                          ) myMat'
 225
 226 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 227 -- to test
 228 p_ji :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 229 p_ji = transpose . p_ij
 230
 231
 232 -- | Step to ckeck the result in visual/qualitative tests
 233 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 234 incExcSpeGen_proba m = run' pro m
 235   where
 236     run' fun mat = run $ fun $ map fromIntegral $ use mat
 237
 238     pro mat = p_ji mat
 239
 240 {-
 241 -- | Hypothesis to test maybe later (or not)
 242 -- TODO ask accelerate for instances to ease such writtings:
 243 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 244 p_ m = zipWith (/) m (n_ m)
 245   where
 246     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 247     n_ m = backpermute (shape m)
 248                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 249                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 250                                 )
 251                          ) m
 252 -}
 253
 254