src/Gargantext/Core/Methods/Distances/Matrice.hs

   1 {-|
   2 Module      : Gargantext.Core.Methods.Distances.Matrice
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13 Implementation use Accelerate library which enables GPU and CPU computation
  14 See Gargantext.Core.Methods.Graph.Accelerate)
  15
  16 -}
  17
  18 {-# LANGUAGE TypeFamilies        #-}
  19 {-# LANGUAGE TypeOperators       #-}
  20 {-# LANGUAGE ScopedTypeVariables #-}
  21 {-# LANGUAGE ViewPatterns        #-}
  22
  23 module Gargantext.Core.Methods.Distances.Matrice
  24   where
  25
  26 -- import qualified Data.Foldable as P (foldl1)
  27 -- import Debug.Trace (trace)
  28 import Data.Array.Accelerate
  29 import Data.Array.Accelerate.Interpreter (run)
  30 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
  31 import qualified Gargantext.Prelude as P
  32
  33
  34 -- * Metrics of proximity
  35 -----------------------------------------------------------------------
  36 -- ** Conditional distance
  37
  38 -- *** Conditional distance (basic)
  39
  40 -- | Conditional distance (basic version)
  41 --
  42 -- 2 main metrics are actually implemented in order to compute the
  43 -- proximity of two terms: conditional and distributional
  44 --
  45 -- Conditional metric is an absolute metric which reflects
  46 -- interactions of 2 terms in the corpus.
  47 measureConditional :: Matrix Int -> Matrix Double
  48 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
  49 measureConditional m = run $ matProba (dim m)
  50                            $ map fromIntegral
  51                            $ use m
  52
  53
  54 -- *** Conditional distance (advanced)
  55
  56 -- | Conditional distance (advanced version)
  57 --
  58 -- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called
  59 -- "confidence" , is the maximum probability between @i@ and @j@ to see
  60 -- @i@ in the same context of @j@ knowing @j@.
  61 --
  62 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
  63 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
  64 --
  65 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
  66 conditional' :: Matrix Int -> (Matrix GenericityInclusion, Matrix SpecificityExclusion)
  67 conditional' m = ( run $ ie $ map fromIntegral $ use m
  68                  , run $ sg $ map fromIntegral $ use m
  69                  )
  70   where
  71     ie :: Acc (Matrix Double) -> Acc (Matrix Double)
  72     ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
  73     sg :: Acc (Matrix Double) -> Acc (Matrix Double)
  74     sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
  75
  76     n :: Exp Double
  77     n = P.fromIntegral r
  78
  79     r :: Dim
  80     r = dim m
  81
  82     xs :: Acc (Matrix Double) -> Acc (Matrix Double)
  83     xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
  84     ys :: Acc (Matrix Double) -> Acc (Matrix Double)
  85     ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
  86
  87 -----------------------------------------------------------------------
  88 -- ** Distributional Distance
  89
  90 -- | Distributional Distance metric
  91 --
  92 -- Distributional metric is a relative metric which depends on the
  93 -- selected list, it represents structural equivalence of mutual information.
  94 --
  95 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
  96 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
  97 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
  98 --
  99 -- Mutual information
 100 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
 101 --
 102 -- Number of cooccurrences of @i@ and @j@ in the same context of text
 103 --                        \[C{ij}\]
 104 --
 105 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
 106 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
 107 --
 108 -- Total cooccurrences of term @i@ given a map list of size @m@
 109 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
 110 --
 111 -- Total cooccurrences of terms given a map list of size @m@
 112 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
 113 --
 114 distributional :: Matrix Int -> Matrix Double
 115 distributional m = -- run {- $ matMiniMax -}
 116                    run  $ diagNull n
 117                        $ rIJ n
 118                        $ filterWith 0 100
 119                        $ filter' 0
 120                        $ s_mi
 121                        $ map fromIntegral
 122                           {- from Int to Double -}
 123                        $ use m
 124                           {- push matrix in Accelerate type -}
 125   where
 126
 127     _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
 128     _ri mat = mat1 -- zipWith (/) mat1 mat2
 129       where
 130         mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
 131         _mat2 = total mat
 132
 133     _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
 134     _myMin = replicate (constant (Z :. n :. All)) . minimum
 135
 136
 137     -- TODO fix NaN
 138     -- Quali TEST: OK
 139     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
 140     s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
 141             $ zipWith (/) (crossProduct n m') (total m')
 142             -- crossProduct n m'
 143
 144
 145     total :: Acc (Matrix Double) -> Acc (Matrix Double)
 146     total = replicate (constant (Z :. n :. n)) . sum . sum
 147
 148     n :: Dim
 149     n = dim m
 150
 151 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
 152     => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 153 rIJ n m = matMiniMax $ divide a b
 154   where
 155     a = sumRowMin n m
 156     b = sumColMin n m
 157
 158 -----------------------------------------------------------------------
 159 -----------------------------------------------------------------------
 160 -- * Specificity and Genericity
 161
 162 {- | Metric Specificity and genericity: select terms
 163
 164 - let N termes and occurrences of i \[N{i}\]
 165
 166 - Cooccurrences of i and j \[N{ij}\]
 167 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
 168
 169 - Genericity of i  \[Gen(i)  = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
 170 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
 171
 172 - \[Inclusion (i) = Gen(i) + Spec(i)\)
 173 - \[GenericityScore = Gen(i)- Spec(i)\]
 174
 175 - References: Science mapping with asymmetrical paradigmatic proximity
 176 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
 177 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
 178 arXiv:0803.2315 [cs.OH]
 179 -}
 180 type GenericityInclusion  = Double
 181 type SpecificityExclusion = Double
 182
 183 data SquareMatrix      = SymetricMatrix | NonSymetricMatrix
 184 type SymetricMatrix    = Matrix
 185 type NonSymetricMatrix = Matrix
 186
 187
 188 incExcSpeGen :: Matrix Int
 189              -> ( Vector GenericityInclusion
 190                 , Vector SpecificityExclusion
 191                 )
 192 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
 193   where
 194     run' fun mat = run $ fun $ map fromIntegral $ use mat
 195
 196     -- | Inclusion (i) = Gen(i)+Spec(i)
 197     inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
 198     inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
 199
 200     -- | Genericity score = Gen(i)- Spec(i)
 201     specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
 202     specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
 203
 204     -- | Gen(i)  : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity  of i
 205     pV :: Acc (Matrix Double) -> Acc (Vector Double)
 206     pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
 207
 208     -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
 209     pH :: Acc (Matrix Double) -> Acc (Vector Double)
 210     pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
 211
 212     cardN :: Exp Double
 213     cardN = constant (P.fromIntegral (dim m) :: Double)
 214
 215
 216 -- | P(i|j) = Nij /N(jj) Probability to get i given j
 217 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
 218 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
 219 p_ij m = zipWith (/) m (n_jj m)
 220   where
 221     n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 222     n_jj myMat' = backpermute (shape m)
 223                          (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
 224                                    -> (Z :. j :. j)
 225                                 )
 226                          ) myMat'
 227
 228 -- | P(j|i) = Nij /N(ii) Probability to get i given j
 229 -- to test
 230 p_ji :: (Elt e, P.Fractional (Exp e))
 231      => Acc (Array DIM2 e)
 232      -> Acc (Array DIM2 e)
 233 p_ji = transpose . p_ij
 234
 235
 236 -- | Step to ckeck the result in visual/qualitative tests
 237 incExcSpeGen_proba :: Matrix Int -> Matrix Double
 238 incExcSpeGen_proba m = run' pro m
 239   where
 240     run' fun mat = run $ fun $ map fromIntegral $ use mat
 241
 242     pro mat = p_ji mat
 243
 244 {-
 245 -- | Hypothesis to test maybe later (or not)
 246 -- TODO ask accelerate for instances to ease such writtings:
 247 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
 248 p_ m = zipWith (/) m (n_ m)
 249   where
 250     n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
 251     n_ m = backpermute (shape m)
 252                          (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
 253                                    -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
 254                                 )
 255                          ) m
 256 -}
 257
 258 -- * For Tests (to be removed)
 259 -- | Test perfermance with this matrix
 260 -- TODO : add this in a benchmark folder
 261 distriTest :: Int -> Matrix Double
 262 distriTest n = distributional (theMatrix n)
 263
 264
 265 {-
 266 theResult :: Int -> Matrix Double
 267 theResult n | (P.==) n 2 = let r = 1.6094379124341003 in [ 0, r, r, 0]
 268           | P.otherwise = [ 1, 1 ]
 269 -}
 270
 271
 272 colMatrix :: Elt e
 273           => Int -> [e] -> Acc (Array ((Z :. Int) :. Int) e)
 274 colMatrix n ns = replicate (constant (Z :. (n :: Int) :. All)) v
 275   where
 276     v = use $ vector (P.length ns) ns
 277