src/Gargantext/Core/Methods/Distances/Accelerate/Distributional.hs

   1 {-|
   2 Module      : Gargantext.Core.Methods.Distances.Accelerate.Distributional
   3 Description :
   4 Copyright   : (c) CNRS, 2017-Present
   5 License     : AGPL + CECILL v3
   6 Maintainer  : team@gargantext.org
   7 Stability   : experimental
   8 Portability : POSIX
   9
  10 This module aims at implementig distances of terms context by context is
  11 the same referential of corpus.
  12
  13 Implementation use Accelerate library which enables GPU and CPU computation
  14 See Gargantext.Core.Methods.Graph.Accelerate)
  15
  16 -}
  17
  18 {-# LANGUAGE TypeFamilies        #-}
  19 {-# LANGUAGE TypeOperators       #-}
  20 {-# LANGUAGE ScopedTypeVariables #-}
  21 {-# LANGUAGE ViewPatterns        #-}
  22
  23 module Gargantext.Core.Methods.Distances.Accelerate.Distributional
  24   where
  25
  26 -- import qualified Data.Foldable as P (foldl1)
  27 -- import Debug.Trace (trace)
  28 import Data.Array.Accelerate
  29 import Data.Array.Accelerate.Interpreter (run)
  30 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
  31 import qualified Gargantext.Prelude as P
  32
  33
  34 -- * Metrics of proximity
  35 -----------------------------------------------------------------------
  36 -- ** Distributional Distance
  37
  38 -- | Distributional Distance metric
  39 --
  40 -- Distributional metric is a relative metric which depends on the
  41 -- selected list, it represents structural equivalence of mutual information.
  42 --
  43 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
  44 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
  45 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
  46 --
  47 -- Mutual information
  48 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
  49 --
  50 -- Number of cooccurrences of @i@ and @j@ in the same context of text
  51 --                        \[C{ij}\]
  52 --
  53 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
  54 --            \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
  55 --
  56 -- Total cooccurrences of term @i@ given a map list of size @m@
  57 --            \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
  58 --
  59 -- Total cooccurrences of terms given a map list of size @m@
  60 --            \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
  61 --
  62 distributional :: Matrix Int -> Matrix Double
  63 distributional m = -- run {- $ matMiniMax -}
  64                    run  $ diagNull n
  65                        $ rIJ n
  66                        $ filterWith 0 100
  67                        $ filter' 0
  68                        $ s_mi
  69                        $ map fromIntegral
  70                           {- from Int to Double -}
  71                        $ use m
  72                           {- push matrix in Accelerate type -}
  73   where
  74
  75     _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
  76     _ri mat = mat1 -- zipWith (/) mat1 mat2
  77       where
  78         mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
  79         _mat2 = total mat
  80
  81     _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
  82     _myMin = replicate (constant (Z :. n :. All)) . minimum
  83
  84
  85     -- TODO fix NaN
  86     -- Quali TEST: OK
  87     s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
  88     s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
  89             $ zipWith (/) (crossProduct n m') (total m')
  90             -- crossProduct n m'
  91
  92
  93     total :: Acc (Matrix Double) -> Acc (Matrix Double)
  94     total = replicate (constant (Z :. n :. n)) . sum . sum
  95
  96     n :: Dim
  97     n = dim m
  98
  99 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
 100     => Dim -> Acc (Matrix a) -> Acc (Matrix a)
 101 rIJ n m = matMiniMax $ divide a b
 102   where
 103     a = sumRowMin n m
 104     b = sumColMin n m
 105
 106 -- * For Tests (to be removed)
 107 -- | Test perfermance with this matrix
 108 -- TODO : add this in a benchmark folder
 109 distriTest :: Int -> Matrix Double
 110 distriTest n = distributional (theMatrix n)
 111
 112