2 Module : Gargantext.Core.Methods.Distances.Matrice
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
13 Implementation use Accelerate library which enables GPU and CPU computation
14 See Gargantext.Core.Methods.Graph.Accelerate)
18 {-# LANGUAGE TypeFamilies #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE ScopedTypeVariables #-}
21 {-# LANGUAGE ViewPatterns #-}
23 module Gargantext.Core.Methods.Distances.Matrice
26 -- import qualified Data.Foldable as P (foldl1)
27 -- import Debug.Trace (trace)
28 import Data.Array.Accelerate
29 import Data.Array.Accelerate.Interpreter (run)
30 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
31 import qualified Gargantext.Prelude as P
34 -- * Metrics of proximity
35 -----------------------------------------------------------------------
36 -- ** Conditional distance
38 -- *** Conditional distance (basic)
40 -- | Conditional distance (basic version)
42 -- 2 main metrics are actually implemented in order to compute the
43 -- proximity of two terms: conditional and distributional
45 -- Conditional metric is an absolute metric which reflects
46 -- interactions of 2 terms in the corpus.
47 measureConditional :: Matrix Int -> Matrix Double
48 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
49 measureConditional m = run $ matProba (dim m)
54 -- *** Conditional distance (advanced)
56 -- | Conditional distance (advanced version)
58 -- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called
59 -- "confidence" , is the maximum probability between @i@ and @j@ to see
60 -- @i@ in the same context of @j@ knowing @j@.
62 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
63 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
65 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
66 conditional' :: Matrix Int -> (Matrix GenericityInclusion, Matrix SpecificityExclusion)
67 conditional' m = ( run $ ie $ map fromIntegral $ use m
68 , run $ sg $ map fromIntegral $ use m
71 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
72 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
73 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
74 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
82 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
83 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
84 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
85 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
87 -----------------------------------------------------------------------
88 -- ** Distributional Distance
90 -- | Distributional Distance metric
92 -- Distributional metric is a relative metric which depends on the
93 -- selected list, it represents structural equivalence of mutual information.
95 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
96 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
97 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
100 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
102 -- Number of cooccurrences of @i@ and @j@ in the same context of text
105 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
106 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
108 -- Total cooccurrences of term @i@ given a map list of size @m@
109 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
111 -- Total cooccurrences of terms given a map list of size @m@
112 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
114 distributional :: Matrix Int -> Matrix Double
115 distributional m = -- run {- $ matMiniMax -}
122 {- from Int to Double -}
124 {- push matrix in Accelerate type -}
127 _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
128 _ri mat = mat1 -- zipWith (/) mat1 mat2
130 mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
133 _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
134 _myMin = replicate (constant (Z :. n :. All)) . minimum
139 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
140 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
141 $ zipWith (/) (crossProduct n m') (total m')
145 total :: Acc (Matrix Double) -> Acc (Matrix Double)
146 total = replicate (constant (Z :. n :. n)) . sum . sum
151 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
152 => Dim -> Acc (Matrix a) -> Acc (Matrix a)
153 rIJ n m = matMiniMax $ divide a b
158 -----------------------------------------------------------------------
159 -----------------------------------------------------------------------
160 -- * Specificity and Genericity
162 {- | Metric Specificity and genericity: select terms
164 - let N termes and occurrences of i \[N{i}\]
166 - Cooccurrences of i and j \[N{ij}\]
167 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
169 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
170 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
172 - \[Inclusion (i) = Gen(i) + Spec(i)\)
173 - \[GenericityScore = Gen(i)- Spec(i)\]
175 - References: Science mapping with asymmetrical paradigmatic proximity
176 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
177 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
178 arXiv:0803.2315 [cs.OH]
180 type GenericityInclusion = Double
181 type SpecificityExclusion = Double
183 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
184 type SymetricMatrix = Matrix
185 type NonSymetricMatrix = Matrix
188 incExcSpeGen :: Matrix Int
189 -> ( Vector GenericityInclusion
190 , Vector SpecificityExclusion
192 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
194 run' fun mat = run $ fun $ map fromIntegral $ use mat
196 -- | Inclusion (i) = Gen(i)+Spec(i)
197 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
198 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
200 -- | Genericity score = Gen(i)- Spec(i)
201 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
202 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
204 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
205 pV :: Acc (Matrix Double) -> Acc (Vector Double)
206 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
208 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
209 pH :: Acc (Matrix Double) -> Acc (Vector Double)
210 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
213 cardN = constant (P.fromIntegral (dim m) :: Double)
216 -- | P(i|j) = Nij /N(jj) Probability to get i given j
217 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
218 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
219 p_ij m = zipWith (/) m (n_jj m)
221 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
222 n_jj myMat' = backpermute (shape m)
223 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
228 -- | P(j|i) = Nij /N(ii) Probability to get i given j
230 p_ji :: (Elt e, P.Fractional (Exp e))
231 => Acc (Array DIM2 e)
232 -> Acc (Array DIM2 e)
233 p_ji = transpose . p_ij
236 -- | Step to ckeck the result in visual/qualitative tests
237 incExcSpeGen_proba :: Matrix Int -> Matrix Double
238 incExcSpeGen_proba m = run' pro m
240 run' fun mat = run $ fun $ map fromIntegral $ use mat
245 -- | Hypothesis to test maybe later (or not)
246 -- TODO ask accelerate for instances to ease such writtings:
247 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
248 p_ m = zipWith (/) m (n_ m)
250 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
251 n_ m = backpermute (shape m)
252 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
253 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
258 -- * For Tests (to be removed)
259 -- | Test perfermance with this matrix
260 -- TODO : add this in a benchmark folder
261 distriTest :: Int -> Matrix Double
262 distriTest n = distributional (theMatrix n)
266 theResult :: Int -> Matrix Double
267 theResult n | (P.==) n 2 = let r = 1.6094379124341003 in [ 0, r, r, 0]
268 | P.otherwise = [ 1, 1 ]
273 => Int -> [e] -> Acc (Array ((Z :. Int) :. Int) e)
274 colMatrix n ns = replicate (constant (Z :. (n :: Int) :. All)) v
276 v = use $ vector (P.length ns) ns