]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Methods/Distances/Matrice.hs
[Clean] Core.Methods.Matrix.Accelerate.Utils created
[gargantext.git] / src / Gargantext / Core / Methods / Distances / Matrice.hs
1 {-|
2 Module : Gargantext.Core.Methods.Distances.Matrice
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
12
13 Implementation use Accelerate library which enables GPU and CPU computation
14 See Gargantext.Core.Methods.Graph.Accelerate)
15
16 -}
17
18 {-# LANGUAGE TypeFamilies #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE ScopedTypeVariables #-}
21 {-# LANGUAGE ViewPatterns #-}
22
23 module Gargantext.Core.Methods.Distances.Matrice
24 where
25
26 -- import qualified Data.Foldable as P (foldl1)
27 -- import Debug.Trace (trace)
28 import Data.Array.Accelerate
29 import Data.Array.Accelerate.Interpreter (run)
30 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
31 import qualified Gargantext.Prelude as P
32
33
34 -- * Metrics of proximity
35 -----------------------------------------------------------------------
36 -- ** Conditional distance
37
38 -- *** Conditional distance (basic)
39
40 -- | Conditional distance (basic version)
41 --
42 -- 2 main metrics are actually implemented in order to compute the
43 -- proximity of two terms: conditional and distributional
44 --
45 -- Conditional metric is an absolute metric which reflects
46 -- interactions of 2 terms in the corpus.
47 measureConditional :: Matrix Int -> Matrix Double
48 --measureConditional m = run (matMiniMax $ matProba (dim m) $ map fromIntegral $ use m)
49 measureConditional m = run $ matProba (dim m)
50 $ map fromIntegral
51 $ use m
52
53
54 -- *** Conditional distance (advanced)
55
56 -- | Conditional distance (advanced version)
57 --
58 -- The conditional metric P(i|j) of 2 terms @i@ and @j@, also called
59 -- "confidence" , is the maximum probability between @i@ and @j@ to see
60 -- @i@ in the same context of @j@ knowing @j@.
61 --
62 -- If N(i) (resp. N(j)) is the number of occurrences of @i@ (resp. @j@)
63 -- in the corpus and _[n_{ij}\] the number of its occurrences we get:
64 --
65 -- \[P_c=max(\frac{n_i}{n_{ij}},\frac{n_j}{n_{ij}} )\]
66 conditional' :: Matrix Int -> (Matrix GenericityInclusion, Matrix SpecificityExclusion)
67 conditional' m = ( run $ ie $ map fromIntegral $ use m
68 , run $ sg $ map fromIntegral $ use m
69 )
70 where
71 ie :: Acc (Matrix Double) -> Acc (Matrix Double)
72 ie mat = map (\x -> x / (2*n-1)) $ zipWith (+) (xs mat) (ys mat)
73 sg :: Acc (Matrix Double) -> Acc (Matrix Double)
74 sg mat = map (\x -> x / (2*n-1)) $ zipWith (-) (xs mat) (ys mat)
75
76 n :: Exp Double
77 n = P.fromIntegral r
78
79 r :: Dim
80 r = dim m
81
82 xs :: Acc (Matrix Double) -> Acc (Matrix Double)
83 xs mat = zipWith (-) (matSumCol r $ matProba r mat) (matProba r mat)
84 ys :: Acc (Matrix Double) -> Acc (Matrix Double)
85 ys mat = zipWith (-) (matSumCol r $ transpose $ matProba r mat) (matProba r mat)
86
87 -----------------------------------------------------------------------
88 -- ** Distributional Distance
89
90 -- | Distributional Distance metric
91 --
92 -- Distributional metric is a relative metric which depends on the
93 -- selected list, it represents structural equivalence of mutual information.
94 --
95 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
96 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
97 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
98 --
99 -- Mutual information
100 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
101 --
102 -- Number of cooccurrences of @i@ and @j@ in the same context of text
103 -- \[C{ij}\]
104 --
105 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
106 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
107 --
108 -- Total cooccurrences of term @i@ given a map list of size @m@
109 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
110 --
111 -- Total cooccurrences of terms given a map list of size @m@
112 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
113 --
114 distributional :: Matrix Int -> Matrix Double
115 distributional m = -- run {- $ matMiniMax -}
116 run $ diagNull n
117 $ rIJ n
118 $ filterWith 0 100
119 $ filter' 0
120 $ s_mi
121 $ map fromIntegral
122 {- from Int to Double -}
123 $ use m
124 {- push matrix in Accelerate type -}
125 where
126
127 _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
128 _ri mat = mat1 -- zipWith (/) mat1 mat2
129 where
130 mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
131 _mat2 = total mat
132
133 _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
134 _myMin = replicate (constant (Z :. n :. All)) . minimum
135
136
137 -- TODO fix NaN
138 -- Quali TEST: OK
139 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
140 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
141 $ zipWith (/) (crossProduct n m') (total m')
142 -- crossProduct n m'
143
144
145 total :: Acc (Matrix Double) -> Acc (Matrix Double)
146 total = replicate (constant (Z :. n :. n)) . sum . sum
147
148 n :: Dim
149 n = dim m
150
151 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
152 => Dim -> Acc (Matrix a) -> Acc (Matrix a)
153 rIJ n m = matMiniMax $ divide a b
154 where
155 a = sumRowMin n m
156 b = sumColMin n m
157
158 -----------------------------------------------------------------------
159 -----------------------------------------------------------------------
160 -- * Specificity and Genericity
161
162 {- | Metric Specificity and genericity: select terms
163
164 - let N termes and occurrences of i \[N{i}\]
165
166 - Cooccurrences of i and j \[N{ij}\]
167 - Probability to get i given j : \[P(i|j)=N{ij}/N{j}\]
168
169 - Genericity of i \[Gen(i) = \frac{\sum_{j \neq i,j} P(i|j)}{N-1}\]
170 - Specificity of j \[Spec(i) = \frac{\sum_{j \neq i,j} P(j|i)}{N-1}\]
171
172 - \[Inclusion (i) = Gen(i) + Spec(i)\)
173 - \[GenericityScore = Gen(i)- Spec(i)\]
174
175 - References: Science mapping with asymmetrical paradigmatic proximity
176 Jean-Philippe Cointet (CREA, TSV), David Chavalarias (CREA) (Submitted
177 on 15 Mar 2008), Networks and Heterogeneous Media 3, 2 (2008) 267 - 276,
178 arXiv:0803.2315 [cs.OH]
179 -}
180 type GenericityInclusion = Double
181 type SpecificityExclusion = Double
182
183 data SquareMatrix = SymetricMatrix | NonSymetricMatrix
184 type SymetricMatrix = Matrix
185 type NonSymetricMatrix = Matrix
186
187
188 incExcSpeGen :: Matrix Int
189 -> ( Vector GenericityInclusion
190 , Vector SpecificityExclusion
191 )
192 incExcSpeGen m = (run' inclusionExclusion m, run' specificityGenericity m)
193 where
194 run' fun mat = run $ fun $ map fromIntegral $ use mat
195
196 -- | Inclusion (i) = Gen(i)+Spec(i)
197 inclusionExclusion :: Acc (Matrix Double) -> Acc (Vector Double)
198 inclusionExclusion mat = zipWith (+) (pV mat) (pV mat)
199
200 -- | Genericity score = Gen(i)- Spec(i)
201 specificityGenericity :: Acc (Matrix Double) -> Acc (Vector Double)
202 specificityGenericity mat = zipWith (+) (pH mat) (pH mat)
203
204 -- | Gen(i) : 1/(N-1)*Sum(j!=i, P(i|j)) : Genericity of i
205 pV :: Acc (Matrix Double) -> Acc (Vector Double)
206 pV mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ij mat
207
208 -- | Spec(i) : 1/(N-1)*Sum(j!=i, P(j|i)) : Specificity of j
209 pH :: Acc (Matrix Double) -> Acc (Vector Double)
210 pH mat = map (\x -> (x-1)/(cardN-1)) $ sum $ p_ji mat
211
212 cardN :: Exp Double
213 cardN = constant (P.fromIntegral (dim m) :: Double)
214
215
216 -- | P(i|j) = Nij /N(jj) Probability to get i given j
217 --p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (SymetricMatrix e) -> Acc (Matrix e)
218 p_ij :: (Elt e, P.Fractional (Exp e)) => Acc (Matrix e) -> Acc (Matrix e)
219 p_ij m = zipWith (/) m (n_jj m)
220 where
221 n_jj :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
222 n_jj myMat' = backpermute (shape m)
223 (lift1 ( \(Z :. (_ :: Exp Int) :. (j:: Exp Int))
224 -> (Z :. j :. j)
225 )
226 ) myMat'
227
228 -- | P(j|i) = Nij /N(ii) Probability to get i given j
229 -- to test
230 p_ji :: (Elt e, P.Fractional (Exp e))
231 => Acc (Array DIM2 e)
232 -> Acc (Array DIM2 e)
233 p_ji = transpose . p_ij
234
235
236 -- | Step to ckeck the result in visual/qualitative tests
237 incExcSpeGen_proba :: Matrix Int -> Matrix Double
238 incExcSpeGen_proba m = run' pro m
239 where
240 run' fun mat = run $ fun $ map fromIntegral $ use mat
241
242 pro mat = p_ji mat
243
244 {-
245 -- | Hypothesis to test maybe later (or not)
246 -- TODO ask accelerate for instances to ease such writtings:
247 p_ :: (Elt e, P.Fractional (Exp e)) => Acc (Array DIM2 e) -> Acc (Array DIM2 e)
248 p_ m = zipWith (/) m (n_ m)
249 where
250 n_ :: Elt e => Acc (SymetricMatrix e) -> Acc (Matrix e)
251 n_ m = backpermute (shape m)
252 (lift1 ( \(Z :. (i :: Exp Int) :. (j:: Exp Int))
253 -> (ifThenElse (i < j) (lift (Z :. j :. j)) (lift (Z :. i :. i)) :: Exp DIM2)
254 )
255 ) m
256 -}
257
258 -- * For Tests (to be removed)
259 -- | Test perfermance with this matrix
260 -- TODO : add this in a benchmark folder
261 distriTest :: Int -> Matrix Double
262 distriTest n = distributional (theMatrix n)
263
264
265 {-
266 theResult :: Int -> Matrix Double
267 theResult n | (P.==) n 2 = let r = 1.6094379124341003 in [ 0, r, r, 0]
268 | P.otherwise = [ 1, 1 ]
269 -}
270
271
272 colMatrix :: Elt e
273 => Int -> [e] -> Acc (Array ((Z :. Int) :. Int) e)
274 colMatrix n ns = replicate (constant (Z :. (n :: Int) :. All)) v
275 where
276 v = use $ vector (P.length ns) ns
277