]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Methods/Distances/Accelerate/Distributional.hs
[FEAT] Distributional, work with David
[gargantext.git] / src / Gargantext / Core / Methods / Distances / Accelerate / Distributional.hs
1 {-|
2 Module : Gargantext.Core.Methods.Distances.Accelerate.Distributional
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 * Distributional Distance metric
12 __Definition :__ Distributional metric is a relative metric which depends on the
13 selected list, it represents structural equivalence of mutual information.
14
15 __Objective :__ We want to compute with matrices processing the similarity between term $i$ and term $j$ :
16 distr(i,j)=$\frac{\Sigma_{k \neq i,j} min(\frac{n_{ik}^2}{n_{ii}n_{kk}},\frac{n_{jk}^2}{n_{jj}n_{kk}})}{\Sigma_{k \neq i}\frac{n_{ik}^2}{ n_{ii}n_{kk}}}$
17
18 where $n_{ij}$ is the cooccurrence between term $i$ and term $j$
19
20 * For a vector V=[$x_1$ ... $x_n$], we note $|V|_1=\Sigma_ix_i$
21 * operator : .* and ./ cell by cell multiplication and division of the matrix
22 * operator * is the matrix multiplication
23 * Matrice M=[$n_{ij}$]$_{i,j}$
24 * opérateur : Diag(M)=[$n_{ii}$]$_i$ (vecteur)
25 * Id= identity matrix
26 * O=[1]$_{i,j}$ (matrice one)
27 * D(M)=Id .* M
28 * O * D(M) =[$n_{jj}$]$_{i,j}$
29 * D(M) * O =[$n_{ii}$]$_{i,j}$
30 * $V_i=[0~0~0~1~0~0~0]'$ en i
31 * MI=(M ./ O * D(M)) .* (M / D(M) * O )
32 * distr(i,j)=$\frac{|min(V'_i * (MI-D(MI)),V'_j * (MI-D(MI)))|_1}{|V'_i.(MI-D(MI))|_1}$
33
34 [Specifications written by David Chavalarias on Garg v4 shared NodeWrite, team Pyremiel 2020]
35
36 -}
37
38 {-# LANGUAGE TypeFamilies #-}
39 {-# LANGUAGE TypeOperators #-}
40 {-# LANGUAGE ScopedTypeVariables #-}
41 {-# LANGUAGE ViewPatterns #-}
42
43 module Gargantext.Core.Methods.Distances.Accelerate.Distributional
44 where
45
46 -- import qualified Data.Foldable as P (foldl1)
47 -- import Debug.Trace (trace)
48 import Data.Array.Accelerate
49 import Data.Array.Accelerate.Interpreter (run)
50 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
51 import qualified Gargantext.Prelude as P
52 -- import Data.Array.Accelerate.LinearAlgebra (identity) TODO
53 -----------------------------------------------------------------------
54
55 -- * Distributional Distance
56 distributional :: Matrix Int -> Matrix Double
57 distributional m' = run z
58 where
59 m = map fromIntegral $ use m'
60 n = dim m'
61
62 d_m = (.*) (matrixIdentity n) m
63
64 o_d_m = (#*#) (matrixOne n) d_m
65 d_m_o = transpose o_d_m
66
67 mi = (.*) ((./) m o_d_m) ((./) m d_m_o)
68 d_mi = (.*) (matrixIdentity n) mi
69
70 w = (.-) mi d_mi
71
72 z = (#*#) w (matrixOne n)
73 z' = transpose z
74
75 min_z_z' = zipWith min z z'
76
77 result = (./) min_z_z' z
78
79
80
81
82 --
83 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
84 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
85 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
86 --
87 -- Mutual information
88 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
89 --
90 -- Number of cooccurrences of @i@ and @j@ in the same context of text
91 -- \[C{ij}\]
92 --
93 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
94 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
95 --
96 -- Total cooccurrences of term @i@ given a map list of size @m@
97 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
98 --
99 -- Total cooccurrences of terms given a map list of size @m@
100 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
101 --
102
103 distributional'' :: Matrix Int -> Matrix Double
104 distributional'' m = -- run {- $ matMiniMax -}
105 run $ diagNull n
106 $ rIJ n
107 $ filterWith 0 100
108 $ filter' 0
109 $ s_mi
110 $ map fromIntegral
111 {- from Int to Double -}
112 $ use m
113 {- push matrix in Accelerate type -}
114 where
115
116 _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
117 _ri mat = mat1 -- zipWith (/) mat1 mat2
118 where
119 mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
120 _mat2 = total mat
121
122 _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
123 _myMin = replicate (constant (Z :. n :. All)) . minimum
124
125
126 -- TODO fix NaN
127 -- Quali TEST: OK
128 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
129 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
130 $ zipWith (/) (crossProduct n m') (total m')
131 -- crossProduct n m'
132
133
134 total :: Acc (Matrix Double) -> Acc (Matrix Double)
135 total = replicate (constant (Z :. n :. n)) . sum . sum
136
137 n :: Dim
138 n = dim m
139
140 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
141 => Dim -> Acc (Matrix a) -> Acc (Matrix a)
142 rIJ n m = matMiniMax $ divide a b
143 where
144 a = sumRowMin n m
145 b = sumColMin n m
146
147 -- * For Tests (to be removed)
148 -- | Test perfermance with this matrix
149 -- TODO : add this in a benchmark folder
150 distriTest :: Int -> Matrix Double
151 distriTest n = distributional (theMatrixInt n)
152
153