]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Methods/Distances/Accelerate/Distributional.hs
[FIX] memory leak, useable ngrams table version (WIP)
[gargantext.git] / src / Gargantext / Core / Methods / Distances / Accelerate / Distributional.hs
1 {-|
2 Module : Gargantext.Core.Methods.Distances.Accelerate.Distributional
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 This module aims at implementig distances of terms context by context is
11 the same referential of corpus.
12
13 Implementation use Accelerate library which enables GPU and CPU computation
14 See Gargantext.Core.Methods.Graph.Accelerate)
15
16 -}
17
18 {-# LANGUAGE TypeFamilies #-}
19 {-# LANGUAGE TypeOperators #-}
20 {-# LANGUAGE ScopedTypeVariables #-}
21 {-# LANGUAGE ViewPatterns #-}
22
23 module Gargantext.Core.Methods.Distances.Accelerate.Distributional
24 where
25
26 -- import qualified Data.Foldable as P (foldl1)
27 -- import Debug.Trace (trace)
28 import Data.Array.Accelerate
29 import Data.Array.Accelerate.Interpreter (run)
30 import Gargantext.Core.Methods.Matrix.Accelerate.Utils
31 import qualified Gargantext.Prelude as P
32
33
34 -- * Metrics of proximity
35 -----------------------------------------------------------------------
36 -- ** Distributional Distance
37
38 -- | Distributional Distance metric
39 --
40 -- Distributional metric is a relative metric which depends on the
41 -- selected list, it represents structural equivalence of mutual information.
42 --
43 -- The distributional metric P(c) of @i@ and @j@ terms is: \[
44 -- S_{MI} = \frac {\sum_{k \neq i,j ; MI_{ik} >0}^{} \min(MI_{ik},
45 -- MI_{jk})}{\sum_{k \neq i,j ; MI_{ik}>0}^{}} \]
46 --
47 -- Mutual information
48 -- \[S_{MI}({i},{j}) = \log(\frac{C{ij}}{E{ij}})\]
49 --
50 -- Number of cooccurrences of @i@ and @j@ in the same context of text
51 -- \[C{ij}\]
52 --
53 -- The expected value of the cooccurrences @i@ and @j@ (given a map list of size @n@)
54 -- \[E_{ij}^{m} = \frac {S_{i} S_{j}} {N_{m}}\]
55 --
56 -- Total cooccurrences of term @i@ given a map list of size @m@
57 -- \[S_{i} = \sum_{j, j \neq i}^{m} S_{ij}\]
58 --
59 -- Total cooccurrences of terms given a map list of size @m@
60 -- \[N_{m} = \sum_{i,i \neq i}^{m} \sum_{j, j \neq j}^{m} S_{ij}\]
61 --
62 distributional :: Matrix Int -> Matrix Double
63 distributional m = -- run {- $ matMiniMax -}
64 run $ diagNull n
65 $ rIJ n
66 $ filterWith 0 100
67 $ filter' 0
68 $ s_mi
69 $ map fromIntegral
70 {- from Int to Double -}
71 $ use m
72 {- push matrix in Accelerate type -}
73 where
74
75 _ri :: Acc (Matrix Double) -> Acc (Matrix Double)
76 _ri mat = mat1 -- zipWith (/) mat1 mat2
77 where
78 mat1 = matSumCol n $ zipWith min (_myMin mat) (_myMin $ filterWith 0 100 $ diagNull n $ transpose mat)
79 _mat2 = total mat
80
81 _myMin :: Acc (Matrix Double) -> Acc (Matrix Double)
82 _myMin = replicate (constant (Z :. n :. All)) . minimum
83
84
85 -- TODO fix NaN
86 -- Quali TEST: OK
87 s_mi :: Acc (Matrix Double) -> Acc (Matrix Double)
88 s_mi m' = zipWith (\x y -> log (x / y)) (diagNull n m')
89 $ zipWith (/) (crossProduct n m') (total m')
90 -- crossProduct n m'
91
92
93 total :: Acc (Matrix Double) -> Acc (Matrix Double)
94 total = replicate (constant (Z :. n :. n)) . sum . sum
95
96 n :: Dim
97 n = dim m
98
99 rIJ :: (Elt a, Ord a, P.Fractional (Exp a), P.Num a)
100 => Dim -> Acc (Matrix a) -> Acc (Matrix a)
101 rIJ n m = matMiniMax $ divide a b
102 where
103 a = sumRowMin n m
104 b = sumColMin n m
105
106 -- * For Tests (to be removed)
107 -- | Test perfermance with this matrix
108 -- TODO : add this in a benchmark folder
109 distriTest :: Int -> Matrix Double
110 distriTest n = distributional (theMatrix n)
111
112