]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/Metrics.hs
Merge branch 'dev-phylo' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantex...
[gargantext.git] / src / Gargantext / Viz / Phylo / Metrics.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.Metrics
18 where
19
20 import Gargantext.Prelude
21
22 import Data.List ((\\), sortOn, concat, nub, take, union, intersect, null)
23 import Data.Map (Map, foldlWithKey, toList, size, unionWith, intersection, intersectionWith, filterWithKey, elems, fromList, findWithDefault)
24 import Data.Text (Text)
25
26 -- import Debug.Trace (trace)
27
28 ----------------
29 -- | Ngrams | --
30 ----------------
31
32
33 -- | Return the conditional probability of i knowing j
34 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
35 conditional m i j = (findWithDefault 0 (i,j) m)
36 / foldlWithKey (\s (x,_) v -> if x == j
37 then s + v
38 else s ) 0 m
39
40
41 -- | Return the genericity score of a given ngram
42 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
43 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
44 - (sum $ map (\j -> conditional m j i) l)) / 2
45
46
47 -- | Return the specificity score of a given ngram
48 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
49 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
50 - (sum $ map (\j -> conditional m i j) l)) / 2
51
52
53 -- | Return the coverage score of a given ngram
54 coverage :: Map (Int, Int) Double -> [Int] -> Int -> Double
55 coverage m l i = ( (sum $ map (\j -> conditional m j i) l)
56 + (sum $ map (\j -> conditional m i j) l)) / 2
57
58
59 -- | Process some metrics on top of ngrams
60 getNgramsMeta :: Map (Int, Int) Double -> [Int] -> Map Text [Double]
61 getNgramsMeta m ngrams = fromList
62 [ ("genericity" , map (\n -> genericity m (ngrams \\ [n]) n) ngrams ),
63 ("specificity", map (\n -> specificity m (ngrams \\ [n]) n) ngrams ),
64 ("coverage" , map (\n -> coverage m (ngrams \\ [n]) n) ngrams )]
65
66
67 -- | To get the nth most occurent elems in a coocurency matrix
68 getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
69 getNthMostOcc nth cooc = (nub . concat)
70 $ map (\((idx,idx'),_) -> [idx,idx'])
71 $ take nth
72 $ reverse
73 $ sortOn snd $ toList cooc
74
75
76 -------------------------
77 -- | Ngrams Dynamics | --
78 -------------------------
79
80
81 -------------------
82 -- | Proximity | --
83 -------------------
84
85
86 -- | Process the inverse sumLog
87 sumInvLog :: Double -> [Double] -> Double
88 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
89
90
91 -- | Process the sumLog
92 sumLog :: Double -> [Double] -> Double
93 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
94
95
96 -- | To compute a jaccard similarity between two lists
97 jaccard :: [Int] -> [Int] -> Double
98 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
99
100
101 -- | To get the diagonal of a matrix
102 toDiago :: Map (Int, Int) Double -> [Double]
103 toDiago cooc = elems $ filterWithKey (\(x,x') _ -> x == x') cooc
104
105
106 -- | To process WeighedLogJaccard distance between to coocurency matrix
107 weightedLogJaccard :: Double -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
108 weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
109 | null gInter = 0
110 | gInter == gUnion = 1
111 | sens == 0 = jaccard gInter gUnion
112 | sens > 0 = (sumInvLog sens wInter) / (sumInvLog sens wUnion)
113 | otherwise = (sumLog sens wInter) / (sumLog sens wUnion)
114 where
115 --------------------------------------
116 gInter :: [Int]
117 gInter = intersect ngrams ngrams'
118 --------------------------------------
119 gUnion :: [Int]
120 gUnion = union ngrams ngrams'
121 --------------------------------------
122 wInter :: [Double]
123 wInter = toDiago $ map (/nbDocs) $ intersectionWith (+) cooc cooc'
124 --------------------------------------
125 wUnion :: [Double]
126 wUnion = toDiago $ map (/nbDocs) $ unionWith (+) cooc cooc'
127 --------------------------------------
128
129
130 -- | To process the Hamming distance between two PhyloGroup fields
131 hamming :: Map (Int, Int) Double -> Map (Int, Int) Double -> Double
132 hamming f1 f2 = fromIntegral $ max ((size inter) - (size f1)) ((size inter) - (size f2))
133 where
134 --------------------------------------
135 inter :: Map (Int, Int) Double
136 inter = intersection f1 f2
137 --------------------------------------
138
139
140
141
142
143