]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/Metrics.hs
add Wos parser
[gargantext.git] / src / Gargantext / Viz / Phylo / Metrics.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.Metrics
18 where
19
20 import Gargantext.Prelude
21 import Gargantext.Viz.Phylo
22 import Gargantext.Viz.Phylo.Tools
23
24 import Control.Lens hiding (Level)
25
26 import Data.List ((\\), sortOn, concat, nub, take, union, intersect, null, (++), sort)
27 import Data.Map (Map, (!), toList, size, insert, unionWith, intersection, intersectionWith, filterWithKey, elems, fromList, findWithDefault, fromListWith)
28 import Data.Text (Text)
29
30 -- import Debug.Trace (trace)
31
32 ----------------
33 -- | Ngrams | --
34 ----------------
35
36
37 -- | Return the conditional probability of i knowing j
38 conditional :: Ord a => Map (a,a) Double -> a -> a -> Double
39 conditional m i j = (findWithDefault 0 (i,j) m)
40 / (m ! (j,j))
41
42
43 -- | Return the genericity score of a given ngram
44 genericity :: Map (Int, Int) Double -> [Int] -> Int -> Double
45 genericity m l i = ( (sum $ map (\j -> conditional m i j) l)
46 - (sum $ map (\j -> conditional m j i) l)) / (fromIntegral $ (length l) + 1)
47
48
49 -- | Return the specificity score of a given ngram
50 specificity :: Map (Int, Int) Double -> [Int] -> Int -> Double
51 specificity m l i = ( (sum $ map (\j -> conditional m j i) l)
52 - (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
53
54
55 -- | Return the inclusion score of a given ngram
56 inclusion :: Map (Int, Int) Double -> [Int] -> Int -> Double
57 inclusion m l i = ( (sum $ map (\j -> conditional m j i) l)
58 + (sum $ map (\j -> conditional m i j) l)) / (fromIntegral $ (length l) + 1)
59
60
61 -- | Process some metrics on top of ngrams
62 getNgramsMeta :: Map (Int, Int) Double -> [Int] -> Map Text [Double]
63 getNgramsMeta m ngrams = fromList
64 [ ("genericity" , map (\n -> genericity m (ngrams \\ [n]) n) ngrams ),
65 ("specificity", map (\n -> specificity m (ngrams \\ [n]) n) ngrams ),
66 ("inclusion" , map (\n -> inclusion m (ngrams \\ [n]) n) ngrams )]
67
68
69 -- | To get the nth most occurent elems in a coocurency matrix
70 getNthMostOcc :: Int -> Map (Int,Int) Double -> [Int]
71 getNthMostOcc nth cooc = (nub . concat)
72 $ map (\((idx,idx'),_) -> [idx,idx'])
73 $ take nth
74 $ reverse
75 $ sortOn snd $ toList cooc
76
77
78 -------------------------
79 -- | Ngrams Dynamics | --
80 -------------------------
81
82 sharedWithParents :: Date -> PhyloBranchId -> Int -> PhyloView -> Bool
83 sharedWithParents inf bid n pv = elem n
84 $ foldl (\mem pn -> if ((bid == (fromJust $ (pn ^. pn_bid)))
85 && (inf > (fst $ getNodePeriod pn)))
86 then nub $ mem ++ (pn ^. pn_idx)
87 else mem ) []
88 $ (pv ^. pv_nodes)
89
90
91 findDynamics :: Int -> PhyloView -> PhyloNode -> Map Int (Date,Date) -> Double
92 findDynamics n pv pn m =
93 let prd = getNodePeriod pn
94 bid = fromJust $ (pn ^. pn_bid)
95 end = last' "dynamics" (sort $ map snd $ elems m)
96 in if (((snd prd) == (snd $ m ! n)) && (snd prd /= end))
97 -- | emergence
98 then 2
99 else if ((fst prd) == (fst $ m ! n))
100 -- | recombination
101 then 0
102 else if (not $ sharedWithParents (fst prd) bid n pv)
103 -- | decrease
104 then 1
105 else 3
106
107
108
109 processDynamics :: PhyloView -> PhyloView
110 processDynamics pv = alterPhyloNode (\pn ->
111 pn & pn_metrics %~ insert "dynamics" (map (\n -> findDynamics n pv pn ngramsDates) $ (pn ^. pn_idx) ) ) pv
112 where
113 --------------------------------------
114 ngramsDates :: Map Int (Date,Date)
115 ngramsDates = map (\ds -> let ds' = sort ds
116 in (head' "Dynamics" ds', last' "Dynamics" ds'))
117 $ fromListWith (++)
118 $ foldl (\mem pn -> mem ++ (map (\n -> (n, [fst $ getNodePeriod pn, snd $ getNodePeriod pn]))
119 $ (pn ^. pn_idx))) []
120 $ (pv ^. pv_nodes)
121 --------------------------------------
122
123
124
125 -------------------
126 -- | Proximity | --
127 -------------------
128
129
130 -- | Process the inverse sumLog
131 sumInvLog :: Double -> [Double] -> Double
132 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
133
134
135 -- | Process the sumLog
136 sumLog :: Double -> [Double] -> Double
137 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
138
139
140 -- | To compute a jaccard similarity between two lists
141 jaccard :: [Int] -> [Int] -> Double
142 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
143
144
145 -- | To get the diagonal of a matrix
146 toDiago :: Map (Int, Int) Double -> [Double]
147 toDiago cooc = elems $ filterWithKey (\(x,x') _ -> x == x') cooc
148
149
150 -- | To process WeighedLogJaccard distance between to coocurency matrix
151 weightedLogJaccard :: Double -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
152 weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
153 | null gInter = 0
154 | gInter == gUnion = 1
155 | sens == 0 = jaccard gInter gUnion
156 | sens > 0 = (sumInvLog sens wInter) / (sumInvLog sens wUnion)
157 | otherwise = (sumLog sens wInter) / (sumLog sens wUnion)
158 where
159 --------------------------------------
160 gInter :: [Int]
161 gInter = intersect ngrams ngrams'
162 --------------------------------------
163 gUnion :: [Int]
164 gUnion = union ngrams ngrams'
165 --------------------------------------
166 wInter :: [Double]
167 wInter = toDiago $ map (/nbDocs) $ intersectionWith (+) cooc cooc'
168 --------------------------------------
169 wUnion :: [Double]
170 wUnion = toDiago $ map (/nbDocs) $ unionWith (+) cooc cooc'
171 --------------------------------------
172
173
174 -- | To process the Hamming distance between two PhyloGroup fields
175 hamming :: Map (Int, Int) Double -> Map (Int, Int) Double -> Double
176 hamming f1 f2 = fromIntegral $ max ((size inter) - (size f1)) ((size inter) - (size f2))
177 where
178 --------------------------------------
179 inter :: Map (Int, Int) Double
180 inter = intersection f1 f2
181 --------------------------------------
182
183
184
185
186
187