]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
working on perf
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, elemIndex, (!!), dropWhile)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, filterWithKey)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28
29 import qualified Data.Set as Set
30
31
32 -------------------
33 -- | Proximity | --
34 -------------------
35
36
37 -- | Process the inverse sumLog
38 sumInvLog :: Double -> [Double] -> Double
39 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
40
41
42 -- | Process the sumLog
43 sumLog :: Double -> [Double] -> Double
44 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
45
46
47 -- | To compute a jaccard similarity between two lists
48 jaccard :: [Int] -> [Int] -> Double
49 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
50
51
52 -- | To process a WeighedLogJaccard distance between to coocurency matrix
53 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
54 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
55 | null ngramsInter = 0
56 | ngramsInter == ngramsUnion = 1
57 | sens == 0 = jaccard ngramsInter ngramsUnion
58 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
59 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
60 where
61 --------------------------------------
62 ngramsInter :: [Int]
63 ngramsInter = intersect ngrams ngrams'
64 --------------------------------------
65 ngramsUnion :: [Int]
66 ngramsUnion = union ngrams ngrams'
67 --------------------------------------
68 coocInter :: [Double]
69 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
70 --------------------------------------
71 coocUnion :: [Double]
72 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
73 --------------------------------------
74
75
76 -- | To choose a proximity function
77 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
78 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
79 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
80 Hamming -> undefined
81
82
83 -- | To process the proximity between a current group and a pair of targets group
84 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
85 toProximity docs proximity ego target target' =
86 let docs' = sum $ elems docs
87 cooc = if target == target'
88 then (target ^. phylo_groupCooc)
89 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
90 ngrams = if target == target'
91 then (target ^. phylo_groupNgrams)
92 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
93 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
94
95
96 ------------------------
97 -- | Local Matching | --
98 ------------------------
99
100
101 -- | Find pairs of valuable candidates to be matched
102 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
103 makePairs candidates periods = case null periods of
104 True -> []
105 -- | at least on of the pair candidates should be from the last added period
106 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
107 || (inLastPeriod cdt' periods))
108 $ listToKeys candidates
109 where
110 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
111 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
112
113
114 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double-> PhyloGroup -> PhyloGroup
115 phyloGroupMatching candidates fil proxi docs thr ego =
116 case null (getPeriodPointers fil ego) of
117 False -> filterPointers fil TemporalPointer proxi thr ego
118 True -> case null pointers of
119 True -> addPointers ego fil TemporalPointer []
120 False -> addPointers ego fil TemporalPointer
121 $ head' "phyloGroupMatching"
122 -- | Keep only the best set of pointers grouped by proximity
123 $ groupBy (\pt pt' -> snd pt == snd pt')
124 $ reverse $ sortOn snd $ head' "pointers" pointers
125 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
126 where
127 pointers :: [[Pointer]]
128 pointers = take 1
129 $ dropWhile (null)
130 -- | for each time frame, process the proximity on relevant pairs of targeted groups
131 $ scanl (\acc groups ->
132 let periods = nub
133 $ concat $ map (\gs -> if null gs
134 then []
135 else [_phylo_groupPeriod $ head' "pointers" gs]) groups
136 pairs = makePairs (concat groups) periods
137 in acc ++ ( filter (\(_,proximity) -> filterProximity proxi thr proximity)
138 $ concat
139 $ map (\(c,c') ->
140 -- | process the proximity between the current group and a pair of candidates
141 let proximity = toProximity (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods)) proxi ego c c'
142 in if (c == c')
143 then [(getGroupId c,proximity)]
144 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
145 ) []
146 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
147 $ inits candidates
148
149
150 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
151 filterDocs d pds = restrictKeys d $ periodsToYears pds
152
153
154 -----------------------------
155 -- | Matching Processing | --
156 -----------------------------
157
158
159 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
160 getNextPeriods fil max' pId pIds =
161 case fil of
162 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
163 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
164
165
166 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
167 getCandidates fil ego targets =
168 case fil of
169 ToChilds -> targets'
170 ToParents -> reverse targets'
171 where
172 targets' :: [[PhyloGroup]]
173 targets' =
174 map (\groups' ->
175 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
176 ) groups') targets
177
178
179 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
180 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
181 $ matchByPeriods ToParents
182 $ groupByField _phylo_groupPeriod
183 $ matchByPeriods ToChilds
184 $ groupByField _phylo_groupPeriod branch
185 where
186 --------------------------------------
187 matchByPeriods :: Filiation -> Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
188 matchByPeriods fil branch' = foldl' (\acc prd ->
189 let periods' = getNextPeriods fil frame prd periods
190 candidates = map (\prd' -> findWithDefault [] prd' branch') periods'
191 docs' = filterDocs docs ([prd] ++ periods')
192 egos = map (\g -> phyloGroupMatching (getCandidates fil g candidates) fil proximity docs' thr g)
193 $ findWithDefault [] prd branch'
194 egos' = egos `using` parList rdeepseq
195 in acc ++ egos' ) [] periods
196
197
198 -----------------------
199 -- | Phylo Quality | --
200 -----------------------
201
202
203 termFreq :: Int -> [[PhyloGroup]] -> Double
204 termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
205 / (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
206
207
208 entropy :: [[PhyloGroup]] -> Double
209 entropy branches =
210 let terms = ngramsInBranches branches
211 in sum $ map (\term -> (1 / log (termFreq term branches))
212 / (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
213 * (sum $ map (\branch ->
214 let q = branchObs term (length $ concat branches) branch
215 in if (q == 0)
216 then 0
217 else - q * logBase 2 q ) branches) ) terms
218 where
219 -- | Probability to observe a branch given a random term of the phylo
220 branchObs :: Int -> Int -> [PhyloGroup] -> Double
221 branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
222 / (fromIntegral total)
223
224
225 homogeneity :: [[PhyloGroup]] -> Double
226 homogeneity branches =
227 let nbGroups = length $ concat branches
228 in sum
229 $ map (\branch -> (if (length branch == nbGroups)
230 then 1
231 else (1 / log (branchCov branch nbGroups))
232 / (sum $ map (\branch' -> 1 / log (branchCov branch' nbGroups)) branches))
233 * (sum $ map (\term -> (termFreq term branches)
234 / (sum $ map (\term' -> termFreq term' branches) $ ngramsInBranches [branch])
235 * (fromIntegral $ sum $ ngramsInBranches [filter (\g -> elem term $ g ^. phylo_groupNgrams) branch])
236 / (fromIntegral $ sum $ ngramsInBranches [branch])
237 ) $ ngramsInBranches [branch]) ) branches
238 where
239 branchCov :: [PhyloGroup] -> Int -> Double
240 branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
241
242
243 toPhyloQuality :: [[PhyloGroup]] -> Double
244 toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
245
246
247 -----------------------------
248 -- | Adaptative Matching | --
249 -----------------------------
250
251
252 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
253 groupsToBranches groups =
254 -- | run the related component algorithm
255 let graph = zip [1..]
256 $ relatedComponents
257 $ map (\group -> [getGroupId group]
258 ++ (map fst $ group ^. phylo_groupPeriodParents)
259 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
260 -- | update each group's branch id
261 in map (\(bId,ids) ->
262 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
263 $ elems $ restrictKeys groups (Set.fromList ids)) graph
264
265
266 recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
267 recursiveMatching proximity thr frame periods docs quality branches =
268 if (length branches == (length $ concat branches))
269 then concat branches
270 else if thr > 1
271 then concat branches
272 else
273 case quality <= (sum nextQualities) of
274 -- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities))
275 True -> concat
276 $ map (\branches' ->
277 let idx = fromJust $ elemIndex branches' nextBranches
278 in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
279 $ nextBranches
280 -- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
281 False -> concat branches
282 where
283 -- | 2) for each of the possible next branches process the phyloQuality score
284 nextQualities :: [Double]
285 nextQualities = map toPhyloQuality nextBranches
286 -- | 1) for each local branch process a temporal matching then find the resulting branches
287 nextBranches :: [[[PhyloGroup]]]
288 nextBranches =
289 let branches' = map (\branch -> phyloBranchMatching frame periods proximity thr docs branch) branches
290 clusters = map (\branch -> groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) branch) branches'
291 clusters' = clusters `using` parList rdeepseq
292 in clusters'
293
294
295
296 temporalMatching :: Phylo -> Phylo
297 temporalMatching phylo = updatePhyloGroups 1 branches' phylo
298 where
299 -- | 4) run the recursive matching to find the best repartition among branches
300 branches' :: Map PhyloGroupId PhyloGroup
301 branches' = fromList
302 $ map (\g -> (getGroupId g, g))
303 $ traceMatchEnd
304 $ recursiveMatching (phyloProximity $ getConfig phylo)
305 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
306 + (getThresholdStep $ phyloProximity $ getConfig phylo))
307 (getTimeFrame $ timeUnit $ getConfig phylo)
308 (getPeriodIds phylo)
309 (phylo ^. phylo_timeDocs) quality branches
310 -- | 3) process the quality score
311 quality :: Double
312 quality = toPhyloQuality branches
313 -- | 2) group into branches
314 branches :: [[PhyloGroup]]
315 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
316 -- | 1) for each group process an initial temporal Matching
317 groups' :: [PhyloGroup]
318 groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
319 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
320 (phylo ^. phylo_timeDocs)
321 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)