]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
optimisation for temporal matching
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, elemIndex, (!!), dropWhile)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, filterWithKey, keys, (!))
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 -- import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 -- import Debug.Trace (trace)
29
30 import qualified Data.Set as Set
31
32
33 -------------------
34 -- | Proximity | --
35 -------------------
36
37
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41
42
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46
47
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51
52
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 coocInter :: [Double]
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 --------------------------------------
72 coocUnion :: [Double]
73 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
74 --------------------------------------
75
76
77 -- | To choose a proximity function
78 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
79 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
80 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
81 Hamming -> undefined
82
83
84 -- | To process the proximity between a current group and a pair of targets group
85 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
86 toProximity docs proximity ego target target' =
87 let docs' = sum $ elems docs
88 cooc = if target == target'
89 then (target ^. phylo_groupCooc)
90 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
91 ngrams = if target == target'
92 then (target ^. phylo_groupNgrams)
93 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
94 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
95
96
97 ------------------------
98 -- | Local Matching | --
99 ------------------------
100
101 toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
102 toLastPeriod fil periods = case fil of
103 ToParents -> head' "toLastPeriod" (sortOn fst periods)
104 ToChilds -> last' "toLastPeriod" (sortOn fst periods)
105
106
107 toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
108 toLazyPairs pointers fil thr prox prd pairs =
109 if null pointers then pairs
110 else let rest = filterPointers prox thr pointers
111 in if null rest
112 then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
113 in if prd' == prd
114 then []
115 else filter (\(g,g') ->
116 case fil of
117 ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
118 || ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
119 ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
120 || ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
121 else []
122
123
124 -- | Find pairs of valuable candidates to be matched
125 makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
126 makePairs' ego candidates periods pointers fil thr prox docs =
127 case null periods of
128 True -> []
129 False -> toLazyPairs pointers fil thr prox lastPrd
130 -- | at least on of the pair candidates should be from the last added period
131 $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
132 || ((g' ^. phylo_groupPeriod) == lastPrd))
133 $ listToKeys
134 $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
135 || ((toProximity docs prox ego ego g) >= thr)) candidates
136 where
137 lastPrd :: PhyloPeriodId
138 lastPrd = toLastPeriod fil periods
139
140
141 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
142 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
143
144
145 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
146 phyloGroupMatching candidates fil proxi docs thr ego =
147 case null nextPointers of
148 -- | let's find new pointers
149 True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
150 then addPointers ego fil TemporalPointer []
151 -- | or keep the old ones
152 else addPointers ego fil TemporalPointer
153 $ filterPointers proxi thr $ getPeriodPointers fil ego
154 False -> addPointers ego fil TemporalPointer
155 $ head' "phyloGroupMatching"
156 -- | Keep only the best set of pointers grouped by proximity
157 $ groupBy (\pt pt' -> snd pt == snd pt')
158 $ reverse $ sortOn snd $ head' "pointers"
159 $ nextPointers
160 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
161 where
162 nextPointers :: [[Pointer]]
163 nextPointers = take 1
164 $ dropWhile (null)
165 -- | for each time frame, process the proximity on relevant pairs of targeted groups
166 $ scanl (\acc groups ->
167 let periods = nub $ map _phylo_groupPeriod $ concat groups
168 docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
169 pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
170 in acc ++ ( filterPointers proxi thr
171 $ concat
172 $ map (\(c,c') ->
173 -- | process the proximity between the current group and a pair of candidates
174 let proximity = toProximity docs' proxi ego c c'
175 in if (c == c')
176 then [(getGroupId c,proximity)]
177 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
178 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
179
180
181 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
182 filterDocs d pds = restrictKeys d $ periodsToYears pds
183
184
185 -----------------------------
186 -- | Matching Processing | --
187 -----------------------------
188
189
190 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
191 getNextPeriods fil max' pId pIds =
192 case fil of
193 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
194 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
195
196
197 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
198 getCandidates fil ego targets =
199 case fil of
200 ToChilds -> targets'
201 ToParents -> reverse targets'
202 where
203 targets' :: [[PhyloGroup]]
204 targets' =
205 map (\groups' ->
206 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
207 ) groups') targets
208
209
210 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
211 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
212 -- $ matchByPeriods ToParents
213 -- $ groupByField _phylo_groupPeriod
214 $ matchByPeriods
215 $ groupByField _phylo_groupPeriod branch
216 where
217 --------------------------------------
218 matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
219 matchByPeriods branch' = foldl' (\acc prd ->
220 let periodsPar = getNextPeriods ToParents frame prd periods
221 periodsChi = getNextPeriods ToChilds frame prd periods
222 candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
223 candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
224 docsPar = filterDocs docs ([prd] ++ periodsPar)
225 docsChi = filterDocs docs ([prd] ++ periodsChi)
226 egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
227 $ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
228 $ findWithDefault [] prd branch'
229 egos' = egos `using` parList rdeepseq
230 in acc ++ egos' ) [] periods
231
232
233 -----------------------
234 -- | Phylo Quality | --
235 -----------------------
236
237
238 count :: Eq a => a -> [a] -> Int
239 count x = length . filter (== x)
240
241 termFreq' :: Int -> [PhyloGroup] -> Double
242 termFreq' term groups =
243 let ngrams = concat $ map _phylo_groupNgrams groups
244 in (fromIntegral $ count term ngrams)
245 / (fromIntegral $ length ngrams)
246
247 relevantBranches :: Int -> Int -> [[PhyloGroup]] -> [[PhyloGroup]]
248 relevantBranches term thr branches =
249 filter (\groups -> (length groups >= thr)
250 && (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
251
252 branchCov' :: [PhyloGroup] -> [[PhyloGroup]] -> Double
253 branchCov' branch branches =
254 (fromIntegral $ length branch) / (fromIntegral $ length $ concat branches)
255
256
257 toRecall :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
258 toRecall freq term thr branches =
259 -- | given a random term in a phylo
260 freq
261 -- | for each relevant branches
262 * (sum $ map (\branch ->
263 -- | given its local coverage
264 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
265 -- | compute the local recall
266 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
267 / (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) $ concat branches'))) branches')
268 where
269 branches' :: [[PhyloGroup]]
270 branches' = relevantBranches term thr branches
271
272
273 toAccuracy :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
274 toAccuracy freq term thr branches =
275 -- | given a random term in a phylo
276 freq
277 -- | for each relevant branches
278 * (sum $ map (\branch ->
279 -- | given its local coverage
280 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
281 -- | compute the local accuracy
282 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
283 / (fromIntegral $ length branch))) branches')
284 where
285 branches' :: [[PhyloGroup]]
286 branches' = relevantBranches term thr branches
287
288
289 toPhyloQuality' :: Quality -> Map Int Double -> [[PhyloGroup]] -> Double
290 toPhyloQuality' quality frequency branches =
291 if (foldl' (\acc b -> acc && (length b < (quality ^. qua_minBranch))) True branches)
292 -- | the local phylo is composed of small branches
293 then 0
294 else
295 let relevance = quality ^. qua_relevance
296 -- | compute the F score for a given relevance
297 in ((1 + relevance ** 2) * accuracy * recall)
298 / (((relevance ** 2) * accuracy + recall))
299 where
300 terms :: [Int]
301 terms = keys frequency
302 -- | for each term compute the global accuracy
303 accuracy :: Double
304 accuracy = sum $ map (\term -> toAccuracy (frequency ! term) term (quality ^. qua_minBranch) branches) terms
305 -- | for each term compute the global recall
306 recall :: Double
307 recall = sum $ map (\term -> toRecall (frequency ! term) term (quality ^. qua_minBranch) branches) terms
308
309
310
311
312 -----------------------------
313 -- | Adaptative Matching | --
314 -----------------------------
315
316
317 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
318 groupsToBranches groups =
319 -- | run the related component algorithm
320 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
321 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
322 $ map (\group -> [getGroupId group]
323 ++ (map fst $ group ^. phylo_groupPeriodParents)
324 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
325 -- | first find the related components by inside each ego's period
326 graph' = map relatedComponents egos
327 -- | then run it for the all the periods
328 graph = zip [1..]
329 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
330 -- | update each group's branch id
331 in map (\(bId,ids) ->
332 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
333 $ elems $ restrictKeys groups (Set.fromList ids)
334 in groups' `using` parList rdeepseq ) graph
335
336
337 recursiveMatching :: Proximity -> Quality -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
338 recursiveMatching proximity qua freq thr frame periods docs quality branches =
339 if (length branches == (length $ concat branches))
340 then concat branches
341 else if thr >= 1
342 then concat branches
343 else
344 -- trace (show(quality) <> " (vs) sum of " <> show(nextQualities))
345 case quality <= (sum nextQualities) of
346 -- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities))
347 True -> concat
348 $ map (\branches' ->
349 let idx = fromJust $ elemIndex branches' nextBranches
350 in recursiveMatching proximity qua freq (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
351 $ nextBranches
352 -- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
353 False -> concat branches
354 where
355 -- | 2) for each of the possible next branches process the phyloQuality score
356 nextQualities :: [Double]
357 nextQualities = map (\nextBranch -> toPhyloQuality' qua freq nextBranch) nextBranches
358 -- | 1) for each local branch process a temporal matching then find the resulting branches
359 nextBranches :: [[[PhyloGroup]]]
360 nextBranches =
361 let branches' = map (\branch -> phyloBranchMatching frame periods proximity thr docs branch) branches
362 clusters = map (\branch -> groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) branch) branches'
363 clusters' = clusters `using` parList rdeepseq
364 in clusters'
365
366
367
368 temporalMatching :: Phylo -> Phylo
369 temporalMatching phylo = updatePhyloGroups 1 branches' phylo
370 where
371 -- | 4) run the recursive matching to find the best repartition among branches
372 branches' :: Map PhyloGroupId PhyloGroup
373 branches' = fromList
374 $ map (\g -> (getGroupId g, g))
375 $ traceMatchEnd
376 $ recursiveMatching (phyloProximity $ getConfig phylo)
377 (phyloQuality $ getConfig phylo)
378 frequency
379 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
380 + (getThresholdStep $ phyloProximity $ getConfig phylo))
381 (getTimeFrame $ timeUnit $ getConfig phylo)
382 (getPeriodIds phylo)
383 (phylo ^. phylo_timeDocs) quality branches
384 -- | 3) process the quality score
385 quality :: Double
386 quality = toPhyloQuality' (phyloQuality $ getConfig phylo) frequency branches
387 -- | 3) process the constants of the quality score
388 frequency :: Map Int Double
389 frequency =
390 let terms = ngramsInBranches branches
391 in fromList $ map (\t -> (t, ((termFreq' t $ concat branches) / (sum $ map (\t' -> termFreq' t' $ concat branches) terms)))) terms
392 -- | 2) group into branches
393 branches :: [[PhyloGroup]]
394 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
395 -- | 1) for each group process an initial temporal Matching
396 groups' :: [PhyloGroup]
397 groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
398 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
399 (phylo ^. phylo_timeDocs)
400 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)