]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
fix the grand bleu effect
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 -- import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 import Debug.Trace (trace)
29
30 import qualified Data.Set as Set
31
32
33 -------------------
34 -- | Proximity | --
35 -------------------
36
37
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41
42
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46
47
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51
52
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 coocInter :: [Double]
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 -- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
72 --------------------------------------
73 coocUnion :: [Double]
74 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
75 --------------------------------------
76
77
78 -- | To choose a proximity function
79 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
80 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
81 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
82 Hamming -> undefined
83
84
85 -- | To process the proximity between a current group and a pair of targets group
86 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
87 toProximity docs proximity ego target target' =
88 let docs' = sum $ elems docs
89 cooc = if target == target'
90 then (target ^. phylo_groupCooc)
91 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
92 ngrams = if target == target'
93 then (target ^. phylo_groupNgrams)
94 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
95 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
96
97
98 ------------------------
99 -- | Local Matching | --
100 ------------------------
101
102 toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
103 toLastPeriod fil periods = case fil of
104 ToParents -> head' "toLastPeriod" (sortOn fst periods)
105 ToChilds -> last' "toLastPeriod" (sortOn fst periods)
106
107
108 toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
109 toLazyPairs pointers fil thr prox prd pairs =
110 if null pointers then pairs
111 else let rest = filterPointers prox thr pointers
112 in if null rest
113 then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
114 in if prd' == prd
115 then []
116 else filter (\(g,g') ->
117 case fil of
118 ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
119 || ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
120 ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
121 || ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
122 else []
123
124
125 -- | Find pairs of valuable candidates to be matched
126 makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
127 makePairs' ego candidates periods pointers fil thr prox docs =
128 case null periods of
129 True -> []
130 False -> toLazyPairs pointers fil thr prox lastPrd
131 -- | at least on of the pair candidates should be from the last added period
132 $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
133 || ((g' ^. phylo_groupPeriod) == lastPrd))
134 $ listToKeys
135 $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
136 || ((toProximity docs prox ego ego g) >= thr)) candidates
137 where
138 lastPrd :: PhyloPeriodId
139 lastPrd = toLastPeriod fil periods
140
141
142 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
143 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
144
145
146 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
147 phyloGroupMatching candidates fil proxi docs thr ego =
148 case null nextPointers of
149 -- | let's find new pointers
150 True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
151 then addPointers ego fil TemporalPointer []
152 -- | or keep the old ones
153 else addPointers ego fil TemporalPointer
154 $ filterPointers proxi thr $ getPeriodPointers fil ego
155 False -> addPointers ego fil TemporalPointer
156 $ head' "phyloGroupMatching"
157 -- | Keep only the best set of pointers grouped by proximity
158 $ groupBy (\pt pt' -> snd pt == snd pt')
159 $ reverse $ sortOn snd $ head' "pointers"
160 $ nextPointers
161 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
162 where
163 nextPointers :: [[Pointer]]
164 nextPointers = take 1
165 $ dropWhile (null)
166 -- | for each time frame, process the proximity on relevant pairs of targeted groups
167 $ scanl (\acc groups ->
168 let periods = nub $ map _phylo_groupPeriod $ concat groups
169 docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
170 pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
171 in acc ++ ( filterPointers proxi thr
172 $ concat
173 $ map (\(c,c') ->
174 -- | process the proximity between the current group and a pair of candidates
175 let proximity = toProximity docs' proxi ego c c'
176 in if (c == c')
177 then [(getGroupId c,proximity)]
178 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
179 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
180
181
182 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
183 filterDocs d pds = restrictKeys d $ periodsToYears pds
184
185
186 -----------------------------
187 -- | Matching Processing | --
188 -----------------------------
189
190
191 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
192 getNextPeriods fil max' pId pIds =
193 case fil of
194 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
195 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
196
197
198 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
199 getCandidates fil ego targets =
200 case fil of
201 ToChilds -> targets'
202 ToParents -> reverse targets'
203 where
204 targets' :: [[PhyloGroup]]
205 targets' =
206 map (\groups' ->
207 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
208 ) groups') targets
209
210
211 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
212 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
213 -- $ matchByPeriods ToParents
214 -- $ groupByField _phylo_groupPeriod
215 $ matchByPeriods
216 $ groupByField _phylo_groupPeriod branch
217 where
218 --------------------------------------
219 matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
220 matchByPeriods branch' = foldl' (\acc prd ->
221 let periodsPar = getNextPeriods ToParents frame prd periods
222 periodsChi = getNextPeriods ToChilds frame prd periods
223 candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
224 candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
225 docsPar = filterDocs docs ([prd] ++ periodsPar)
226 docsChi = filterDocs docs ([prd] ++ periodsChi)
227 egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
228 $ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
229 $ findWithDefault [] prd branch'
230 egos' = egos `using` parList rdeepseq
231 in acc ++ egos' ) [] periods
232
233
234 -----------------------
235 -- | Phylo Quality | --
236 -----------------------
237
238
239 count :: Eq a => a -> [a] -> Int
240 count x = length . filter (== x)
241
242 termFreq' :: Int -> [PhyloGroup] -> Double
243 termFreq' term groups =
244 let ngrams = concat $ map _phylo_groupNgrams groups
245 in log((fromIntegral $ count term ngrams)
246 / (fromIntegral $ length ngrams))
247
248 relevantBranches :: Int -> Int -> [[PhyloGroup]] -> [[PhyloGroup]]
249 relevantBranches term thr branches =
250 filter (\groups -> (length groups >= thr)
251 && (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
252
253 branchCov' :: [PhyloGroup] -> [[PhyloGroup]] -> Double
254 branchCov' branch branches =
255 (fromIntegral $ length branch) / (fromIntegral $ length $ concat branches)
256
257
258 toRecall :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
259 toRecall freq term thr branches =
260 -- | given a random term in a phylo
261 freq
262 -- | for each relevant branches
263 * (sum $ map (\branch ->
264 -- | given its local coverage
265 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
266 -- | compute the local recall
267 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
268 / (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) $ concat branches'))) branches')
269 where
270 branches' :: [[PhyloGroup]]
271 branches' = relevantBranches term thr branches
272
273
274 toAccuracy :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
275 toAccuracy freq term thr branches =
276 -- | given a random term in a phylo
277 freq
278 -- | for each relevant branches
279 * (sum $ map (\branch ->
280 -- | given its local coverage
281 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
282 -- | compute the local accuracy
283 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
284 / (fromIntegral $ length branch))) branches')
285 where
286 branches' :: [[PhyloGroup]]
287 branches' = relevantBranches term thr branches
288
289
290 toRecallWeighted :: Double -> Double -> Double
291 toRecallWeighted old curr = curr / (old + curr)
292
293
294 toRecall' :: Int -> Map Int Double -> [[PhyloGroup]] -> Double
295 toRecall' minBranch frequency branches =
296 let terms = keys frequency
297 in sum $ map (\term -> toRecall (frequency ! term) term minBranch branches) terms
298
299
300 toPhyloQuality :: Double -> Int -> Map Int Double -> Double -> [[PhyloGroup]] -> Double
301 toPhyloQuality beta minBranch frequency recall branches =
302 if (foldl' (\acc b -> acc && (length b < minBranch)) True branches)
303 -- | the local phylo is composed of small branches
304 then 0
305 else ((1 + beta ** 2) * accuracy * recall)
306 / (((beta ** 2) * accuracy + recall))
307 where
308 terms :: [Int]
309 terms = keys frequency
310 -- | for each term compute the global accuracy
311 accuracy :: Double
312 accuracy = sum $ map (\term -> toAccuracy (frequency ! term) term minBranch branches) terms
313
314
315 -----------------------------
316 -- | Adaptative Matching | --
317 -----------------------------
318
319
320 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
321 groupsToBranches groups =
322 -- | run the related component algorithm
323 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
324 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
325 $ map (\group -> [getGroupId group]
326 ++ (map fst $ group ^. phylo_groupPeriodParents)
327 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
328 -- | first find the related components by inside each ego's period
329 graph' = map relatedComponents egos
330 -- | then run it for the all the periods
331 graph = zip [1..]
332 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
333 -- | update each group's branch id
334 in map (\(bId,ids) ->
335 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
336 $ elems $ restrictKeys groups (Set.fromList ids)
337 in groups' `using` parList rdeepseq ) graph
338
339
340 recursiveMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> Double -> [PhyloGroup] -> [PhyloGroup]
341 recursiveMatching proximity beta minBranch frequency egoThr frame periods docs quality recall groups =
342 if (length groups == 1)
343 then trace ("stop : just one group")
344 $ groups
345 else if (egoThr >= 1)
346 then trace ("stop : thr >= 1")
347 $ groups
348 else if (quality > quality')
349 then trace ("stop : " <> show(quality) <> " > " <> show(quality'))
350 -- $ trace (show(length groups) <> " groups " <> show(length branches'))
351 -- $ trace (show(recall) <> " recall " <> show(recall'))
352 $ groups
353 else trace ("go : " <> show(quality) <> " <= " <> show(quality'))
354 $ concat
355 $ map (\branch -> recursiveMatching proximity beta minBranch frequency (egoThr + (getThresholdStep proximity))
356 frame periods docs quality' recall' branch)
357 $ branches'
358 where
359 -- | 2) for each of the possible next branches process the phyloQuality score
360 quality' :: Double
361 quality' = toPhyloQuality beta minBranch frequency recall' branches'
362 -- | 3) process a new recall weigted by the last one
363 recall' :: Double
364 recall' = toRecallWeighted recall
365 $ toRecall' minBranch frequency branches'
366 -- | 1) for each local branch process a temporal matching then find the resulting branches
367 branches' :: [[PhyloGroup]]
368 branches' =
369 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
370 $ phyloBranchMatching frame periods proximity egoThr docs groups
371 in branches `using` parList rdeepseq
372
373
374 temporalMatching :: Phylo -> Phylo
375 temporalMatching phylo = updatePhyloGroups 1 branches' phylo
376 where
377 -- | 6) apply the recursive matching
378 branches' :: Map PhyloGroupId PhyloGroup
379 branches' = fromList
380 $ map (\g -> (getGroupId g, g))
381 $ traceMatchEnd
382 $ concat
383 $ map (\branch ->
384 recursiveMatching (phyloProximity $ getConfig phylo)
385 (_qua_relevance $ phyloQuality $ getConfig phylo)
386 (_qua_minBranch $ phyloQuality $ getConfig phylo)
387 frequency
388 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
389 + (getThresholdStep $ phyloProximity $ getConfig phylo))
390 (getTimeFrame $ timeUnit $ getConfig phylo)
391 (getPeriodIds phylo)
392 (phylo ^. phylo_timeDocs) quality recall branch
393 ) branches
394 -- | 5) process the quality score
395 quality :: Double
396 quality = toPhyloQuality (_qua_relevance $ phyloQuality $ getConfig phylo) (_qua_minBranch $ phyloQuality $ getConfig phylo) frequency recall branches
397 -- | 4) find the recall
398 recall :: Double
399 recall = toRecall' (_qua_minBranch $ phyloQuality $ getConfig phylo) frequency branches
400 -- | 3) process the constants of the quality score
401 frequency :: Map Int Double
402 frequency =
403 let terms = ngramsInBranches branches
404 in fromList $ map (\t -> (t, ((termFreq' t $ concat branches) / (sum $ map (\t' -> termFreq' t' $ concat branches) terms)))) terms
405 -- | 2) group into branches
406 branches :: [[PhyloGroup]]
407 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
408 -- | 1) for each group process an initial temporal Matching
409 groups' :: [PhyloGroup]
410 groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
411 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
412 (phylo ^. phylo_timeDocs)
413 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)