2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
14 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
15 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
17 import Gargantext.Prelude
18 import Gargantext.Core.Viz.AdaptativePhylo
19 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Prelude (floor,tan,pi)
22 import Control.Lens hiding (Level)
23 import Control.Parallel.Strategies (parList, rdeepseq, using)
24 import Debug.Trace (trace)
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30 import qualified Data.Vector as Vector
39 -- | To compute a jaccard similarity between two lists
40 jaccard :: [Int] -> [Int] -> Double
41 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
44 -- | Process the inverse sumLog
45 sumInvLog' :: Double -> Double -> [Double] -> Double
46 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/s) / log (nb + 1/s)))) 0 diago
49 -- | Process the sumLog
50 sumLog' :: Double -> Double -> [Double] -> Double
51 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/s) / log (nb + 1/s))) 0 diago
54 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
55 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
60 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
62 --------------------------------------
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 diagoInter :: [Double]
70 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
71 --------------------------------------
72 diagoUnion :: [Double]
73 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
74 --------------------------------------
76 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
77 -- tests not conclusive
78 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
79 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
80 | null ngramsInter = 0
81 | ngramsInter == ngramsUnion = 1
82 | sens == 0 = jaccard ngramsInter ngramsUnion
83 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
84 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
86 --------------------------------------
88 ngramsInter = intersect ego_ngrams target_ngrams
89 --------------------------------------
91 ngramsUnion = union ego_ngrams target_ngrams
92 --------------------------------------
93 diagoInter :: [Double]
94 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
95 --------------------------------------
97 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
98 --------------------------------------
99 diagoTarget :: [Double]
100 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
101 --------------------------------------
103 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
104 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
105 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
107 WeightedLogJaccard sens ->
108 let pairNgrams = if targetNgrams == targetNgrams'
110 else union targetNgrams targetNgrams'
111 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
112 WeightedLogSim sens ->
113 let pairNgrams = if targetNgrams == targetNgrams'
115 else union targetNgrams targetNgrams'
116 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
119 ------------------------
120 -- | Local Matching | --
121 ------------------------
123 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
124 findLastPeriod fil periods = case fil of
125 ToParents -> head' "findLastPeriod" (sortOn fst periods)
126 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
129 -- | To filter pairs of candidates related to old pointers periods
130 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
131 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
132 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
133 removeOldPointers oldPointers fil thr prox prd pairs
134 | null oldPointers = pairs
135 | null (filterPointers prox thr oldPointers) =
136 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
137 in if lastMatchedPrd == prd
139 else filter (\((id,_),(id',_)) ->
141 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
142 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
143 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
144 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
148 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
149 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
150 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
153 else removeOldPointers oldPointers fil thr prox lastPrd
154 {- at least on of the pair candidates should be from the last added period -}
155 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
157 $ filter (\(id,ngrams) ->
158 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
159 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
160 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
163 lastPrd :: PhyloPeriodId
164 lastPrd = findLastPeriod fil periods
167 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
168 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
170 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
171 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
174 reduceDiagos :: Map Date Cooc -> Map Int Double
175 reduceDiagos diagos = mapKeys (\(k,_) -> k)
176 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
178 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
179 filterPointersByPeriod fil pts =
180 let pts' = sortOn (fst . fst . fst . fst) pts
181 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
182 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
184 $ nubBy (\pt pt' -> snd pt == snd pt')
185 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
187 ToParents -> reverse pts'
190 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
191 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
192 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
193 if (null $ filterPointers proxi thr oldPointers)
194 {- let's find new pointers -}
195 then if null nextPointers
197 else filterPointersByPeriod fil
198 $ head' "phyloGroupMatching"
199 -- Keep only the best set of pointers grouped by proximity
200 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
201 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
202 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
205 nextPointers :: [[(Pointer,[Int])]]
206 nextPointers = take 1
208 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
209 $ scanl (\acc groups ->
210 let periods = nub $ map (fst . fst . fst) $ concat groups
211 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
213 $ filterDiago diagos ([(fst . fst) id] ++ periods)
214 {- important resize nbdocs et diago dans le make pairs -}
215 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
216 in acc ++ ( filterPointers' proxi thr
219 {- process the proximity between the current group and a pair of candidates -}
220 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
221 in if ((c == c') || (snd c == snd c'))
222 then [((fst c,proximity),snd c)]
223 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
224 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
227 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
228 filterDocs d pds = restrictKeys d $ periodsToYears pds
230 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
231 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
234 -----------------------------
235 -- | Matching Processing | --
236 -----------------------------
239 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
240 getNextPeriods fil max' pId pIds =
242 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
243 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
246 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
247 getCandidates ego targets =
249 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
253 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
254 matchGroupsToGroups frame periods proximity thr docs coocs groups =
255 let groups' = groupByField _phylo_groupPeriod groups
256 in foldl' (\acc prd ->
257 let -- 1) find the parents/childs matching periods
258 periodsPar = getNextPeriods ToParents frame prd periods
259 periodsChi = getNextPeriods ToChilds frame prd periods
260 -- 2) find the parents/childs matching candidates
261 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
262 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
263 -- 3) find the parents/child number of docs by years
264 docsPar = filterDocs docs ([prd] ++ periodsPar)
265 docsChi = filterDocs docs ([prd] ++ periodsChi)
266 -- 4) find the parents/child diago by years
267 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
268 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
269 -- 5) match in parallel all the groups (egos) to their possible candidates
271 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
272 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
273 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
274 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
275 in addPointers ToChilds TemporalPointer pointersChi
276 $ addPointers ToParents TemporalPointer pointersPar ego)
277 $ findWithDefault [] prd groups'
278 egos' = egos `using` parList rdeepseq
283 -----------------------
284 -- | Phylo Quality | --
285 -----------------------
288 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
289 relevantBranches term branches =
290 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
292 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
293 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
294 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
295 / (fromIntegral $ length bk'))
298 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
300 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
301 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
302 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
304 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
305 fScore lambda x periods bk bx =
306 let rec = recall x bk bx
307 acc = accuracy x periods bk
308 in ((1 + lambda ** 2) * acc * rec)
309 / (((lambda ** 2) * acc + rec))
312 wk :: [PhyloGroup] -> Double
313 wk bk = fromIntegral $ length bk
316 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
317 toPhyloQuality' lambda freq branches =
322 let bks = relevantBranches i branches
323 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
324 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore lambda i periods bk bks)) bks))
327 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
328 toRecall freq branches =
334 bx = relevantBranches x branches
335 wks = sum $ map wk bx
336 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
340 pys = sum (elems freq)
343 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
344 toAccuracy freq branches =
350 bx = relevantBranches x branches
351 -- | periods containing x
352 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
353 wks = sum $ map wk bx
354 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
358 pys = sum (elems freq)
361 -- | here we do the average of all the local f_scores
362 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
363 toPhyloQuality fdt lambda freq branches =
369 let bx = relevantBranches x branches
370 -- | periods containing x
371 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
372 wks = sum $ map wk bx
373 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
374 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
375 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
379 -- pys = sum (elems freq)
381 -- 1 / nb de foundation
383 ------------------------------------
384 -- | Constant Temporal Matching | --
385 ------------------------------------
388 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
389 groupsToBranches' groups =
390 {- run the related component algorithm -}
391 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
392 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
393 $ map (\group -> [getGroupId group]
394 ++ (map fst $ group ^. phylo_groupPeriodParents)
395 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
396 -- first find the related components by inside each ego's period
398 graph' = map relatedComponents egos
399 -- then run it for the all the periods
401 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
402 -- update each group's branch id
403 in map (\(bId,ids) ->
404 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
405 $ elems $ restrictKeys groups (Set.fromList ids)
406 in groups' `using` parList rdeepseq ) graph
409 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
410 reduceFrequency frequency branches =
411 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
413 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
414 updateThr thr branches = map (\b -> map (\g ->
415 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
418 -- Sequentially break each branch of a phylo where
419 -- done = all the allready broken branches
420 -- ego = the current branch we want to break
421 -- rest = the branches we still have to break
422 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
423 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
424 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
425 -- 1) keep or not the new division of ego
426 let done' = done ++ (if snd ego
428 (if ((null (fst ego')) || (quality > quality'))
430 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
431 -- <> " | " <> show(length $ fst ego) <> " groups : "
432 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
433 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
436 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
437 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
438 -- <> " | " <> show(length $ fst ego) <> " groups : "
439 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
440 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
441 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
444 -- 2) if there is no more branches in rest then return else continue
447 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
448 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
450 --------------------------------------
452 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
453 --------------------------------------
454 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
456 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
457 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
458 branches' = branches `using` parList rdeepseq
459 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
461 $ depthToMeta (elevation - depth) branches'
462 --------------------------------------
464 quality' = toPhyloQuality fdt lambda frequency
465 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
468 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
469 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
470 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
471 -- if there is no branch to break or if seaLvl level > 1 then end
472 if (thr >= 1) || ((not . or) $ map snd branches)
475 -- break all the possible branches at the current seaLvl level
476 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
477 acc = toAccuracy frequency (map fst branches)
478 rec = toRecall frequency (map fst branches)
479 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
480 <> " ξ = " <> printf "%.5f" acc
481 <> " ρ = " <> printf "%.5f" rec
482 <> " branches = " <> show(length branches) <> " ↴")
483 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
484 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
485 frequency' = reduceFrequency frequency (map fst branches')
486 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
489 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
490 constanteTemporalMatching start step phylo = updatePhyloGroups 1
491 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
492 (toPhyloHorizon phylo)
494 -- 2) process the temporal matching by elevating seaLvl level
495 branches :: [[PhyloGroup]]
497 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
498 (phyloProximity $ getConfig phylo)
499 (_qua_granularity $ phyloQuality $ getConfig phylo)
500 (_qua_minBranch $ phyloQuality $ getConfig phylo)
501 (phylo ^. phylo_termFreq)
503 ((((1 - start) / step) - 1))
504 (((1 - start) / step))
505 (getTimeFrame $ timeUnit $ getConfig phylo)
507 (phylo ^. phylo_timeDocs)
508 (phylo ^. phylo_timeCooc)
509 (reverse $ sortOn (length . fst) groups)
510 -- 1) for each group process an initial temporal Matching
511 -- here we suppose that all the groups of level 1 are part of the same big branch
512 groups :: [([PhyloGroup],Bool)]
513 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
514 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
515 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
516 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
518 (phylo ^. phylo_timeDocs)
519 (phylo ^. phylo_timeCooc)
520 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
526 toPhyloHorizon :: Phylo -> Phylo
527 toPhyloHorizon phylo =
528 let t0 = take 1 (getPeriodIds phylo)
529 groups = getGroupsFromLevelPeriods 1 t0 phylo
530 sens = getSensibility (phyloProximity $ getConfig phylo)
531 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
532 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
533 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
534 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
537 --------------------------------------
538 -- | Adaptative Temporal Matching | --
539 --------------------------------------
542 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
543 thrToMeta thr branches =
545 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
547 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
548 depthToMeta depth branches =
549 let break = length branches > 1
552 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
555 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
556 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
559 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
561 | isJust (m !? ( k ,k')) = m ! ( k ,k')
562 | isJust (m !? ( k',k )) = m ! ( k',k )
566 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
567 toThreshold lvl proxiGroups =
568 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
570 then (sort $ elems proxiGroups) !! idx
574 -- done = all the allready broken branches
575 -- ego = the current branch we want to break
576 -- rest = the branches we still have to break
577 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
578 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
579 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
580 -> [([PhyloGroup],(Bool,[Double]))]
581 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
582 -- 1) keep or not the new division of ego
583 let done' = done ++ (if (fst . snd) ego
584 then (if ((null (fst ego')) || (quality > quality'))
586 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
588 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
589 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
590 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
592 -- uncomment let .. in for debugging
593 -- let part1 = partition (snd) done'
594 -- part2 = partition (snd) rest
595 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
596 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
598 -- 2) if there is no more branches in rest then return else continue
601 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
602 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
604 --------------------------------------
606 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
607 --------------------------------------
609 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
610 --------------------------------------
611 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
613 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
614 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
615 branches' = branches `using` parList rdeepseq
616 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
618 $ depthToMeta (elevation - depth) branches'
619 --------------------------------------
621 quality' = toPhyloQuality fdt lambda frequency
622 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
625 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
626 -> Double -> Int -> Map Int Double
627 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
628 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
629 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
630 -- if there is no branch to break or if seaLvl level >= depth then end
631 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
634 -- break all the possible branches at the current seaLvl level
635 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
636 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
637 frequency' = reduceFrequency frequency (map fst branches')
638 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
639 -- thr = toThreshold depth groupsProxi
640 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
641 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
642 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
644 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
647 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
648 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
649 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
650 (toPhyloHorizon phylo)
652 -- 2) process the temporal matching by elevating seaLvl level
653 branches :: [[PhyloGroup]]
655 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
656 (phyloProximity $ getConfig phylo)
659 (phylo ^. phylo_groupsProxi)
660 (_qua_granularity $ phyloQuality $ getConfig phylo)
661 (_qua_minBranch $ phyloQuality $ getConfig phylo)
662 (phylo ^. phylo_termFreq)
663 (getTimeFrame $ timeUnit $ getConfig phylo)
665 (phylo ^. phylo_timeDocs)
666 (phylo ^. phylo_timeCooc)
668 -- 1) for each group process an initial temporal Matching
669 -- here we suppose that all the groups of level 1 are part of the same big branch
670 groups :: [([PhyloGroup],(Bool,[Double]))]
671 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
672 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
673 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
674 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
676 (phylo ^. phylo_timeDocs)
677 (phylo ^. phylo_timeCooc)
678 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
679 --------------------------------------
681 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)