2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
14 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
15 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
17 import Gargantext.Prelude
18 import Gargantext.Core.Viz.AdaptativePhylo
19 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Prelude (floor)
22 import Control.Lens hiding (Level)
23 import Control.Parallel.Strategies (parList, rdeepseq, using)
24 import Debug.Trace (trace)
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30 import qualified Data.Vector as Vector
39 -- | To compute a jaccard similarity between two lists
40 jaccard :: [Int] -> [Int] -> Double
41 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
44 -- | Process the inverse sumLog
45 sumInvLog' :: Double -> Double -> [Double] -> Double
46 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + s) / log (nb + s)))) 0 diago
49 -- | Process the sumLog
50 sumLog' :: Double -> Double -> [Double] -> Double
51 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + s) / log (nb + s))) 0 diago
54 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
55 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
60 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
62 --------------------------------------
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 diagoInter :: [Double]
70 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
71 --------------------------------------
72 diagoUnion :: [Double]
73 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
74 --------------------------------------
77 -- | To process the proximity between a current group and a pair of targets group
78 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
79 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
81 WeightedLogJaccard sens ->
82 let pairNgrams = if targetNgrams == targetNgrams'
84 else union targetNgrams targetNgrams'
85 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
89 ------------------------
90 -- | Local Matching | --
91 ------------------------
93 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
94 findLastPeriod fil periods = case fil of
95 ToParents -> head' "findLastPeriod" (sortOn fst periods)
96 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
99 -- | To filter pairs of candidates related to old pointers periods
100 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
101 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
102 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
103 removeOldPointers oldPointers fil thr prox prd pairs
104 | null oldPointers = pairs
105 | null (filterPointers prox thr oldPointers) =
106 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
107 in if lastMatchedPrd == prd
109 else filter (\((id,_),(id',_)) ->
111 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
112 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
113 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
114 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
118 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
119 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
120 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
123 else removeOldPointers oldPointers fil thr prox lastPrd
124 {- at least on of the pair candidates should be from the last added period -}
125 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
127 $ filter (\(id,ngrams) ->
128 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
129 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
130 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
133 lastPrd :: PhyloPeriodId
134 lastPrd = findLastPeriod fil periods
137 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
138 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
140 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
141 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
144 reduceDiagos :: Map Date Cooc -> Map Int Double
145 reduceDiagos diagos = mapKeys (\(k,_) -> k)
146 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
148 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
149 filterPointersByPeriod fil pts =
150 let pts' = sortOn (fst . fst . fst . fst) pts
151 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
152 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
154 $ nubBy (\pt pt' -> snd pt == snd pt')
155 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
157 ToParents -> reverse pts'
160 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
161 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
162 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
163 if (null $ filterPointers proxi thr oldPointers)
164 {- let's find new pointers -}
165 then if null nextPointers
167 else filterPointersByPeriod fil
168 $ head' "phyloGroupMatching"
169 -- Keep only the best set of pointers grouped by proximity
170 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
171 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
172 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
175 nextPointers :: [[(Pointer,[Int])]]
176 nextPointers = take 1
178 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
179 $ scanl (\acc groups ->
180 let periods = nub $ map (fst . fst . fst) $ concat groups
181 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
183 $ filterDiago diagos ([(fst . fst) id] ++ periods)
184 {- important resize nbdocs et diago dans le make pairs -}
185 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
186 in acc ++ ( filterPointers' proxi thr
189 {- process the proximity between the current group and a pair of candidates -}
190 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
191 in if ((c == c') || (snd c == snd c'))
192 then [((fst c,proximity),snd c)]
193 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
194 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
197 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
198 filterDocs d pds = restrictKeys d $ periodsToYears pds
200 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
201 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
204 -----------------------------
205 -- | Matching Processing | --
206 -----------------------------
209 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
210 getNextPeriods fil max' pId pIds =
212 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
213 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
216 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
217 getCandidates ego targets =
219 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
223 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
224 matchGroupsToGroups frame periods proximity thr docs coocs groups =
225 let groups' = groupByField _phylo_groupPeriod groups
226 in foldl' (\acc prd ->
227 let -- 1) find the parents/childs matching periods
228 periodsPar = getNextPeriods ToParents frame prd periods
229 periodsChi = getNextPeriods ToChilds frame prd periods
230 -- 2) find the parents/childs matching candidates
231 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
232 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
233 -- 3) find the parents/child number of docs by years
234 docsPar = filterDocs docs ([prd] ++ periodsPar)
235 docsChi = filterDocs docs ([prd] ++ periodsChi)
236 -- 4) find the parents/child diago by years
237 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
238 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
239 -- 5) match in parallel all the groups (egos) to their possible candidates
241 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
242 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
243 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
244 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
245 in addPointers ToChilds TemporalPointer pointersChi
246 $ addPointers ToParents TemporalPointer pointersPar ego)
247 $ findWithDefault [] prd groups'
248 egos' = egos `using` parList rdeepseq
253 -----------------------
254 -- | Phylo Quality | --
255 -----------------------
258 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
259 relevantBranches term branches =
260 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
262 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
263 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
264 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
265 / (fromIntegral $ length bk'))
268 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
270 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
271 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
272 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
274 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
275 fScore beta x periods bk bx =
276 let rec = recall x bk bx
277 acc = accuracy x periods bk
278 in ((1 + beta ** 2) * acc * rec)
279 / (((beta ** 2) * rec + acc))
282 wk :: [PhyloGroup] -> Double
283 wk bk = fromIntegral $ length bk
286 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
287 toPhyloQuality' beta freq branches =
292 let bks = relevantBranches i branches
293 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
294 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i periods bk bks)) bks))
297 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
298 toRecall freq branches =
304 bx = relevantBranches x branches
305 wks = sum $ map wk bx
306 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
310 pys = sum (elems freq)
313 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
314 toAccuracy freq branches =
320 bx = relevantBranches x branches
321 -- | periods containing x
322 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
323 wks = sum $ map wk bx
324 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
328 pys = sum (elems freq)
331 -- | here we do the average of all the local f_scores
332 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
333 toPhyloQuality fdt beta freq branches =
339 let bx = relevantBranches x branches
340 -- | periods containing x
341 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
342 wks = sum $ map wk bx
343 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
344 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
348 -- pys = sum (elems freq)
350 -- 1 / nb de foundation
352 ------------------------------------
353 -- | Constant Temporal Matching | --
354 ------------------------------------
357 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
358 groupsToBranches' groups =
359 {- run the related component algorithm -}
360 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
361 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
362 $ map (\group -> [getGroupId group]
363 ++ (map fst $ group ^. phylo_groupPeriodParents)
364 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
365 -- first find the related components by inside each ego's period
367 graph' = map relatedComponents egos
368 -- then run it for the all the periods
370 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
371 -- update each group's branch id
372 in map (\(bId,ids) ->
373 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
374 $ elems $ restrictKeys groups (Set.fromList ids)
375 in groups' `using` parList rdeepseq ) graph
378 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
379 reduceFrequency frequency branches =
380 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
382 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
383 updateThr thr branches = map (\b -> map (\g ->
384 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
387 -- Sequentially break each branch of a phylo where
388 -- done = all the allready broken branches
389 -- ego = the current branch we want to break
390 -- rest = the branches we still have to break
391 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
392 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
393 breakBranches fdt proximity beta frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
394 -- 1) keep or not the new division of ego
395 let done' = done ++ (if snd ego
397 (if ((null (fst ego')) || (quality > quality'))
399 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
400 -- <> " | " <> show(length $ fst ego) <> " groups : "
401 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
402 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
405 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
406 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
407 -- <> " | " <> show(length $ fst ego) <> " groups : "
408 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
409 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
410 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
413 -- 2) if there is no more branches in rest then return else continue
416 else breakBranches fdt proximity beta frequency minBranch thr depth elevation frame docs coocs periods
417 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
419 --------------------------------------
421 quality = toPhyloQuality fdt beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
422 --------------------------------------
423 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
425 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
426 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
427 branches' = branches `using` parList rdeepseq
428 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
430 $ depthToMeta (elevation - depth) branches'
431 --------------------------------------
433 quality' = toPhyloQuality fdt beta frequency
434 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
437 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
438 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
439 seaLevelMatching fdt proximity beta minBranch frequency thr step depth elevation frame periods docs coocs branches =
440 -- if there is no branch to break or if seaLvl level > 1 then end
441 if (thr >= 1) || ((not . or) $ map snd branches)
444 -- break all the possible branches at the current seaLvl level
445 let quality = toPhyloQuality fdt beta frequency (map fst branches)
446 acc = toAccuracy frequency (map fst branches)
447 rec = toRecall frequency (map fst branches)
448 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(β) = " <> printf "%.5f" quality
449 <> " ξ = " <> printf "%.5f" acc
450 <> " ρ = " <> printf "%.5f" rec
451 <> " branches = " <> show(length branches) <> " ↴")
452 $ breakBranches fdt proximity beta frequency minBranch thr depth elevation frame docs coocs periods
453 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
454 frequency' = reduceFrequency frequency (map fst branches')
455 in seaLevelMatching fdt proximity beta minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
458 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
459 constanteTemporalMatching start step phylo = updatePhyloGroups 1
460 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
461 (toPhyloHorizon phylo)
463 -- 2) process the temporal matching by elevating seaLvl level
464 branches :: [[PhyloGroup]]
466 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
467 (phyloProximity $ getConfig phylo)
468 (_qua_granularity $ phyloQuality $ getConfig phylo)
469 (_qua_minBranch $ phyloQuality $ getConfig phylo)
470 (phylo ^. phylo_termFreq)
472 ((((1 - start) / step) - 1))
473 (((1 - start) / step))
474 (getTimeFrame $ timeUnit $ getConfig phylo)
476 (phylo ^. phylo_timeDocs)
477 (phylo ^. phylo_timeCooc)
479 -- 1) for each group process an initial temporal Matching
480 -- here we suppose that all the groups of level 1 are part of the same big branch
481 groups :: [([PhyloGroup],Bool)]
482 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
483 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
484 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
485 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
487 (phylo ^. phylo_timeDocs)
488 (phylo ^. phylo_timeCooc)
489 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
495 toPhyloHorizon :: Phylo -> Phylo
496 toPhyloHorizon phylo =
497 let t0 = take 1 (getPeriodIds phylo)
498 groups = getGroupsFromLevelPeriods 1 t0 phylo
499 sens = getSensibility (phyloProximity $ getConfig phylo)
500 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
501 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
502 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
503 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
506 --------------------------------------
507 -- | Adaptative Temporal Matching | --
508 --------------------------------------
511 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
512 thrToMeta thr branches =
514 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
516 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
517 depthToMeta depth branches =
518 let break = length branches > 1
521 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
524 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
525 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
528 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
530 | isJust (m !? ( k ,k')) = m ! ( k ,k')
531 | isJust (m !? ( k',k )) = m ! ( k',k )
535 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
536 toThreshold lvl proxiGroups =
537 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
539 then (sort $ elems proxiGroups) !! idx
543 -- done = all the allready broken branches
544 -- ego = the current branch we want to break
545 -- rest = the branches we still have to break
546 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
547 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
548 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
549 -> [([PhyloGroup],(Bool,[Double]))]
550 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods done ego rest =
551 -- 1) keep or not the new division of ego
552 let done' = done ++ (if (fst . snd) ego
553 then (if ((null (fst ego')) || (quality > quality'))
555 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
557 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
558 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
559 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
561 -- uncomment let .. in for debugging
562 -- let part1 = partition (snd) done'
563 -- part2 = partition (snd) rest
564 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
565 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
567 -- 2) if there is no more branches in rest then return else continue
570 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
571 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
573 --------------------------------------
575 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
576 --------------------------------------
578 quality = toPhyloQuality fdt beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
579 --------------------------------------
580 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
582 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
583 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
584 branches' = branches `using` parList rdeepseq
585 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
587 $ depthToMeta (elevation - depth) branches'
588 --------------------------------------
590 quality' = toPhyloQuality fdt beta frequency
591 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
594 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
595 -> Double -> Int -> Map Int Double
596 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
597 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
598 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi beta minBranch frequency frame periods docs coocs branches =
599 -- if there is no branch to break or if seaLvl level >= depth then end
600 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
603 -- break all the possible branches at the current seaLvl level
604 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
605 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
606 frequency' = reduceFrequency frequency (map fst branches')
607 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
608 -- thr = toThreshold depth groupsProxi
609 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
610 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
611 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
613 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' beta minBranch frequency' frame periods docs coocs branches'
616 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
617 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
618 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
619 (toPhyloHorizon phylo)
621 -- 2) process the temporal matching by elevating seaLvl level
622 branches :: [[PhyloGroup]]
624 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
625 (phyloProximity $ getConfig phylo)
628 (phylo ^. phylo_groupsProxi)
629 (_qua_granularity $ phyloQuality $ getConfig phylo)
630 (_qua_minBranch $ phyloQuality $ getConfig phylo)
631 (phylo ^. phylo_termFreq)
632 (getTimeFrame $ timeUnit $ getConfig phylo)
634 (phylo ^. phylo_timeDocs)
635 (phylo ^. phylo_timeCooc)
637 -- 1) for each group process an initial temporal Matching
638 -- here we suppose that all the groups of level 1 are part of the same big branch
639 groups :: [([PhyloGroup],(Bool,[Double]))]
640 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
641 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
642 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
643 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
645 (phylo ^. phylo_timeDocs)
646 (phylo ^. phylo_timeCooc)
647 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
648 --------------------------------------
650 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)