2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
17 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
18 import Debug.Trace (trace)
19 import Gargantext.Core.Viz.Phylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Prelude
22 import Prelude (floor,tan,pi)
24 import qualified Data.Map as Map
25 import qualified Data.Set as Set
26 import qualified Data.Vector as Vector
34 -- | To compute a jaccard similarity between two lists
35 jaccard :: [Int] -> [Int] -> Double
36 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
39 -- | Process the inverse sumLog
40 sumInvLog' :: Double -> Double -> [Double] -> Double
41 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
44 -- | Process the sumLog
45 sumLog' :: Double -> Double -> [Double] -> Double
46 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
49 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
50 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
51 | null ngramsInter = 0
52 | ngramsInter == ngramsUnion = 1
53 | sens == 0 = jaccard ngramsInter ngramsUnion
54 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
55 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
57 --------------------------------------
59 ngramsInter = intersect ngrams ngrams'
60 --------------------------------------
62 ngramsUnion = union ngrams ngrams'
63 --------------------------------------
64 diagoInter :: [Double]
65 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
66 --------------------------------------
67 diagoUnion :: [Double]
68 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
69 --------------------------------------
71 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
72 -- tests not conclusive
73 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
74 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
75 | null ngramsInter = 0
76 | ngramsInter == ngramsUnion = 1
77 | sens == 0 = jaccard ngramsInter ngramsUnion
78 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
79 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
81 --------------------------------------
83 ngramsInter = intersect ego_ngrams target_ngrams
84 --------------------------------------
86 ngramsUnion = union ego_ngrams target_ngrams
87 --------------------------------------
88 diagoInter :: [Double]
89 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
90 --------------------------------------
92 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
93 --------------------------------------
94 diagoTarget :: [Double]
95 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
96 --------------------------------------
98 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
99 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
100 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
102 WeightedLogJaccard sens ->
103 let pairNgrams = if targetNgrams == targetNgrams'
105 else union targetNgrams targetNgrams'
106 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
107 WeightedLogSim sens ->
108 let pairNgrams = if targetNgrams == targetNgrams'
110 else union targetNgrams targetNgrams'
111 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
114 ------------------------
115 -- | Local Matching | --
116 ------------------------
118 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
119 findLastPeriod fil periods = case fil of
120 ToParents -> head' "findLastPeriod" (sortOn fst periods)
121 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
124 -- | To filter pairs of candidates related to old pointers periods
125 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
126 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
127 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
128 removeOldPointers oldPointers fil thr prox prd pairs
129 | null oldPointers = pairs
130 | null (filterPointers prox thr oldPointers) =
131 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
132 in if lastMatchedPrd == prd
134 else filter (\((id,_),(id',_)) ->
136 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
137 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
138 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
139 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
143 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
144 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
145 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
148 else removeOldPointers oldPointers fil thr prox lastPrd
149 {- at least on of the pair candidates should be from the last added period -}
150 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
152 $ filter (\(id,ngrams) ->
153 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
154 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
155 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
158 lastPrd :: PhyloPeriodId
159 lastPrd = findLastPeriod fil periods
162 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
163 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
165 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
166 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
169 reduceDiagos :: Map Date Cooc -> Map Int Double
170 reduceDiagos diagos = mapKeys (\(k,_) -> k)
171 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
173 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
174 filterPointersByPeriod fil pts =
175 let pts' = sortOn (fst . fst . fst . fst) pts
176 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
177 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
179 $ nubBy (\pt pt' -> snd pt == snd pt')
180 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
182 ToParents -> reverse pts'
185 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
186 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
187 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
188 if (null $ filterPointers proxi thr oldPointers)
189 {- let's find new pointers -}
190 then if null nextPointers
192 else filterPointersByPeriod fil
193 $ head' "phyloGroupMatching"
194 -- Keep only the best set of pointers grouped by proximity
195 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
196 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
197 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
200 nextPointers :: [[(Pointer,[Int])]]
201 nextPointers = take 1
203 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
204 $ scanl (\acc groups ->
205 let periods = nub $ map (fst . fst . fst) $ concat groups
206 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
208 $ filterDiago diagos ([(fst . fst) id] ++ periods)
209 {- important resize nbdocs et diago dans le make pairs -}
210 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
211 in acc ++ ( filterPointers' proxi thr
214 {- process the proximity between the current group and a pair of candidates -}
215 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
216 in if ((c == c') || (snd c == snd c'))
217 then [((fst c,proximity),snd c)]
218 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
219 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
222 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
223 filterDocs d pds = restrictKeys d $ periodsToYears pds
225 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
226 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
229 -----------------------------
230 -- | Matching Processing | --
231 -----------------------------
234 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
235 getNextPeriods fil max' pId pIds =
237 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
238 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
241 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
242 getCandidates ego targets =
244 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
248 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
249 matchGroupsToGroups frame periods proximity thr docs coocs groups =
250 let groups' = groupByField _phylo_groupPeriod groups
251 in foldl' (\acc prd ->
252 let -- 1) find the parents/childs matching periods
253 periodsPar = getNextPeriods ToParents frame prd periods
254 periodsChi = getNextPeriods ToChilds frame prd periods
255 -- 2) find the parents/childs matching candidates
256 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
257 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
258 -- 3) find the parents/child number of docs by years
259 docsPar = filterDocs docs ([prd] ++ periodsPar)
260 docsChi = filterDocs docs ([prd] ++ periodsChi)
261 -- 4) find the parents/child diago by years
262 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
263 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
264 -- 5) match in parallel all the groups (egos) to their possible candidates
266 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
267 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
268 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
269 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
270 in addPointers ToChilds TemporalPointer pointersChi
271 $ addPointers ToParents TemporalPointer pointersPar ego)
272 $ findWithDefault [] prd groups'
273 egos' = egos `using` parList rdeepseq
278 -----------------------
279 -- | Phylo Quality | --
280 -----------------------
283 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
284 relevantBranches term branches =
285 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
287 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
288 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
289 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
290 / (fromIntegral $ length bk'))
293 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
295 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
296 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
297 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
299 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
300 fScore lambda x periods bk bx =
301 let rec = recall x bk bx
302 acc = accuracy x periods bk
303 in ((1 + lambda ** 2) * acc * rec)
304 / (((lambda ** 2) * acc + rec))
307 wk :: [PhyloGroup] -> Double
308 wk bk = fromIntegral $ length bk
311 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
312 toPhyloQuality' lambda freq branches =
317 let bks = relevantBranches i branches
318 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
319 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore lambda i periods bk bks)) bks))
322 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
323 toRecall freq branches =
329 bx = relevantBranches x branches
330 wks = sum $ map wk bx
331 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
335 pys = sum (elems freq)
338 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
339 toAccuracy freq branches =
345 bx = relevantBranches x branches
346 -- | periods containing x
347 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
348 wks = sum $ map wk bx
349 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
353 pys = sum (elems freq)
356 -- | here we do the average of all the local f_scores
357 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
358 toPhyloQuality fdt lambda freq branches =
364 let bx = relevantBranches x branches
365 -- | periods containing x
366 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
367 wks = sum $ map wk bx
368 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
369 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
370 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
374 -- pys = sum (elems freq)
376 -- 1 / nb de foundation
378 ------------------------------------
379 -- | Constant Temporal Matching | --
380 ------------------------------------
383 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
384 groupsToBranches' groups =
385 {- run the related component algorithm -}
386 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
387 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
388 $ map (\group -> [getGroupId group]
389 ++ (map fst $ group ^. phylo_groupPeriodParents)
390 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
391 -- first find the related components by inside each ego's period
393 graph' = map relatedComponents egos
394 -- then run it for the all the periods
396 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
397 -- update each group's branch id
398 in map (\(bId,ids) ->
399 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
400 $ elems $ restrictKeys groups (Set.fromList ids)
401 in groups' `using` parList rdeepseq ) graph
404 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
405 reduceFrequency frequency branches =
406 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
408 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
409 updateThr thr branches = map (\b -> map (\g ->
410 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
413 -- Sequentially break each branch of a phylo where
414 -- done = all the allready broken branches
415 -- ego = the current branch we want to break
416 -- rest = the branches we still have to break
417 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
418 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
419 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
420 -- 1) keep or not the new division of ego
421 let done' = done ++ (if snd ego
423 (if ((null (fst ego')) || (quality > quality'))
425 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
426 -- <> " | " <> show(length $ fst ego) <> " groups : "
427 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
428 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
431 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
432 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
433 -- <> " | " <> show(length $ fst ego) <> " groups : "
434 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
435 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
436 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
439 -- 2) if there is no more branches in rest then return else continue
442 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
443 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
445 --------------------------------------
447 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
448 --------------------------------------
449 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
451 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
452 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
453 branches' = branches `using` parList rdeepseq
454 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
456 $ depthToMeta (elevation - depth) branches'
457 --------------------------------------
459 quality' = toPhyloQuality fdt lambda frequency
460 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
463 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
464 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
465 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
466 -- if there is no branch to break or if seaLvl level > 1 then end
467 if (thr >= 1) || ((not . or) $ map snd branches)
470 -- break all the possible branches at the current seaLvl level
471 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
472 acc = toAccuracy frequency (map fst branches)
473 rec = toRecall frequency (map fst branches)
474 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
475 <> " ξ = " <> printf "%.5f" acc
476 <> " ρ = " <> printf "%.5f" rec
477 <> " branches = " <> show(length branches) <> " ↴")
478 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
479 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
480 frequency' = reduceFrequency frequency (map fst branches')
481 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
484 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
485 constanteTemporalMatching start step phylo = updatePhyloGroups 1
486 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
487 (toPhyloHorizon phylo)
489 -- 2) process the temporal matching by elevating seaLvl level
490 branches :: [[PhyloGroup]]
492 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
493 (phyloProximity $ getConfig phylo)
494 (_qua_granularity $ phyloQuality $ getConfig phylo)
495 (_qua_minBranch $ phyloQuality $ getConfig phylo)
496 (phylo ^. phylo_termFreq)
498 ((((1 - start) / step) - 1))
499 (((1 - start) / step))
500 (getTimeFrame $ timeUnit $ getConfig phylo)
502 (phylo ^. phylo_timeDocs)
503 (phylo ^. phylo_timeCooc)
504 (reverse $ sortOn (length . fst) groups)
505 -- 1) for each group process an initial temporal Matching
506 -- here we suppose that all the groups of level 1 are part of the same big branch
507 groups :: [([PhyloGroup],Bool)]
508 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
509 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
510 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
511 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
513 (phylo ^. phylo_timeDocs)
514 (phylo ^. phylo_timeCooc)
515 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
521 toPhyloHorizon :: Phylo -> Phylo
522 toPhyloHorizon phylo =
523 let t0 = take 1 (getPeriodIds phylo)
524 groups = getGroupsFromLevelPeriods 1 t0 phylo
525 sens = getSensibility (phyloProximity $ getConfig phylo)
526 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
527 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
528 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
529 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
532 --------------------------------------
533 -- | Adaptative Temporal Matching | --
534 --------------------------------------
537 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
538 thrToMeta thr branches =
540 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
542 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
543 depthToMeta depth branches =
544 let break = length branches > 1
547 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
550 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
551 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
554 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
556 | isJust (m !? ( k ,k')) = m ! ( k ,k')
557 | isJust (m !? ( k',k )) = m ! ( k',k )
561 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
562 toThreshold lvl proxiGroups =
563 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
565 then (sort $ elems proxiGroups) !! idx
569 -- done = all the allready broken branches
570 -- ego = the current branch we want to break
571 -- rest = the branches we still have to break
572 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
573 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
574 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
575 -> [([PhyloGroup],(Bool,[Double]))]
576 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
577 -- 1) keep or not the new division of ego
578 let done' = done ++ (if (fst . snd) ego
579 then (if ((null (fst ego')) || (quality > quality'))
581 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
583 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
584 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
585 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
587 -- uncomment let .. in for debugging
588 -- let part1 = partition (snd) done'
589 -- part2 = partition (snd) rest
590 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
591 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
593 -- 2) if there is no more branches in rest then return else continue
596 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
597 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
599 --------------------------------------
601 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
602 --------------------------------------
604 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
605 --------------------------------------
606 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
608 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
609 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
610 branches' = branches `using` parList rdeepseq
611 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
613 $ depthToMeta (elevation - depth) branches'
614 --------------------------------------
616 quality' = toPhyloQuality fdt lambda frequency
617 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
620 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
621 -> Double -> Int -> Map Int Double
622 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
623 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
624 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
625 -- if there is no branch to break or if seaLvl level >= depth then end
626 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
629 -- break all the possible branches at the current seaLvl level
630 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
631 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
632 frequency' = reduceFrequency frequency (map fst branches')
633 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
634 -- thr = toThreshold depth groupsProxi
635 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
636 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
637 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
639 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
642 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
643 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
644 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
645 (toPhyloHorizon phylo)
647 -- 2) process the temporal matching by elevating seaLvl level
648 branches :: [[PhyloGroup]]
650 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
651 (phyloProximity $ getConfig phylo)
654 (phylo ^. phylo_groupsProxi)
655 (_qua_granularity $ phyloQuality $ getConfig phylo)
656 (_qua_minBranch $ phyloQuality $ getConfig phylo)
657 (phylo ^. phylo_termFreq)
658 (getTimeFrame $ timeUnit $ getConfig phylo)
660 (phylo ^. phylo_timeDocs)
661 (phylo ^. phylo_timeCooc)
663 -- 1) for each group process an initial temporal Matching
664 -- here we suppose that all the groups of level 1 are part of the same big branch
665 groups :: [([PhyloGroup],(Bool,[Double]))]
666 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
667 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
668 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
669 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
671 (phylo ^. phylo_timeDocs)
672 (phylo ^. phylo_timeCooc)
673 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
674 --------------------------------------
676 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)