2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
17 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
18 import Debug.Trace (trace)
19 import Gargantext.Core.Viz.Phylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Prelude
22 import Prelude (floor,tan,pi)
24 import qualified Data.Map as Map
25 import qualified Data.Set as Set
26 import qualified Data.Vector as Vector
34 -- | To compute a jaccard similarity between two lists
35 jaccard :: [Int] -> [Int] -> Double
36 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
39 -- | Process the inverse sumLog
40 sumInvLog' :: Double -> Double -> [Double] -> Double
41 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
44 -- | Process the sumLog
45 sumLog' :: Double -> Double -> [Double] -> Double
46 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
49 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
50 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
51 | null ngramsInter = 0
52 | ngramsInter == ngramsUnion = 1
53 | sens == 0 = jaccard ngramsInter ngramsUnion
54 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
55 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
57 --------------------------------------
59 ngramsInter = intersect ngrams ngrams'
60 --------------------------------------
62 ngramsUnion = union ngrams ngrams'
63 --------------------------------------
64 diagoInter :: [Double]
65 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
66 --------------------------------------
67 diagoUnion :: [Double]
68 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
69 --------------------------------------
71 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
72 -- tests not conclusive
73 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
74 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
75 | null ngramsInter = 0
76 | ngramsInter == ngramsUnion = 1
77 | sens == 0 = jaccard ngramsInter ngramsUnion
78 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
79 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
81 --------------------------------------
83 ngramsInter = intersect ego_ngrams target_ngrams
84 --------------------------------------
86 ngramsUnion = union ego_ngrams target_ngrams
87 --------------------------------------
88 diagoInter :: [Double]
89 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
90 --------------------------------------
92 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
93 --------------------------------------
94 diagoTarget :: [Double]
95 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
96 --------------------------------------
98 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
99 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
100 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
102 WeightedLogJaccard sens ->
103 let pairNgrams = if targetNgrams == targetNgrams'
105 else union targetNgrams targetNgrams'
106 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
107 WeightedLogSim sens ->
108 let pairNgrams = if targetNgrams == targetNgrams'
110 else union targetNgrams targetNgrams'
111 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
112 Hamming _ -> undefined
114 ------------------------
115 -- | Local Matching | --
116 ------------------------
118 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
119 findLastPeriod fil periods = case fil of
120 ToParents -> head' "findLastPeriod" (sortOn fst periods)
121 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
122 ToChildsMemory -> undefined
123 ToParentsMemory -> undefined
126 -- | To filter pairs of candidates related to old pointers periods
127 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
128 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
129 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
130 removeOldPointers oldPointers fil thr prox prd pairs
131 | null oldPointers = pairs
132 | null (filterPointers prox thr oldPointers) =
133 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
134 in if lastMatchedPrd == prd
136 else filter (\((id,_),(id',_)) ->
138 ToChildsMemory -> undefined
139 ToParentsMemory -> undefined
140 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
141 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
142 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
143 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
147 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
148 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
149 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
152 else removeOldPointers oldPointers fil thr prox lastPrd
153 {- at least on of the pair candidates should be from the last added period -}
154 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
156 $ filter (\(id,ngrams) ->
157 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
158 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
159 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
162 lastPrd :: PhyloPeriodId
163 lastPrd = findLastPeriod fil periods
166 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
167 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
169 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
170 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
173 reduceDiagos :: Map Date Cooc -> Map Int Double
174 reduceDiagos diagos = mapKeys (\(k,_) -> k)
175 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
177 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
178 filterPointersByPeriod fil pts =
179 let pts' = sortOn (fst . fst . fst . fst) pts
180 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
181 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
183 $ nubBy (\pt pt' -> snd pt == snd pt')
184 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
186 ToParents -> reverse pts'
188 ToChildsMemory -> undefined
189 ToParentsMemory -> undefined
191 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
192 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
193 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
194 if (null $ filterPointers proxi thr oldPointers)
195 {- let's find new pointers -}
196 then if null nextPointers
198 else filterPointersByPeriod fil
199 $ head' "phyloGroupMatching"
200 -- Keep only the best set of pointers grouped by proximity
201 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
202 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
203 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
206 nextPointers :: [[(Pointer,[Int])]]
207 nextPointers = take 1
209 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
210 $ scanl (\acc groups ->
211 let periods = nub $ map (fst . fst . fst) $ concat groups
212 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
214 $ filterDiago diagos ([(fst . fst) id] ++ periods)
215 {- important resize nbdocs et diago dans le make pairs -}
216 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
217 in acc ++ ( filterPointers' proxi thr
220 {- process the proximity between the current group and a pair of candidates -}
221 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
222 in if ((c == c') || (snd c == snd c'))
223 then [((fst c,proximity),snd c)]
224 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
225 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
228 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
229 filterDocs d pds = restrictKeys d $ periodsToYears pds
231 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
232 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
235 -----------------------------
236 -- | Matching Processing | --
237 -----------------------------
240 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
241 getNextPeriods fil max' pId pIds =
243 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
244 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
245 ToChildsMemory -> undefined
246 ToParentsMemory -> undefined
249 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
250 getCandidates ego targets =
252 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
256 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
257 matchGroupsToGroups frame periods proximity thr docs coocs groups =
258 let groups' = groupByField _phylo_groupPeriod groups
259 in foldl' (\acc prd ->
260 let -- 1) find the parents/childs matching periods
261 periodsPar = getNextPeriods ToParents frame prd periods
262 periodsChi = getNextPeriods ToChilds frame prd periods
263 -- 2) find the parents/childs matching candidates
264 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
265 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
266 -- 3) find the parents/child number of docs by years
267 docsPar = filterDocs docs ([prd] ++ periodsPar)
268 docsChi = filterDocs docs ([prd] ++ periodsChi)
269 -- 4) find the parents/child diago by years
270 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
271 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
272 -- 5) match in parallel all the groups (egos) to their possible candidates
274 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
275 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
276 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
277 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
278 in addPointers ToChilds TemporalPointer pointersChi
279 $ addPointers ToParents TemporalPointer pointersPar
280 $ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
281 $ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego)
282 $ findWithDefault [] prd groups'
283 egos' = egos `using` parList rdeepseq
288 -----------------------
289 -- | Phylo Quality | --
290 -----------------------
293 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
294 relevantBranches term branches =
295 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
297 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
298 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
299 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
300 / (fromIntegral $ length bk'))
303 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
305 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
306 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
307 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
309 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
310 fScore lambda x periods bk bx =
311 let rec = recall x bk bx
312 acc = accuracy x periods bk
313 in ((1 + lambda ** 2) * acc * rec)
314 / (((lambda ** 2) * acc + rec))
317 wk :: [PhyloGroup] -> Double
318 wk bk = fromIntegral $ length bk
321 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
322 toPhyloQuality' lambda freq branches =
327 let bks = relevantBranches i branches
328 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
329 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore lambda i periods bk bks)) bks))
332 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
333 toRecall freq branches =
339 bx = relevantBranches x branches
340 wks = sum $ map wk bx
341 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
345 pys = sum (elems freq)
348 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
349 toAccuracy freq branches =
355 bx = relevantBranches x branches
356 -- | periods containing x
357 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
358 wks = sum $ map wk bx
359 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
363 pys = sum (elems freq)
366 -- | here we do the average of all the local f_scores
367 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
368 toPhyloQuality fdt lambda freq branches =
374 let bx = relevantBranches x branches
375 -- | periods containing x
376 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
377 wks = sum $ map wk bx
378 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
379 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
380 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
384 -- pys = sum (elems freq)
386 -- 1 / nb de foundation
388 ------------------------------------
389 -- | Constant Temporal Matching | --
390 ------------------------------------
393 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
394 groupsToBranches' groups =
395 {- run the related component algorithm -}
396 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
397 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
398 $ map (\group -> [getGroupId group]
399 ++ (map fst $ group ^. phylo_groupPeriodParents)
400 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
401 -- first find the related components by inside each ego's period
403 graph' = map relatedComponents egos
404 -- then run it for the all the periods
406 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
407 -- update each group's branch id
408 in map (\(bId,ids) ->
409 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
410 $ elems $ restrictKeys groups (Set.fromList ids)
411 in groups' `using` parList rdeepseq ) graph
414 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
415 reduceFrequency frequency branches =
416 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
418 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
419 updateThr thr branches = map (\b -> map (\g ->
420 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
423 -- Sequentially break each branch of a phylo where
424 -- done = all the allready broken branches
425 -- ego = the current branch we want to break
426 -- rest = the branches we still have to break
427 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
428 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
429 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
430 -- 1) keep or not the new division of ego
431 let done' = done ++ (if snd ego
433 (if ((null (fst ego')) || (quality > quality'))
435 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
436 -- <> " | " <> show(length $ fst ego) <> " groups : "
437 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
438 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
441 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
442 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
443 -- <> " | " <> show(length $ fst ego) <> " groups : "
444 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
445 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
446 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
449 -- 2) if there is no more branches in rest then return else continue
452 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
453 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
455 --------------------------------------
457 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
458 --------------------------------------
459 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
461 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
462 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
463 branches' = branches `using` parList rdeepseq
464 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
466 $ depthToMeta (elevation - depth) branches'
467 --------------------------------------
469 quality' = toPhyloQuality fdt lambda frequency
470 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
473 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
474 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
475 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
476 -- if there is no branch to break or if seaLvl level > 1 then end
477 if (thr >= 1) || ((not . or) $ map snd branches)
480 -- break all the possible branches at the current seaLvl level
481 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
482 acc = toAccuracy frequency (map fst branches)
483 rec = toRecall frequency (map fst branches)
484 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
485 <> " ξ = " <> printf "%.5f" acc
486 <> " ρ = " <> printf "%.5f" rec
487 <> " branches = " <> show(length branches) <> " ↴")
488 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
489 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
490 frequency' = reduceFrequency frequency (map fst branches')
491 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
494 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
495 constanteTemporalMatching start step phylo = updatePhyloGroups 1
496 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
497 (toPhyloHorizon phylo)
499 -- 2) process the temporal matching by elevating seaLvl level
500 branches :: [[PhyloGroup]]
502 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
503 (phyloProximity $ getConfig phylo)
504 (_qua_granularity $ phyloQuality $ getConfig phylo)
505 (_qua_minBranch $ phyloQuality $ getConfig phylo)
506 (phylo ^. phylo_termFreq)
508 ((((1 - start) / step) - 1))
509 (((1 - start) / step))
510 (getTimeFrame $ timeUnit $ getConfig phylo)
512 (phylo ^. phylo_timeDocs)
513 (phylo ^. phylo_timeCooc)
514 (reverse $ sortOn (length . fst) groups)
515 -- 1) for each group process an initial temporal Matching
516 -- here we suppose that all the groups of level 1 are part of the same big branch
517 groups :: [([PhyloGroup],Bool)]
518 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
519 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
520 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
521 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
523 (phylo ^. phylo_timeDocs)
524 (phylo ^. phylo_timeCooc)
525 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
531 toPhyloHorizon :: Phylo -> Phylo
532 toPhyloHorizon phylo =
533 let t0 = take 1 (getPeriodIds phylo)
534 groups = getGroupsFromLevelPeriods 1 t0 phylo
535 sens = getSensibility (phyloProximity $ getConfig phylo)
536 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
537 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
538 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
539 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
542 --------------------------------------
543 -- | Adaptative Temporal Matching | --
544 --------------------------------------
547 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
548 thrToMeta thr branches =
550 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
552 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
553 depthToMeta depth branches =
554 let break = length branches > 1
557 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
560 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
561 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
564 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
566 | isJust (m !? ( k ,k')) = m ! ( k ,k')
567 | isJust (m !? ( k',k )) = m ! ( k',k )
571 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
572 toThreshold lvl proxiGroups =
573 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
575 then (sort $ elems proxiGroups) !! idx
579 -- done = all the allready broken branches
580 -- ego = the current branch we want to break
581 -- rest = the branches we still have to break
582 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
583 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
584 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
585 -> [([PhyloGroup],(Bool,[Double]))]
586 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
587 -- 1) keep or not the new division of ego
588 let done' = done ++ (if (fst . snd) ego
589 then (if ((null (fst ego')) || (quality > quality'))
591 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
593 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
594 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
595 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
597 -- uncomment let .. in for debugging
598 -- let part1 = partition (snd) done'
599 -- part2 = partition (snd) rest
600 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
601 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
603 -- 2) if there is no more branches in rest then return else continue
606 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
607 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
609 --------------------------------------
611 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
612 --------------------------------------
614 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
615 --------------------------------------
616 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
618 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
619 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
620 branches' = branches `using` parList rdeepseq
621 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
623 $ depthToMeta (elevation - depth) branches'
624 --------------------------------------
626 quality' = toPhyloQuality fdt lambda frequency
627 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
630 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
631 -> Double -> Int -> Map Int Double
632 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
633 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
634 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
635 -- if there is no branch to break or if seaLvl level >= depth then end
636 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
639 -- break all the possible branches at the current seaLvl level
640 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
641 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
642 frequency' = reduceFrequency frequency (map fst branches')
643 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
644 -- thr = toThreshold depth groupsProxi
645 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
646 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
647 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
649 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
652 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
653 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
654 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
655 (toPhyloHorizon phylo)
657 -- 2) process the temporal matching by elevating seaLvl level
658 branches :: [[PhyloGroup]]
660 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
661 (phyloProximity $ getConfig phylo)
664 (phylo ^. phylo_groupsProxi)
665 (_qua_granularity $ phyloQuality $ getConfig phylo)
666 (_qua_minBranch $ phyloQuality $ getConfig phylo)
667 (phylo ^. phylo_termFreq)
668 (getTimeFrame $ timeUnit $ getConfig phylo)
670 (phylo ^. phylo_timeDocs)
671 (phylo ^. phylo_timeCooc)
673 -- 1) for each group process an initial temporal Matching
674 -- here we suppose that all the groups of level 1 are part of the same big branch
675 groups :: [([PhyloGroup],(Bool,[Double]))]
676 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
677 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
678 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
679 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
681 (phylo ^. phylo_timeDocs)
682 (phylo ^. phylo_timeCooc)
683 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
684 --------------------------------------
686 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)