2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
17 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
18 import Debug.Trace (trace)
19 import Gargantext.Core.Viz.Phylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Prelude
22 import Prelude (floor,tan,pi)
24 import qualified Data.Map as Map
25 import qualified Data.Set as Set
26 import qualified Data.Vector as Vector
34 -- | To compute a jaccard similarity between two lists
35 jaccard :: [Int] -> [Int] -> Double
36 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
39 -- | Process the inverse sumLog
40 sumInvLog' :: Double -> Double -> [Double] -> Double
41 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
44 -- | Process the sumLog
45 sumLog' :: Double -> Double -> [Double] -> Double
46 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
49 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
50 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
51 | null ngramsInter = 0
52 | ngramsInter == ngramsUnion = 1
53 | sens == 0 = jaccard ngramsInter ngramsUnion
54 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
55 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
57 --------------------------------------
59 ngramsInter = intersect ngrams ngrams'
60 --------------------------------------
62 ngramsUnion = union ngrams ngrams'
63 --------------------------------------
64 diagoInter :: [Double]
65 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
66 --------------------------------------
67 diagoUnion :: [Double]
68 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
69 --------------------------------------
71 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
72 -- tests not conclusive
73 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
74 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
75 | null ngramsInter = 0
76 | ngramsInter == ngramsUnion = 1
77 | sens == 0 = jaccard ngramsInter ngramsUnion
78 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
79 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
81 --------------------------------------
83 ngramsInter = intersect ego_ngrams target_ngrams
84 --------------------------------------
86 ngramsUnion = union ego_ngrams target_ngrams
87 --------------------------------------
88 diagoInter :: [Double]
89 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
90 --------------------------------------
92 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
93 --------------------------------------
94 diagoTarget :: [Double]
95 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
96 --------------------------------------
98 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
99 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
100 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
102 WeightedLogJaccard sens ->
103 let pairNgrams = if targetNgrams == targetNgrams'
105 else union targetNgrams targetNgrams'
106 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
107 WeightedLogSim sens ->
108 let pairNgrams = if targetNgrams == targetNgrams'
110 else union targetNgrams targetNgrams'
111 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
112 Hamming _ -> undefined
114 ------------------------
115 -- | Local Matching | --
116 ------------------------
118 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
119 findLastPeriod fil periods = case fil of
120 ToParents -> head' "findLastPeriod" (sortOn fst periods)
121 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
122 ToChildsMemory -> undefined
123 ToParentsMemory -> undefined
126 -- | To filter pairs of candidates related to old pointers periods
127 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
128 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
129 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
130 removeOldPointers oldPointers fil thr prox prd pairs
131 | null oldPointers = pairs
132 | null (filterPointers prox thr oldPointers) =
133 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
134 in if lastMatchedPrd == prd
136 else filter (\((id,_),(id',_)) ->
138 ToChildsMemory -> undefined
139 ToParentsMemory -> undefined
140 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
141 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
142 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
143 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
147 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
148 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
149 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
152 else removeOldPointers oldPointers fil thr prox lastPrd
153 {- at least on of the pair candidates should be from the last added period -}
154 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
156 $ filter (\(id,ngrams) ->
157 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
158 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
159 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
162 lastPrd :: PhyloPeriodId
163 lastPrd = findLastPeriod fil periods
166 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
167 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
169 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
170 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
173 reduceDiagos :: Map Date Cooc -> Map Int Double
174 reduceDiagos diagos = mapKeys (\(k,_) -> k)
175 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
177 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
178 filterPointersByPeriod fil pts =
179 let pts' = sortOn (fst . fst . fst . fst) pts
180 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
181 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
183 $ nubBy (\pt pt' -> snd pt == snd pt')
184 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
186 ToParents -> reverse pts'
188 ToChildsMemory -> undefined
189 ToParentsMemory -> undefined
191 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
192 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
193 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
194 if (null $ filterPointers proxi thr oldPointers)
195 {- let's find new pointers -}
196 then if null nextPointers
198 else filterPointersByPeriod fil
199 $ head' "phyloGroupMatching"
200 -- Keep only the best set of pointers grouped by proximity
201 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
202 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
203 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
206 nextPointers :: [[(Pointer,[Int])]]
207 nextPointers = take 1
209 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
210 $ scanl (\acc groups ->
211 let periods = nub $ map (fst . fst . fst) $ concat groups
212 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
214 $ filterDiago diagos ([(fst . fst) id] ++ periods)
215 {- important resize nbdocs et diago dans le make pairs -}
216 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
217 in acc ++ ( filterPointers' proxi thr
220 {- process the proximity between the current group and a pair of candidates -}
221 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
222 in if ((c == c') || (snd c == snd c'))
223 then [((fst c,proximity),snd c)]
224 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
225 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
228 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
229 filterDocs d pds = restrictKeys d $ periodsToYears pds
231 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
232 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
235 -----------------------------
236 -- | Matching Processing | --
237 -----------------------------
240 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
241 getNextPeriods fil max' pId pIds =
243 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
244 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
245 ToChildsMemory -> undefined
246 ToParentsMemory -> undefined
249 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
250 getCandidates ego targets =
251 if (length (ego ^. phylo_groupNgrams)) > 1
253 map (\groups' -> filter (\g' -> (> 1) $ length $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
255 map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
258 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
259 matchGroupsToGroups frame periods proximity thr docs coocs groups =
260 let groups' = groupByField _phylo_groupPeriod groups
261 in foldl' (\acc prd ->
262 let -- 1) find the parents/childs matching periods
263 periodsPar = getNextPeriods ToParents frame prd periods
264 periodsChi = getNextPeriods ToChilds frame prd periods
265 -- 2) find the parents/childs matching candidates
266 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
267 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
268 -- 3) find the parents/child number of docs by years
269 docsPar = filterDocs docs ([prd] ++ periodsPar)
270 docsChi = filterDocs docs ([prd] ++ periodsChi)
271 -- 4) find the parents/child diago by years
272 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
273 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
274 -- 5) match in parallel all the groups (egos) to their possible candidates
276 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
277 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
278 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
279 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
280 in addPointers ToChilds TemporalPointer pointersChi
281 $ addPointers ToParents TemporalPointer pointersPar
282 $ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
283 $ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego)
284 $ findWithDefault [] prd groups'
285 egos' = egos `using` parList rdeepseq
290 -----------------------
291 -- | Phylo Quality | --
292 -----------------------
295 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
296 relevantBranches term branches =
297 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
299 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
300 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
301 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
302 / (fromIntegral $ length bk'))
305 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
307 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
308 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
309 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
311 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
312 fScore lambda x periods bk bx =
313 let rec = recall x bk bx
314 acc = accuracy x periods bk
315 in ((1 + lambda ** 2) * acc * rec)
316 / (((lambda ** 2) * acc + rec))
319 wk :: [PhyloGroup] -> Double
320 wk bk = fromIntegral $ length bk
323 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
324 toPhyloQuality' lambda freq branches =
329 let bks = relevantBranches i branches
330 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
331 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore lambda i periods bk bks)) bks))
334 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
335 toRecall freq branches =
341 bx = relevantBranches x branches
342 wks = sum $ map wk bx
343 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
347 pys = sum (elems freq)
350 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
351 toAccuracy freq branches =
357 bx = relevantBranches x branches
358 -- | periods containing x
359 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
360 wks = sum $ map wk bx
361 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
365 pys = sum (elems freq)
368 -- | here we do the average of all the local f_scores
369 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
370 toPhyloQuality fdt lambda freq branches =
376 let bx = relevantBranches x branches
377 -- | periods containing x
378 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
379 wks = sum $ map wk bx
380 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
381 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
382 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
386 -- pys = sum (elems freq)
388 -- 1 / nb de foundation
390 ------------------------------------
391 -- | Constant Temporal Matching | --
392 ------------------------------------
395 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
396 groupsToBranches' groups =
397 {- run the related component algorithm -}
398 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
399 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
400 $ map (\group -> [getGroupId group]
401 ++ (map fst $ group ^. phylo_groupPeriodParents)
402 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
403 -- first find the related components by inside each ego's period
405 graph' = map relatedComponents egos
406 -- then run it for the all the periods
408 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
409 -- update each group's branch id
410 in map (\(bId,ids) ->
411 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
412 $ elems $ restrictKeys groups (Set.fromList ids)
413 in groups' `using` parList rdeepseq ) graph
416 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
417 reduceFrequency frequency branches =
418 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
420 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
421 updateThr thr branches = map (\b -> map (\g ->
422 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
425 -- Sequentially break each branch of a phylo where
426 -- done = all the allready broken branches
427 -- ego = the current branch we want to break
428 -- rest = the branches we still have to break
429 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
430 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
431 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
432 -- 1) keep or not the new division of ego
433 let done' = done ++ (if snd ego
435 (if ((null (fst ego')) || (quality > quality'))
437 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
438 -- <> " | " <> show(length $ fst ego) <> " groups : "
439 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
440 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
443 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
444 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
445 -- <> " | " <> show(length $ fst ego) <> " groups : "
446 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
447 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
448 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
451 -- 2) if there is no more branches in rest then return else continue
454 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
455 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
457 --------------------------------------
459 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
460 --------------------------------------
461 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
463 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
464 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
465 branches' = branches `using` parList rdeepseq
466 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
468 $ depthToMeta (elevation - depth) branches'
469 --------------------------------------
471 quality' = toPhyloQuality fdt lambda frequency
472 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
475 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
476 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
477 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
478 -- if there is no branch to break or if seaLvl level > 1 then end
479 if (thr >= 1) || ((not . or) $ map snd branches)
482 -- break all the possible branches at the current seaLvl level
483 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
484 acc = toAccuracy frequency (map fst branches)
485 rec = toRecall frequency (map fst branches)
486 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
487 <> " ξ = " <> printf "%.5f" acc
488 <> " ρ = " <> printf "%.5f" rec
489 <> " branches = " <> show(length branches) <> " ↴")
490 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
491 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
492 frequency' = reduceFrequency frequency (map fst branches')
493 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
496 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
497 constanteTemporalMatching start step phylo = updatePhyloGroups 1
498 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
499 (toPhyloHorizon phylo)
501 -- 2) process the temporal matching by elevating seaLvl level
502 branches :: [[PhyloGroup]]
504 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
505 (phyloProximity $ getConfig phylo)
506 (_qua_granularity $ phyloQuality $ getConfig phylo)
507 (_qua_minBranch $ phyloQuality $ getConfig phylo)
508 (phylo ^. phylo_termFreq)
510 ((((1 - start) / step) - 1))
511 (((1 - start) / step))
512 (getTimeFrame $ timeUnit $ getConfig phylo)
514 (phylo ^. phylo_timeDocs)
515 (phylo ^. phylo_timeCooc)
516 (reverse $ sortOn (length . fst) groups)
517 -- 1) for each group process an initial temporal Matching
518 -- here we suppose that all the groups of level 1 are part of the same big branch
519 groups :: [([PhyloGroup],Bool)]
520 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
521 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
522 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
523 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
525 (phylo ^. phylo_timeDocs)
526 (phylo ^. phylo_timeCooc)
527 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
533 toPhyloHorizon :: Phylo -> Phylo
534 toPhyloHorizon phylo =
535 let t0 = take 1 (getPeriodIds phylo)
536 groups = getGroupsFromLevelPeriods 1 t0 phylo
537 sens = getSensibility (phyloProximity $ getConfig phylo)
538 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
539 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
540 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
541 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
544 --------------------------------------
545 -- | Adaptative Temporal Matching | --
546 --------------------------------------
549 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
550 thrToMeta thr branches =
552 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
554 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
555 depthToMeta depth branches =
556 let break = length branches > 1
559 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
562 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
563 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
566 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
568 | isJust (m !? ( k ,k')) = m ! ( k ,k')
569 | isJust (m !? ( k',k )) = m ! ( k',k )
573 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
574 toThreshold lvl proxiGroups =
575 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
577 then (sort $ elems proxiGroups) !! idx
581 -- done = all the allready broken branches
582 -- ego = the current branch we want to break
583 -- rest = the branches we still have to break
584 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
585 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
586 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
587 -> [([PhyloGroup],(Bool,[Double]))]
588 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
589 -- 1) keep or not the new division of ego
590 let done' = done ++ (if (fst . snd) ego
591 then (if ((null (fst ego')) || (quality > quality'))
593 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
595 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
596 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
597 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
599 -- uncomment let .. in for debugging
600 -- let part1 = partition (snd) done'
601 -- part2 = partition (snd) rest
602 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
603 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
605 -- 2) if there is no more branches in rest then return else continue
608 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
609 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
611 --------------------------------------
613 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
614 --------------------------------------
616 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
617 --------------------------------------
618 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
620 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
621 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
622 branches' = branches `using` parList rdeepseq
623 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
625 $ depthToMeta (elevation - depth) branches'
626 --------------------------------------
628 quality' = toPhyloQuality fdt lambda frequency
629 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
632 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
633 -> Double -> Int -> Map Int Double
634 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
635 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
636 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
637 -- if there is no branch to break or if seaLvl level >= depth then end
638 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
641 -- break all the possible branches at the current seaLvl level
642 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
643 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
644 frequency' = reduceFrequency frequency (map fst branches')
645 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
646 -- thr = toThreshold depth groupsProxi
647 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
648 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
649 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
651 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
654 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
655 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
656 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
657 (toPhyloHorizon phylo)
659 -- 2) process the temporal matching by elevating seaLvl level
660 branches :: [[PhyloGroup]]
662 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
663 (phyloProximity $ getConfig phylo)
666 (phylo ^. phylo_groupsProxi)
667 (_qua_granularity $ phyloQuality $ getConfig phylo)
668 (_qua_minBranch $ phyloQuality $ getConfig phylo)
669 (phylo ^. phylo_termFreq)
670 (getTimeFrame $ timeUnit $ getConfig phylo)
672 (phylo ^. phylo_timeDocs)
673 (phylo ^. phylo_timeCooc)
675 -- 1) for each group process an initial temporal Matching
676 -- here we suppose that all the groups of level 1 are part of the same big branch
677 groups :: [([PhyloGroup],(Bool,[Double]))]
678 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
679 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
680 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
681 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
683 (phylo ^. phylo_timeDocs)
684 (phylo ^. phylo_timeCooc)
685 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
686 --------------------------------------
688 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)