2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
17 import Data.List (concat, splitAt, tail, sortOn, sortBy, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
18 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
19 import Debug.Trace (trace)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloTools
22 import Gargantext.Prelude
23 import Prelude (floor,tan,pi)
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Vector as Vector
35 -- | To compute a jaccard similarity between two lists
36 jaccard :: [Int] -> [Int] -> Double
37 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
40 -- | Process the inverse sumLog
41 sumInvLog' :: Double -> Double -> [Double] -> Double
42 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
45 -- | Process the sumLog
46 sumLog' :: Double -> Double -> [Double] -> Double
47 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
50 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
51 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
52 | null ngramsInter = 0
53 | ngramsInter == ngramsUnion = 1
54 | sens == 0 = jaccard ngramsInter ngramsUnion
55 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
56 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
58 --------------------------------------
60 ngramsInter = intersect ngrams ngrams'
61 --------------------------------------
63 ngramsUnion = union ngrams ngrams'
64 --------------------------------------
65 diagoInter :: [Double]
66 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
67 --------------------------------------
68 diagoUnion :: [Double]
69 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
70 --------------------------------------
72 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
73 -- tests not conclusive
74 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
75 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
76 | null ngramsInter = 0
77 | ngramsInter == ngramsUnion = 1
78 | sens == 0 = jaccard ngramsInter ngramsUnion
79 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
80 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
82 --------------------------------------
84 ngramsInter = intersect ego_ngrams target_ngrams
85 --------------------------------------
87 ngramsUnion = union ego_ngrams target_ngrams
88 --------------------------------------
89 diagoInter :: [Double]
90 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
91 --------------------------------------
93 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
94 --------------------------------------
95 diagoTarget :: [Double]
96 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
97 --------------------------------------
99 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
100 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
101 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
103 WeightedLogJaccard sens _ ->
104 let pairNgrams = if targetNgrams == targetNgrams'
106 else union targetNgrams targetNgrams'
107 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
108 WeightedLogSim sens _ ->
109 let pairNgrams = if targetNgrams == targetNgrams'
111 else union targetNgrams targetNgrams'
112 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
113 Hamming _ _ -> undefined
115 ------------------------
116 -- | Local Matching | --
117 ------------------------
119 findLastPeriod :: Filiation -> [Period] -> Period
120 findLastPeriod fil periods = case fil of
121 ToParents -> head' "findLastPeriod" (sortOn fst periods)
122 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
123 ToChildsMemory -> undefined
124 ToParentsMemory -> undefined
127 -- | To filter pairs of candidates related to old pointers periods
128 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> Period
129 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
130 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
131 removeOldPointers oldPointers fil thr prox prd pairs
132 | null oldPointers = pairs
133 | null (filterPointers prox thr oldPointers) =
134 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
135 in if lastMatchedPrd == prd
137 else filter (\((id,_),(id',_)) ->
139 ToChildsMemory -> undefined
140 ToParentsMemory -> undefined
141 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
142 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
143 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
144 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
149 makePairs :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [Period] -> [Pointer] -> Filiation -> Double -> Proximity
150 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
151 makePairs (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
154 else removeOldPointers oldPointers fil thr prox lastPrd
155 {- at least on of the pair candidates should be from the last added period -}
156 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
157 $ filter (\((id,_),(id',_)) -> (elem id inPairs) || (elem id' inPairs))
158 $ listToCombi' candidates
160 --------------------------------------
161 inPairs :: [PhyloGroupId]
163 $ filter (\(id,ngrams) ->
164 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
165 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
166 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
168 --------------------------------------
170 lastPrd = findLastPeriod fil periods
171 --------------------------------------
174 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
175 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
177 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
178 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
181 reduceDiagos :: Map Date Cooc -> Map Int Double
182 reduceDiagos diagos = mapKeys (\(k,_) -> k)
183 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
185 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
186 filterPointersByPeriod fil pts =
187 let pts' = sortOn (fst . fst . fst . fst) pts
188 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
189 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
191 $ nubBy (\pt pt' -> snd pt == snd pt')
192 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
194 ToParents -> reverse pts'
196 ToChildsMemory -> undefined
197 ToParentsMemory -> undefined
200 phyloGroupMatching' :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
201 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
202 phyloGroupMatching' candidates filiation proxi docs diagos thr oldPointers (id,ngrams) =
203 if (null $ filterPointers proxi thr oldPointers)
204 -- if no previous pointers satisfy the current threshold then let's find new pointers
205 then if null nextPointers
207 else filterPointersByPeriod filiation
208 -- 2) keep only the best set of pointers grouped by proximity
209 $ head' "phyloGroupMatching"
210 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
211 -- 1) find the first time frame where at leats one pointer satisfies the proximity threshold
212 $ sortBy (comparing (Down . snd . fst)) $ head' "pointers" nextPointers
215 nextPointers :: [[(Pointer,[Int])]]
216 nextPointers = take 1
217 -- stop as soon as we find a time frame where at least one singleton / pair satisfies the threshold
219 -- for each time frame, process the proximity on relevant pairs of targeted groups
220 $ scanl (\acc targets ->
221 let periods = nub $ map (fst . fst . fst) targets
222 lastPrd = findLastPeriod filiation periods
223 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
225 $ filterDiago diagos ([(fst . fst) id] ++ periods)
226 singletons = processProximity nbdocs diago $ map (\g -> (g,g)) $ filter (\g -> (fst . fst . fst) g == lastPrd) targets
227 pairs = makePairs (id,ngrams) targets periods oldPointers filiation thr proxi docs diagos
230 then acc ++ ( processProximity nbdocs diago pairs )
231 else acc ++ singletons
232 ) [] $ map concat $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
233 -----------------------------
234 processProximity :: Double -> Map Int Double -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))] -> [(Pointer,[Int])]
235 processProximity nbdocs diago targets = filterPointers' proxi thr
238 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
239 in if ((c == c') || (snd c == snd c'))
240 then [((fst c,proximity),snd c)]
241 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) targets
244 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
245 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
246 phyloGroupMatching candidates filiation proxi docs diagos thr oldPointers (id,ngrams) =
247 if (null $ filterPointers proxi thr oldPointers)
248 {- let's find new pointers -}
249 then if null nextPointers
251 else filterPointersByPeriod filiation
252 $ head' "phyloGroupMatching"
253 -- Keep only the best set of pointers grouped by proximity
254 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
255 -- verifier que l on garde bien les plus importants
256 $ sortBy (comparing (Down . snd . fst)) $ head' "pointers" nextPointers
257 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
260 nextPointers :: [[(Pointer,[Int])]]
261 nextPointers = take 1
263 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
264 $ scanl (\acc groups ->
265 let periods = nub $ map (fst . fst . fst) $ concat groups
266 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
268 $ filterDiago diagos ([(fst . fst) id] ++ periods)
269 pairs = makePairs (id,ngrams) (concat groups) periods oldPointers filiation thr proxi docs diagos
270 in acc ++ ( filterPointers' proxi thr
273 {- process the proximity between the current group and a pair of candidates -}
274 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
275 in if ((c == c') || (snd c == snd c'))
276 then [((fst c,proximity),snd c)]
277 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
278 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
281 filterDocs :: Map Date Double -> [Period] -> Map Date Double
282 filterDocs d pds = restrictKeys d $ periodsToYears pds
284 filterDiago :: Map Date Cooc -> [Period] -> Map Date Cooc
285 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
288 -----------------------------
289 -- | Matching Processing | --
290 -----------------------------
293 getNextPeriods :: Filiation -> Int -> Period -> [Period] -> [Period]
294 getNextPeriods fil max' pId pIds =
296 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
297 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
298 ToChildsMemory -> undefined
299 ToParentsMemory -> undefined
302 getCandidates :: Int -> PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
303 getCandidates minNgrams ego targets =
304 if (length (ego ^. phylo_groupNgrams)) > 1
306 map (\groups' -> filter (\g' -> (> minNgrams) $ length $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
308 map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
311 matchGroupsToGroups :: Int -> [Period] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
312 matchGroupsToGroups frame periods proximity thr docs coocs groups =
313 let groups' = groupByField _phylo_groupPeriod groups
314 in foldl' (\acc prd ->
315 let -- 1) find the parents/childs matching periods
316 periodsPar = getNextPeriods ToParents frame prd periods
317 periodsChi = getNextPeriods ToChilds frame prd periods
318 -- 2) find the parents/childs matching candidates
319 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
320 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
321 -- 3) find the parents/child number of docs by years
322 docsPar = filterDocs docs ([prd] ++ periodsPar)
323 docsChi = filterDocs docs ([prd] ++ periodsChi)
324 -- 4) find the parents/child diago by years
325 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
326 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
327 -- 5) match in parallel all the groups (egos) to their possible candidates
329 let pointersPar = phyloGroupMatching' (getCandidates (getMinSharedNgrams proximity) ego candidatesPar) ToParents proximity docsPar diagoPar
330 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
331 pointersChi = phyloGroupMatching' (getCandidates (getMinSharedNgrams proximity) ego candidatesChi) ToChilds proximity docsChi diagoChi
332 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
333 in addPointers ToChilds TemporalPointer pointersChi
334 $ addPointers ToParents TemporalPointer pointersPar
335 $ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
336 $ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego)
337 $ findWithDefault [] prd groups'
338 egos' = egos `using` parList rdeepseq
343 -----------------------
344 -- | Phylo Quality | --
345 -----------------------
348 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
349 relevantBranches term branches =
350 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
352 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
353 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
354 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
355 / (fromIntegral $ length bk'))
358 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
360 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
361 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
362 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
364 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
365 fScore lambda x periods bk bx =
366 let rec = recall x bk bx
367 acc = accuracy x periods bk
368 in ((1 + lambda ** 2) * acc * rec)
369 / (((lambda ** 2) * acc + rec))
372 wk :: [PhyloGroup] -> Double
373 wk bk = fromIntegral $ length bk
375 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
376 toRecall freq branches =
382 bx = relevantBranches x branches
383 wks = sum $ map wk bx
384 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
388 pys = sum (elems freq)
391 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
392 toAccuracy freq branches =
398 bx = relevantBranches x branches
399 -- | periods containing x
400 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
401 wks = sum $ map wk bx
402 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
406 pys = sum (elems freq)
409 -- | here we do the average of all the local f_scores
410 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
411 toPhyloQuality fdt lambda freq branches =
417 let bx = relevantBranches x branches
418 -- | periods containing x
419 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
420 wks = sum $ map wk bx
421 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
422 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
423 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
427 -- pys = sum (elems freq)
429 -- 1 / nb de foundation
431 ------------------------------------
432 -- | Constant Temporal Matching | --
433 ------------------------------------
435 -- add a branch id within each group
436 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
437 groupsToBranches groups =
438 {- run the related component algorithm -}
439 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
440 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
441 $ map (\group -> [getGroupId group]
442 ++ (map fst $ group ^. phylo_groupPeriodParents)
443 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
444 -- first find the related components by inside each ego's period
446 graph' = map relatedComponents egos
447 -- then run it for the all the periods
449 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
450 -- update each group's branch id
451 in map (\(bId,branch) ->
452 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
453 $ elems $ restrictKeys groups (Set.fromList branch)
454 in groups' `using` parList rdeepseq
455 ) branches `using` parList rdeepseq
458 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
459 reduceFrequency frequency branches =
460 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
462 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
463 updateThr thr branches = map (\b -> map (\g ->
464 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
467 -- Sequentially break each branch of a phylo where
468 -- done = all the allready broken branches
469 -- ego = the current branch we want to break
470 -- rest = the branches we still have to break
471 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
472 -> Int -> Map Date Double -> Map Date Cooc -> [Period] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
473 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
474 -- 1) keep or not the new division of ego
475 let done' = done ++ (if snd ego
477 (if ((null (fst ego')) || (quality > quality'))
479 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
480 -- <> " | " <> show(length $ fst ego) <> " groups : "
481 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
482 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
485 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
486 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
487 -- <> " | " <> show(length $ fst ego) <> " groups : "
488 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
489 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
490 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
493 -- 2) if there is no more branches in rest then return else continue
496 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
497 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
499 --------------------------------------
501 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
502 --------------------------------------
503 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
505 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
506 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
507 branches' = branches `using` parList rdeepseq
508 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
510 $ depthToMeta (elevation - depth) branches'
511 --------------------------------------
513 quality' = toPhyloQuality fdt lambda frequency
514 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
517 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
518 -> Int -> [Period] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> ([([PhyloGroup],Bool)],Double)
519 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
520 -- if there is no branch to break or if seaLvl level > 1 then end
521 if (thr >= 1) || ((not . or) $ map snd branches)
522 then (branches, toPhyloQuality fdt lambda frequency (map fst branches))
524 -- break all the possible branches at the current seaLvl level
525 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
526 acc = toAccuracy frequency (map fst branches)
527 rec = toRecall frequency (map fst branches)
528 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
529 <> " ξ = " <> printf "%.5f" acc
530 <> " ρ = " <> printf "%.5f" rec
531 <> " branches = " <> show(length branches) <> " ↴")
532 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
533 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
534 frequency' = reduceFrequency frequency (map fst branches')
535 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
538 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
539 constanteTemporalMatching start step phylo = updatePhyloGroups 1
540 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat (map fst $ (fst branches)))
541 (toPhyloHorizon (updateQuality (snd branches) phylo))
543 -- 2) process the temporal matching by elevating seaLvl level
544 -- branches :: ([([groups in the same branch],should westill break the branch?)],final quality)
545 branches :: ([([PhyloGroup],Bool)],Double)
546 branches = seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
547 (phyloProximity $ getConfig phylo)
548 (_qua_granularity $ phyloQuality $ getConfig phylo)
549 (_qua_minBranch $ phyloQuality $ getConfig phylo)
550 (phylo ^. phylo_termFreq)
552 ((((1 - start) / step) - 1))
553 (((1 - start) / step))
554 (getTimeFrame $ timeUnit $ getConfig phylo)
556 (phylo ^. phylo_timeDocs)
557 (phylo ^. phylo_timeCooc)
558 (reverse $ sortOn (length . fst) initBranches)
559 -- 1) for each group process an initial temporal Matching
560 -- here we suppose that all the groups of level 1 are part of the same big branch
561 -- the Bool param determines weither you should apply the sealevel within the branch
562 -- creer un type [PhyloGroup] <=> Branch
563 initBranches :: [([PhyloGroup],Bool)]
564 initBranches = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
565 $ groupsToBranches $ Map.fromList $ map (\g -> (getGroupId g, g))
566 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
567 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
569 (phylo ^. phylo_timeDocs)
570 (phylo ^. phylo_timeCooc)
571 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
577 toPhyloHorizon :: Phylo -> Phylo
578 toPhyloHorizon phylo =
579 let t0 = take 1 (getPeriodIds phylo)
580 groups = getGroupsFromLevelPeriods 1 t0 phylo
581 sens = getSensibility (phyloProximity $ getConfig phylo)
582 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
583 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
584 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
585 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
588 --------------------------------------
589 -- | Adaptative Temporal Matching | --
590 --------------------------------------
593 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
594 thrToMeta thr branches =
596 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
598 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
599 depthToMeta depth branches =
600 let break = length branches > 1
603 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
606 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
607 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
610 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
612 | isJust (m !? ( k ,k')) = m ! ( k ,k')
613 | isJust (m !? ( k',k )) = m ! ( k',k )
617 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
618 toThreshold nbSteps proxiGroups =
619 let idx = ((Map.size proxiGroups) `div` (floor nbSteps)) - 1
621 then (sort $ elems proxiGroups) !! idx
625 -- done = all the allready broken branches
626 -- ego = the current branch we want to break
627 -- rest = the branches we still have to break
628 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
629 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
630 -> [Period] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
631 -> [([PhyloGroup],(Bool,[Double]))]
632 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
633 -- 1) keep or not the new division of ego
634 let done' = done ++ (if (fst . snd) ego
635 then (if ((null (fst ego')) || (quality > quality'))
637 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
639 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
640 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
641 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
643 -- uncomment let .. in for debugging
644 -- let part1 = partition (snd) done'
645 -- part2 = partition (snd) rest
646 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
647 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
649 -- 2) if there is no more branches in rest then return else continue
652 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
653 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
655 --------------------------------------
657 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
658 --------------------------------------
660 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
661 --------------------------------------
662 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
664 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
665 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
666 branches' = branches `using` parList rdeepseq
667 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
669 $ depthToMeta (elevation - depth) branches'
670 --------------------------------------
672 quality' = toPhyloQuality fdt lambda frequency
673 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
676 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
677 -> Double -> Int -> Map Int Double
678 -> Int -> [Period] -> Map Date Double -> Map Date Cooc
679 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
680 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
681 -- if there is no branch to break or if seaLvl level >= depth then end
682 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
685 -- break all the possible branches at the current seaLvl level
686 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
687 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
688 frequency' = reduceFrequency frequency (map fst branches')
689 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
690 thr = toThreshold depth groupsProxi
691 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
692 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
693 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
694 <> " thr = " <> show(thr))
695 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
698 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
699 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
700 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
701 (toPhyloHorizon phylo)
703 -- 2) process the temporal matching by elevating seaLvl level
704 branches :: [[PhyloGroup]]
706 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
707 (phyloProximity $ getConfig phylo)
710 (phylo ^. phylo_groupsProxi)
711 (_qua_granularity $ phyloQuality $ getConfig phylo)
712 (_qua_minBranch $ phyloQuality $ getConfig phylo)
713 (phylo ^. phylo_termFreq)
714 (getTimeFrame $ timeUnit $ getConfig phylo)
716 (phylo ^. phylo_timeDocs)
717 (phylo ^. phylo_timeCooc)
719 -- 1) for each group process an initial temporal Matching
720 -- here we suppose that all the groups of level 1 are part of the same big branch
721 groups :: [([PhyloGroup],(Bool,[Double]))]
722 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
723 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
724 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
725 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
727 (phylo ^. phylo_timeDocs)
728 (phylo ^. phylo_timeCooc)
729 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
730 --------------------------------------
732 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)