]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/TemporalMatching.hs
refactoring temporal matching
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 Reference : Chavalarias, D., Lobbé, Q. & Delanoë, A. Draw me Science. Scientometrics 127, 545–575 (2022). https://doi.org/10.1007/s11192-021-04186-5
10 -}
11
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
13
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.Ord
17 import Data.List (concat, splitAt, tail, sortOn, sortBy, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or)
18 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), empty, mapKeys, adjust)
19 import Debug.Trace (trace)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloTools
22 import Gargantext.Prelude
23 import Prelude (tan,pi)
24 import Text.Printf
25 import qualified Data.Map as Map
26 import qualified Data.List as List
27 import qualified Data.Set as Set
28 import qualified Data.Vector as Vector
29
30 type Branch = [PhyloGroup]
31 type FinalQuality = Double
32 type LocalQuality = Double
33 type ShouldTry = Bool
34
35
36 ----------------------------
37 -- | Similarity Measure | --
38 ----------------------------
39
40
41 {-
42 -- compute a jaccard similarity between two lists
43 -}
44 jaccard :: [Int] -> [Int] -> Double
45 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
46
47
48 {-
49 -- process the inverse sumLog
50 -}
51 sumInvLog' :: Double -> Double -> [Double] -> Double
52 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
53
54
55 {-
56 -- process the sumLog
57 -}
58 sumLog' :: Double -> Double -> [Double] -> Double
59 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
60
61
62 {-
63 -- compute the weightedLogJaccard
64 -}
65 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
66 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
67 | null ngramsInter = 0
68 | ngramsInter == ngramsUnion = 1
69 | sens == 0 = jaccard ngramsInter ngramsUnion
70 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
71 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
72 where
73 --------------------------------------
74 ngramsInter :: [Int]
75 ngramsInter = intersect ngrams ngrams'
76 --------------------------------------
77 ngramsUnion :: [Int]
78 ngramsUnion = union ngrams ngrams'
79 --------------------------------------
80 diagoInter :: [Double]
81 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
82 --------------------------------------
83 diagoUnion :: [Double]
84 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
85 --------------------------------------
86
87
88 {-
89 -- compute the weightedLogSim
90 -- Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
91 -- tests not conclusive
92 -}
93 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
94 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
95 | null ngramsInter = 0
96 | ngramsInter == ngramsUnion = 1
97 | sens == 0 = jaccard ngramsInter ngramsUnion
98 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
99 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
100 where
101 --------------------------------------
102 ngramsInter :: [Int]
103 ngramsInter = intersect ego_ngrams target_ngrams
104 --------------------------------------
105 ngramsUnion :: [Int]
106 ngramsUnion = union ego_ngrams target_ngrams
107 --------------------------------------
108 diagoInter :: [Double]
109 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
110 --------------------------------------
111 diagoEgo :: [Double]
112 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
113 --------------------------------------
114 diagoTarget :: [Double]
115 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
116 --------------------------------------
117
118
119 {-
120 -- perform a seamilarity measure between a given group and a pair of targeted groups
121 -}
122 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
123 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
124 case proximity of
125 WeightedLogJaccard sens _ ->
126 let pairNgrams = if targetNgrams == targetNgrams'
127 then targetNgrams
128 else union targetNgrams targetNgrams'
129 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
130 WeightedLogSim sens _ ->
131 let pairNgrams = if targetNgrams == targetNgrams'
132 then targetNgrams
133 else union targetNgrams targetNgrams'
134 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
135 Hamming _ _ -> undefined
136
137
138 -----------------------------
139 -- | Pointers & Matrices | --
140 -----------------------------
141
142
143 findLastPeriod :: Filiation -> [Period] -> Period
144 findLastPeriod fil periods = case fil of
145 ToParents -> head' "findLastPeriod" (sortOn fst periods)
146 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
147 ToChildsMemory -> undefined
148 ToParentsMemory -> undefined
149
150 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> Period
151 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
152 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
153 removeOldPointers oldPointers fil thr prox prd pairs
154 | null oldPointers = pairs
155 | null (filterPointers prox thr oldPointers) =
156 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
157 in if lastMatchedPrd == prd
158 then []
159 else filter (\((id,_),(id',_)) ->
160 case fil of
161 ToChildsMemory -> undefined
162 ToParentsMemory -> undefined
163 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
164 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
165 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
166 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
167 | otherwise = []
168
169 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
170 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
171
172 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
173 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
174
175
176 reduceDiagos :: Map Date Cooc -> Map Int Double
177 reduceDiagos diagos = mapKeys (\(k,_) -> k)
178 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
179
180 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
181 filterPointersByPeriod fil pts =
182 let pts' = sortOn (fst . fst . fst . fst) pts
183 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
184 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
185 in map fst
186 $ nubBy (\pt pt' -> snd pt == snd pt')
187 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
188 $ case fil of
189 ToParents -> reverse pts'
190 ToChilds -> pts'
191 ToChildsMemory -> undefined
192 ToParentsMemory -> undefined
193
194 filterDocs :: Map Date Double -> [Period] -> Map Date Double
195 filterDocs d pds = restrictKeys d $ periodsToYears pds
196
197 filterDiago :: Map Date Cooc -> [Period] -> Map Date Cooc
198 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
199
200
201 ---------------------------------
202 -- | Inter-temporal matching | --
203 ---------------------------------
204
205
206 {-
207 -- perform the related component algorithm, construct the resulting branch id and update the corresponding group's branch id
208 -}
209 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [Branch]
210 groupsToBranches groups =
211 {- run the related component algorithm -}
212 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
213 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
214 $ map (\group -> [getGroupId group]
215 ++ (map fst $ group ^. phylo_groupPeriodParents)
216 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
217 -- first find the related components by inside each ego's period
218 -- a supprimer
219 graph' = map relatedComponents egos
220 -- then run it for the all the periods
221 branches = zip [1..]
222 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
223 -- update each group's branch id
224 in map (\(bId,branch) ->
225 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
226 $ elems $ restrictKeys groups (Set.fromList branch)
227 in groups' `using` parList rdeepseq
228 ) branches `using` parList rdeepseq
229
230
231 {-
232 -- find the best pair/singleton of parents/childs for a given group
233 -}
234 makePairs :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [Period] -> [Pointer] -> Filiation -> Double -> Proximity
235 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
236 makePairs (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
237 if (null periods)
238 then []
239 else removeOldPointers oldPointers fil thr prox lastPrd
240 {- at least on of the pair candidates should be from the last added period -}
241 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
242 $ filter (\((id,_),(id',_)) -> (elem id inPairs) || (elem id' inPairs))
243 $ listToCombi' candidates
244 where
245 --------------------------------------
246 inPairs :: [PhyloGroupId]
247 inPairs = map fst
248 $ filter (\(id,ngrams) ->
249 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
250 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
251 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
252 ) candidates
253 --------------------------------------
254 lastPrd :: Period
255 lastPrd = findLastPeriod fil periods
256 --------------------------------------
257
258 {-
259 -- find the best temporal links between a given group and its parents/childs
260 -}
261 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
262 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
263 phyloGroupMatching candidates filiation proxi docs diagos thr oldPointers (id,ngrams) =
264 if (null $ filterPointers proxi thr oldPointers)
265 -- if no previous pointers satisfy the current threshold then let's find new pointers
266 then if null nextPointers
267 then []
268 else filterPointersByPeriod filiation
269 -- 2) keep only the best set of pointers grouped by proximity
270 $ head' "phyloGroupMatching"
271 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
272 -- 1) find the first time frame where at leats one pointer satisfies the proximity threshold
273 $ sortBy (comparing (Down . snd . fst)) $ head' "pointers" nextPointers
274 else oldPointers
275 where
276 nextPointers :: [[(Pointer,[Int])]]
277 nextPointers = take 1
278 -- stop as soon as we find a time frame where at least one singleton / pair satisfies the threshold
279 $ dropWhile (null)
280 -- for each time frame, process the proximity on relevant pairs of targeted groups
281 $ scanl (\acc targets ->
282 let periods = nub $ map (fst . fst . fst) targets
283 lastPrd = findLastPeriod filiation periods
284 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
285 diago = reduceDiagos
286 $ filterDiago diagos ([(fst . fst) id] ++ periods)
287 singletons = processProximity nbdocs diago $ map (\g -> (g,g)) $ filter (\g -> (fst . fst . fst) g == lastPrd) targets
288 pairs = makePairs (id,ngrams) targets periods oldPointers filiation thr proxi docs diagos
289 in
290 if (null singletons)
291 then acc ++ ( processProximity nbdocs diago pairs )
292 else acc ++ singletons
293 ) [] $ map concat $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
294 -----------------------------
295 processProximity :: Double -> Map Int Double -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))] -> [(Pointer,[Int])]
296 processProximity nbdocs diago targets = filterPointers' proxi thr
297 $ concat
298 $ map (\(c,c') ->
299 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
300 in if ((c == c') || (snd c == snd c'))
301 then [((fst c,proximity),snd c)]
302 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) targets
303
304
305 {-
306 -- get the upstream/downstream timescale of a given period
307 -}
308 getNextPeriods :: Filiation -> Int -> Period -> [Period] -> [Period]
309 getNextPeriods fil max' pId pIds =
310 case fil of
311 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
312 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
313 ToChildsMemory -> undefined
314 ToParentsMemory -> undefined
315
316
317 {-
318 -- find all the candidates parents/childs of ego
319 -}
320 getCandidates :: Int -> PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
321 getCandidates minNgrams ego targets =
322 if (length (ego ^. phylo_groupNgrams)) > 1
323 then
324 map (\groups' -> filter (\g' -> (> minNgrams) $ length $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
325 else
326 map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
327
328
329 {-
330 -- set up and start performing the upstream/downstream inter‐temporal matching period by period
331 -}
332 reconstructTemporalLinks :: Int -> [Period] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
333 reconstructTemporalLinks frame periods proximity thr docs coocs groups =
334 let groups' = groupByField _phylo_groupPeriod groups
335 in foldl' (\acc prd ->
336 let -- 1) find the parents/childs matching periods
337 periodsPar = getNextPeriods ToParents frame prd periods
338 periodsChi = getNextPeriods ToChilds frame prd periods
339 -- 2) find the parents/childs matching candidates
340 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
341 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
342 -- 3) find the parents/childs number of docs by years
343 docsPar = filterDocs docs ([prd] ++ periodsPar)
344 docsChi = filterDocs docs ([prd] ++ periodsChi)
345 -- 4) find the parents/child diago by years
346 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
347 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
348 -- 5) match in parallel all the groups (egos) to their possible candidates
349 egos = map (\ego ->
350 let pointersPar = phyloGroupMatching (getCandidates (getMinSharedNgrams proximity) ego candidatesPar) ToParents proximity docsPar diagoPar
351 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
352 pointersChi = phyloGroupMatching (getCandidates (getMinSharedNgrams proximity) ego candidatesChi) ToChilds proximity docsChi diagoChi
353 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
354 in addPointers ToChilds TemporalPointer pointersChi
355 $ addPointers ToParents TemporalPointer pointersPar
356 $ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
357 $ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego)
358 $ findWithDefault [] prd groups'
359 egos' = egos `using` parList rdeepseq
360 in acc ++ egos'
361 ) [] periods
362
363
364 {-
365 -- reconstruct a phylomemetic network from a list of groups and from a given threshold
366 -}
367 toPhylomemeticNetwork :: Int -> [Period] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [Branch]
368 toPhylomemeticNetwork timescale periods similarity thr docs coocs groups =
369 groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
370 $ reconstructTemporalLinks timescale periods similarity thr docs coocs groups
371
372
373 ----------------------------
374 -- | Quality Assessment | --
375 ----------------------------
376
377
378 {-
379 -- filter the branches containing x
380 -}
381 relevantBranches :: Int -> [Branch] -> [Branch]
382 relevantBranches x branches =
383 filter (\groups -> (any (\group -> elem x $ group ^. phylo_groupNgrams) groups)) branches
384
385
386 {-
387 -- compute the accuracy ξ
388 -- the accuracy of a branch relatively to a root x is computed only over the periods where clusters mentionning x in the phylo do exist
389 -}
390 accuracy :: Int -> [(Date,Date)] -> Branch -> Double
391 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk') / (fromIntegral $ length bk'))
392 where
393 ---
394 bk' :: [PhyloGroup]
395 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
396
397
398 {-
399 -- compute the recall ρ
400 -}
401 recall :: Int -> Branch -> [Branch] -> Double
402 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
403 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
404
405
406 {-
407 -- compute the F-score function
408 -}
409 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
410 fScore lambda x periods bk bx =
411 let rec = recall x bk bx
412 acc = accuracy x periods bk
413 in ((1 + lambda ** 2) * acc * rec)
414 / (((lambda ** 2) * acc + rec))
415
416
417 {-
418 -- compute the number of groups
419 -}
420 wk :: [PhyloGroup] -> Double
421 wk bk = fromIntegral $ length bk
422
423
424 {-
425 -- compute the recall ρ for all the branches
426 -}
427 globalRecall :: Map Int Double -> [Branch] -> Double
428 globalRecall freq branches =
429 if (null branches)
430 then 0
431 else sum
432 $ map (\x ->
433 let px = freq ! x
434 bx = relevantBranches x branches
435 wks = sum $ map wk bx
436 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
437 $ keys freq
438 where
439 pys :: Double
440 pys = sum (elems freq)
441
442
443 {-
444 -- compute the accuracy ξ for all the branches
445 -}
446 globalAccuracy :: Map Int Double -> [Branch] -> Double
447 globalAccuracy freq branches =
448 if (null branches)
449 then 0
450 else sum
451 $ map (\x ->
452 let px = freq ! x
453 bx = relevantBranches x branches
454 -- | periods containing x
455 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
456 wks = sum $ map wk bx
457 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
458 $ keys freq
459 where
460 pys :: Double
461 pys = sum (elems freq)
462
463
464 {-
465 -- compute the quality score F(λ)
466 -}
467 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
468 toPhyloQuality fdt lambda freq branches =
469 if (null branches)
470 then 0
471 else sum
472 $ map (\x ->
473 -- let px = freq ! x
474 let bx = relevantBranches x branches
475 -- | periods containing x
476 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
477 wks = sum $ map wk bx
478 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
479 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
480 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
481 $ keys freq
482 -- where
483 -- pys :: Double
484 -- pys = sum (elems freq)
485
486
487 -------------------------
488 -- | Sea-level Rise | --
489 -------------------------
490
491
492 {-
493 -- attach a rise value to branches & groups metadata
494 -}
495 riseToMeta :: Double -> [Branch] -> [Branch]
496 riseToMeta rise branches =
497 let break = length branches > 1
498 in map (\b ->
499 map (\g ->
500 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [rise]) "breaks"(g ^. phylo_groupMeta))
501 else g) b) branches
502
503
504 {-
505 -- attach a thr value to branches & groups metadata
506 -}
507 thrToMeta :: Double -> [Branch] -> [Branch]
508 thrToMeta thr branches =
509 map (\b ->
510 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
511
512
513 {-
514 -- TODO
515 -- 1) try the zipper structure https://wiki.haskell.org/Zipper to performe the sea-level rise algorithme
516 -- 2) investigate how the branches order influences the 'separateBranches' function
517 -}
518
519
520 {-
521 -- sequentially separate each branch for a given threshold and check if it locally increases the quality score
522 -- sequence = [done] | currentBranch | [rest]
523 -- done = all the already separated branches
524 -- rest = all the branches we still have to separate
525 -}
526 separateBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double
527 -> Int -> Map Date Double -> Map Date Cooc -> [Period]
528 -> [(Branch,ShouldTry)] -> (Branch,ShouldTry) -> [(Branch,ShouldTry)]
529 -> [(Branch,ShouldTry)]
530 separateBranches fdt similarity lambda frequency minBranch thr rise timescale docs coocs periods done currentBranch rest =
531 let done' = done ++ (if snd currentBranch
532 then
533 (if ((null (fst branches')) || (quality > quality'))
534 ---- 5) if the quality is not increased by the new branches or if the new branches are all small
535 ---- then undo the separation and localy stop the sea rise
536 ---- else validate the separation and authorise next sea rise in the long new branches
537 then
538 -- trace (" ✗ F(λ) = " <> show(quality) <> " (vs) " <> show(quality')
539 -- <> " | " <> show(length $ fst ego) <> " groups : "
540 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
541 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
542 [(fst currentBranch,False)]
543 else
544 -- trace (" ✓ F(λ) = " <> show(quality) <> " (vs) " <> show(quality')
545 -- <> " | " <> show(length $ fst ego) <> " groups : "
546 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
547 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
548 ((map (\e -> (e,True)) (fst branches')) ++ (map (\e -> (e,False)) (snd branches'))))
549 else [currentBranch])
550 in
551 -- 6) if there is no more branch to separate tne return [done'] else continue with [rest]
552 if null rest
553 then done'
554 else separateBranches fdt similarity lambda frequency minBranch thr rise timescale docs coocs periods
555 done' (List.head rest) (List.tail rest)
556 where
557 ------- 1) compute the quality before splitting any branch
558 quality :: LocalQuality
559 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst currentBranch] ++ (map fst rest))
560
561 ------------------- 2) split the current branch and create a new phylomemetic network
562 phylomemeticNetwork :: [Branch]
563 phylomemeticNetwork = toPhylomemeticNetwork timescale periods similarity thr docs coocs (fst currentBranch)
564
565 --------- 3) change the new phylomemetic network into a tuple of new branches
566 --------- on the left : the long branches, on the right : the small ones
567 branches' :: ([Branch],[Branch])
568 branches' = partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
569 $ thrToMeta thr
570 $ riseToMeta rise phylomemeticNetwork
571
572 -------- 4) compute again the quality by considering the new branches
573 quality' :: LocalQuality
574 quality' = toPhyloQuality fdt lambda frequency
575 ((map fst done) ++ (fst branches') ++ (snd branches') ++ (map fst rest))
576
577
578 {-
579 -- perform the sea-level rise algorithm, browse the similarity ladder and check that we can try out the next step
580 -}
581 seaLevelRise :: Double -> Proximity -> Double -> Int -> Map Int Double
582 -> [Double] -> Double
583 -> Int -> [Period]
584 -> Map Date Double -> Map Date Cooc
585 -> [(Branch,ShouldTry)]
586 -> ([(Branch,ShouldTry)],FinalQuality)
587 seaLevelRise fdt proximity lambda minBranch frequency ladder rise frame periods docs coocs branches =
588 -- if the ladder is empty or thr > 1 or there is no branch to break then stop
589 if (null ladder) || ((List.head ladder) > 1) || (stopRise branches)
590 then (branches, toPhyloQuality fdt lambda frequency (map fst branches))
591 else
592 -- start breaking up all the possible branches for the current similarity threshold
593 let thr = List.head ladder
594 branches' = trace ("threshold = " <> printf "%.3f" thr
595 <> " F(λ) = " <> printf "%.5f" (toPhyloQuality fdt lambda frequency (map fst branches))
596 <> " ξ = " <> printf "%.5f" (globalAccuracy frequency (map fst branches))
597 <> " ρ = " <> printf "%.5f" (globalRecall frequency (map fst branches))
598 <> " branches = " <> show(length branches))
599 $ separateBranches fdt proximity lambda frequency minBranch thr rise frame docs coocs periods
600 [] (List.head branches) (List.tail branches)
601 in seaLevelRise fdt proximity lambda minBranch frequency (List.tail ladder) (rise + 1) frame periods docs coocs branches'
602 where
603 --------
604 stopRise :: [(Branch,ShouldTry)] -> Bool
605 stopRise bs = ((not . or) $ map snd bs)
606
607
608 {-
609 -- start the temporal matching process up, recover the resulting branches and update the groups (at scale 1) consequently
610 -}
611 temporalMatching :: [Double] -> Phylo -> Phylo
612 temporalMatching ladder phylo = updatePhyloGroups 1
613 (Map.fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
614 (updateQuality quality phylo)
615 where
616 -------
617 quality :: FinalQuality
618 quality = snd sea
619
620 --------
621 branches :: [Branch]
622 branches = map fst $ fst sea
623
624 --- 2) process the temporal matching by elevating the similarity ladder
625 sea :: ([(Branch,ShouldTry)],FinalQuality)
626 sea = seaLevelRise (fromIntegral $ Vector.length $ getRoots phylo)
627 (phyloProximity $ getConfig phylo)
628 (_qua_granularity $ phyloQuality $ getConfig phylo)
629 (_qua_minBranch $ phyloQuality $ getConfig phylo)
630 (phylo ^. phylo_termFreq)
631 ladder 1
632 (getTimeFrame $ timeUnit $ getConfig phylo)
633 (getPeriodIds phylo)
634 (phylo ^. phylo_timeDocs)
635 (phylo ^. phylo_timeCooc)
636 (reverse $ sortOn (length . fst) seabed)
637
638 ------ 1) for each group, process an initial temporal Matching and create a 'seabed'
639 ------ ShouldTry determines if you should apply the seaLevelRise function again within each branch
640 seabed :: [(Branch,ShouldTry)]
641 seabed = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
642 $ toPhylomemeticNetwork (getTimeFrame $ timeUnit $ getConfig phylo)
643 (getPeriodIds phylo)
644 (phyloProximity $ getConfig phylo)
645 (List.head ladder)
646 (phylo ^. phylo_timeDocs)
647 (phylo ^. phylo_timeCooc)
648 (traceTemporalMatching $ getGroupsFromScale 1 phylo)