2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
9 Reference : Chavalarias, D., Lobbé, Q. & Delanoë, A. Draw me Science. Scientometrics 127, 545–575 (2022). https://doi.org/10.1007/s11192-021-04186-5
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
17 import Data.List (concat, splitAt, tail, sortOn, sortBy, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or)
18 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), empty, mapKeys, adjust, filterWithKey)
19 import Debug.Trace (trace)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloTools
22 import Gargantext.Prelude
23 import Prelude (tan,pi)
25 import qualified Data.Map as Map
26 import qualified Data.List as List
27 import qualified Data.Set as Set
28 import qualified Data.Vector as Vector
30 type Branch = [PhyloGroup]
31 type FinalQuality = Double
32 type LocalQuality = Double
36 ----------------------------
37 -- | Similarity Measure | --
38 ----------------------------
42 -- compute a jaccard similarity between two lists
44 jaccard :: [Int] -> [Int] -> Double
45 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
49 -- process the inverse sumLog
51 sumInvLog' :: Double -> Double -> [Double] -> Double
52 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
58 sumLog' :: Double -> Double -> [Double] -> Double
59 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
63 -- compute the weightedLogJaccard
65 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
66 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
67 | null ngramsInter = 0
68 | ngramsInter == ngramsUnion = 1
69 | sens == 0 = jaccard ngramsInter ngramsUnion
70 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
71 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
73 --------------------------------------
75 ngramsInter = intersect ngrams ngrams'
76 --------------------------------------
78 ngramsUnion = union ngrams ngrams'
79 --------------------------------------
80 diagoInter :: [Double]
81 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
82 --------------------------------------
83 diagoUnion :: [Double]
84 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
85 --------------------------------------
89 -- compute the weightedLogSim
90 -- Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
91 -- tests not conclusive
93 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
94 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
95 | null ngramsInter = 0
96 | ngramsInter == ngramsUnion = 1
97 | sens == 0 = jaccard ngramsInter ngramsUnion
98 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
99 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
101 --------------------------------------
103 ngramsInter = intersect ego_ngrams target_ngrams
104 --------------------------------------
106 ngramsUnion = union ego_ngrams target_ngrams
107 --------------------------------------
108 diagoInter :: [Double]
109 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
110 --------------------------------------
112 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
113 --------------------------------------
114 diagoTarget :: [Double]
115 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
116 --------------------------------------
120 -- perform a seamilarity measure between a given group and a pair of targeted groups
122 toSimilarity :: Double -> Map Int Double -> Similarity -> [Int] -> [Int] -> [Int] -> Double
123 toSimilarity nbDocs diago similarity egoNgrams targetNgrams targetNgrams' =
125 WeightedLogJaccard sens _ ->
126 let pairNgrams = if targetNgrams == targetNgrams'
128 else union targetNgrams targetNgrams'
129 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
130 WeightedLogSim sens _ ->
131 let pairNgrams = if targetNgrams == targetNgrams'
133 else union targetNgrams targetNgrams'
134 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
135 Hamming _ _ -> undefined
138 -----------------------------
139 -- | Pointers & Matrices | --
140 -----------------------------
143 findLastPeriod :: Filiation -> [Period] -> Period
144 findLastPeriod fil periods = case fil of
145 ToParents -> head' "findLastPeriod" (sortOn fst periods)
146 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
147 ToChildsMemory -> undefined
148 ToParentsMemory -> undefined
150 removeOldPointers :: [Pointer] -> Filiation -> Double -> Similarity -> Period
151 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
152 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
153 removeOldPointers oldPointers fil thr prox prd pairs
154 | null oldPointers = pairs
155 | null (filterPointers prox thr oldPointers) =
156 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
157 in if lastMatchedPrd == prd
159 else filter (\((id,_),(id',_)) ->
161 ToChildsMemory -> undefined
162 ToParentsMemory -> undefined
163 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
164 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
165 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
166 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
169 filterPointers :: Similarity -> Double -> [Pointer] -> [Pointer]
170 filterPointers proxi thr pts = filter (\(_,w) -> filterSimilarity proxi thr w) pts
172 filterPointers' :: Similarity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
173 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterSimilarity proxi thr w) pts
176 reduceDiagos :: Map Date Cooc -> Map Int Double
177 reduceDiagos diagos = mapKeys (\(k,_) -> k)
178 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
180 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
181 filterPointersByPeriod fil pts =
182 let pts' = sortOn (fst . fst . fst . fst) pts
183 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
184 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
186 $ nubBy (\pt pt' -> snd pt == snd pt')
187 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
189 ToParents -> reverse pts'
191 ToChildsMemory -> undefined
192 ToParentsMemory -> undefined
194 filterDocs :: Map Date Double -> [Period] -> Map Date Double
195 filterDocs d pds = restrictKeys d $ periodsToYears pds
197 filterDiago :: Map Date Cooc -> [Period] -> Map Date Cooc
198 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
201 ---------------------------------
202 -- | Inter-temporal matching | --
203 ---------------------------------
207 -- perform the related component algorithm, construct the resulting branch id and update the corresponding group's branch id
209 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [Branch]
210 groupsToBranches groups =
211 {- run the related component algorithm -}
212 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
213 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
214 $ map (\group -> [getGroupId group]
215 ++ (map fst $ group ^. phylo_groupPeriodParents)
216 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
217 -- first find the related components by inside each ego's period
219 graph' = map relatedComponents egos
220 -- then run it for the all the periods
222 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
223 -- update each group's branch id
224 in map (\(bId,branch) ->
225 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
226 $ elems $ restrictKeys groups (Set.fromList branch)
227 in groups' `using` parList rdeepseq
228 ) branches `using` parList rdeepseq
232 -- find the best pair/singleton of parents/childs for a given group
234 makePairs :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [Period] -> [Pointer] -> Filiation -> Double -> Similarity
235 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
236 makePairs (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
239 else removeOldPointers oldPointers fil thr prox lastPrd
240 {- at least on of the pair candidates should be from the last added period -}
241 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
242 $ filter (\((id,_),(id',_)) -> (elem id inPairs) || (elem id' inPairs))
243 $ listToCombi' candidates
245 --------------------------------------
246 inPairs :: [PhyloGroupId]
248 $ filter (\(id,ngrams) ->
249 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
250 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
251 in (toSimilarity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
253 --------------------------------------
255 lastPrd = findLastPeriod fil periods
256 --------------------------------------
259 -- find the best temporal links between a given group and its parents/childs
261 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Similarity -> Map Date Double -> Map Date Cooc
262 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
263 phyloGroupMatching candidates filiation proxi docs diagos thr oldPointers (id,ngrams) =
264 if (null $ filterPointers proxi thr oldPointers)
265 -- if no previous pointers satisfy the current threshold then let's find new pointers
266 then if null nextPointers
268 else filterPointersByPeriod filiation
269 -- 2) keep only the best set of pointers grouped by Similarity
270 $ head' "phyloGroupMatching"
271 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
272 -- 1) find the first time frame where at leats one pointer satisfies the Similarity threshold
273 $ sortBy (comparing (Down . snd . fst)) $ head' "pointers" nextPointers
276 nextPointers :: [[(Pointer,[Int])]]
277 nextPointers = take 1
278 -- stop as soon as we find a time frame where at least one singleton / pair satisfies the threshold
280 -- for each time frame, process the Similarity on relevant pairs of targeted groups
281 $ scanl (\acc targets ->
282 let periods = nub $ map (fst . fst . fst) targets
283 lastPrd = findLastPeriod filiation periods
284 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
286 $ filterDiago diagos ([(fst . fst) id] ++ periods)
287 singletons = processSimilarity nbdocs diago $ map (\g -> (g,g)) $ filter (\g -> (fst . fst . fst) g == lastPrd) targets
288 pairs = makePairs (id,ngrams) targets periods oldPointers filiation thr proxi docs diagos
291 then acc ++ ( processSimilarity nbdocs diago pairs )
292 else acc ++ singletons
293 ) [] $ map concat $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
294 -----------------------------
295 processSimilarity :: Double -> Map Int Double -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))] -> [(Pointer,[Int])]
296 processSimilarity nbdocs diago targets = filterPointers' proxi thr
299 let similarity = toSimilarity nbdocs diago proxi ngrams (snd c) (snd c')
300 in if ((c == c') || (snd c == snd c'))
301 then [((fst c,similarity),snd c)]
302 else [((fst c,similarity),snd c),((fst c',similarity),snd c')] ) targets
306 -- get the upstream/downstream timescale of a given period
308 getNextPeriods :: Filiation -> Int -> Period -> [Period] -> [Period]
309 getNextPeriods fil max' pId pIds =
311 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
312 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
313 ToChildsMemory -> undefined
314 ToParentsMemory -> undefined
318 -- find all the candidates parents/childs of ego
320 getCandidates :: Int -> PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
321 getCandidates minNgrams ego targets =
322 if (length (ego ^. phylo_groupNgrams)) > 1
324 map (\groups' -> filter (\g' -> (> minNgrams) $ length $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
326 map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
330 -- set up and start performing the upstream/downstream inter‐temporal matching period by period
332 reconstructTemporalLinks :: Int -> [Period] -> Similarity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
333 reconstructTemporalLinks frame periods similarity thr docs coocs groups =
334 let groups' = groupByField _phylo_groupPeriod groups
335 in foldl' (\acc prd ->
336 let -- 1) find the parents/childs matching periods
337 periodsPar = getNextPeriods ToParents frame prd periods
338 periodsChi = getNextPeriods ToChilds frame prd periods
339 -- 2) find the parents/childs matching candidates
340 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
341 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
342 -- 3) find the parents/childs number of docs by years
343 docsPar = filterDocs docs ([prd] ++ periodsPar)
344 docsChi = filterDocs docs ([prd] ++ periodsChi)
345 -- 4) find the parents/child diago by years
346 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
347 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
348 -- 5) match in parallel all the groups (egos) to their possible candidates
350 let pointersPar = phyloGroupMatching (getCandidates (getMinSharedNgrams similarity) ego candidatesPar) ToParents similarity docsPar diagoPar
351 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
352 pointersChi = phyloGroupMatching (getCandidates (getMinSharedNgrams similarity) ego candidatesChi) ToChilds similarity docsChi diagoChi
353 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
354 in addPointers ToChilds TemporalPointer pointersChi
355 $ addPointers ToParents TemporalPointer pointersPar
356 $ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
357 $ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego)
358 $ findWithDefault [] prd groups'
359 egos' = egos `using` parList rdeepseq
365 -- find all the groups matching a list of ngrams
367 findIdsFromNgrams :: [Int] -> Map Int [PhyloGroupId] -> [PhyloGroupId]
368 findIdsFromNgrams ngrams roots = nub $ concat $ elems $ filterWithKey (\k _ -> elem k ngrams) roots
370 formatCandidates :: Filiation -> [PhyloGroup] -> [[(PhyloGroupId,[Int])]]
371 formatCandidates fil groups = case fil of
372 ToChilds -> map (\groups' -> map (\g -> (getGroupId g, getGroupNgrams g)) groups')
374 $ groupByField _phylo_groupPeriod groups
376 $ map (\groups' -> map (\g -> (getGroupId g, getGroupNgrams g)) groups')
378 $ groupByField _phylo_groupPeriod groups
379 ToChildsMemory -> undefined
380 ToParentsMemory -> undefined
382 filterByIds :: PhyloGroupId -> [PhyloGroupId] -> [PhyloGroup] -> [PhyloGroup]
383 filterByIds egoId ids groups = filter (\g -> ((getGroupId g) /= egoId) && (elem (getGroupId g) ids)) groups
385 filterByPeriods :: [Period] -> [PhyloGroup] -> [PhyloGroup]
386 filterByPeriods periods groups = filter (\g -> elem (g ^. phylo_groupPeriod) periods) groups
388 filterByNgrams :: Int -> [Int] -> [PhyloGroup] -> [PhyloGroup]
389 filterByNgrams inf ngrams groups =
390 if (length ngrams) > 1
392 filter (\g -> (> inf) $ length $ intersect (ngrams) (getGroupNgrams g)) groups
394 filter (\g -> (not . null) $ intersect (ngrams) (getGroupNgrams g)) groups
397 -- perform the upstream/downstream inter‐temporal matching process group by group
399 reconstructTemporalLinks' :: Int -> [Period] -> Similarity -> Double -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [PhyloGroup] -> [PhyloGroup]
400 reconstructTemporalLinks' frame periods similarity thr docs coocs roots groups =
401 let egos = map (\ego ->
402 let -- 1) find the parents/childs matching periods
403 periodsPar = getNextPeriods ToParents frame (ego ^. phylo_groupPeriod) periods
404 periodsChi = getNextPeriods ToChilds frame (ego ^. phylo_groupPeriod) periods
405 -- 2) find the parents/childs matching candidates
406 candidatesPar = formatCandidates ToParents
407 $ filterByNgrams (getMinSharedNgrams similarity) (getGroupNgrams ego)
408 $ filterByPeriods periodsPar
409 $ filterByIds (getGroupId ego) (findIdsFromNgrams (getGroupNgrams ego) roots) groups
410 candidatesChi = formatCandidates ToChilds
411 $ filterByNgrams (getMinSharedNgrams similarity) (getGroupNgrams ego)
412 $ filterByPeriods periodsChi
413 $ filterByIds (getGroupId ego) (findIdsFromNgrams (getGroupNgrams ego) roots) groups
414 -- 3) find the parents/childs number of docs by years
415 docsPar = filterDocs docs ([(ego ^. phylo_groupPeriod)] ++ periodsPar)
416 docsChi = filterDocs docs ([(ego ^. phylo_groupPeriod)] ++ periodsChi)
417 -- 4) find the parents/child diago by years
418 diagoPar = filterDiago (map coocToDiago coocs) ([(ego ^. phylo_groupPeriod)] ++ periodsPar)
419 diagoChi = filterDiago (map coocToDiago coocs) ([(ego ^. phylo_groupPeriod)] ++ periodsPar)
420 -- 5) match ego to their candidates through time
421 pointersPar = phyloGroupMatching candidatesPar ToParents similarity docsPar diagoPar thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
422 pointersChi = phyloGroupMatching candidatesChi ToParents similarity docsChi diagoChi thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
423 in addPointers ToChilds TemporalPointer pointersChi
424 $ addPointers ToParents TemporalPointer pointersPar
425 $ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
426 $ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego
428 in egos `using` parList rdeepseq
433 -- reconstruct a phylomemetic network from a list of groups and from a given threshold
435 toPhylomemeticNetwork :: Int -> [Period] -> Similarity -> Double -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [PhyloGroup] -> [Branch]
436 toPhylomemeticNetwork timescale periods similarity thr docs coocs roots groups =
437 groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
438 -- $ reconstructTemporalLinks timescale periods similarity thr docs coocs groups
439 $ reconstructTemporalLinks' timescale periods similarity thr docs coocs roots groups
442 ----------------------------
443 -- | Quality Assessment | --
444 ----------------------------
448 -- filter the branches containing x
450 relevantBranches :: Int -> [Branch] -> [Branch]
451 relevantBranches x branches =
452 filter (\groups -> (any (\group -> elem x $ group ^. phylo_groupNgrams) groups)) branches
456 -- compute the accuracy ξ
457 -- the accuracy of a branch relatively to a root x is computed only over the periods where clusters mentionning x in the phylo do exist
459 accuracy :: Int -> [(Date,Date)] -> Branch -> Double
460 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk') / (fromIntegral $ length bk'))
464 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
468 -- compute the recall ρ
470 recall :: Int -> Branch -> [Branch] -> Double
471 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
472 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
476 -- compute the F-score function
478 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
479 fScore lambda x periods bk bx =
480 let rec = recall x bk bx
481 acc = accuracy x periods bk
482 in ((1 + lambda ** 2) * acc * rec)
483 / (((lambda ** 2) * acc + rec))
487 -- compute the number of groups
489 wk :: [PhyloGroup] -> Double
490 wk bk = fromIntegral $ length bk
494 -- compute the recall ρ for all the branches
496 globalRecall :: Map Int Double -> [Branch] -> Double
497 globalRecall freq branches =
503 bx = relevantBranches x branches
504 wks = sum $ map wk bx
505 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
509 pys = sum (elems freq)
513 -- compute the accuracy ξ for all the branches
515 globalAccuracy :: Map Int Double -> [Branch] -> Double
516 globalAccuracy freq branches =
522 bx = relevantBranches x branches
523 -- | periods containing x
524 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
525 wks = sum $ map wk bx
526 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
530 pys = sum (elems freq)
534 -- compute the quality score F(λ)
536 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
537 toPhyloQuality fdt lambda freq branches =
543 let bx = relevantBranches x branches
544 -- | periods containing x
545 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
546 wks = sum $ map wk bx
547 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
548 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
549 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
553 -- pys = sum (elems freq)
556 -------------------------
557 -- | Sea-level Rise | --
558 -------------------------
562 -- attach a rise value to branches & groups metadata
564 riseToMeta :: Double -> [Branch] -> [Branch]
565 riseToMeta rise branches =
566 let break = length branches > 1
569 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [rise]) "breaks"(g ^. phylo_groupMeta))
574 -- attach a thr value to branches & groups metadata
576 thrToMeta :: Double -> [Branch] -> [Branch]
577 thrToMeta thr branches =
579 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
584 -- 1) try the zipper structure https://wiki.haskell.org/Zipper to performe the sea-level rise algorithme
585 -- 2) investigate how the branches order influences the 'separateBranches' function
590 -- sequentially separate each branch for a given threshold and check if it locally increases the quality score
591 -- sequence = [done] | currentBranch | [rest]
592 -- done = all the already separated branches
593 -- rest = all the branches we still have to separate
595 separateBranches :: Double -> Similarity -> Double -> Map Int Double -> Int -> Double -> Double
596 -> Int -> Map Date Double -> Map Date Cooc -> Map Int [PhyloGroupId] -> [Period]
597 -> [(Branch,ShouldTry)] -> (Branch,ShouldTry) -> [(Branch,ShouldTry)]
598 -> [(Branch,ShouldTry)]
599 separateBranches fdt similarity lambda frequency minBranch thr rise timescale docs coocs roots periods done currentBranch rest =
600 let done' = done ++ (if snd currentBranch
602 (if ((null (fst branches')) || (quality > quality'))
603 ---- 5) if the quality is not increased by the new branches or if the new branches are all small
604 ---- then undo the separation and localy stop the sea rise
605 ---- else validate the separation and authorise next sea rise in the long new branches
607 -- trace (" ✗ F(λ) = " <> show(quality) <> " (vs) " <> show(quality')
608 -- <> " | " <> show(length $ fst ego) <> " groups : "
609 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
610 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
611 [(fst currentBranch,False)]
613 -- trace (" ✓ F(λ) = " <> show(quality) <> " (vs) " <> show(quality')
614 -- <> " | " <> show(length $ fst ego) <> " groups : "
615 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
616 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
617 ((map (\e -> (e,True)) (fst branches')) ++ (map (\e -> (e,False)) (snd branches'))))
618 else [currentBranch])
620 -- 6) if there is no more branch to separate tne return [done'] else continue with [rest]
623 else separateBranches fdt similarity lambda frequency minBranch thr rise timescale docs coocs roots periods
624 done' (List.head rest) (List.tail rest)
626 ------- 1) compute the quality before splitting any branch
627 quality :: LocalQuality
628 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst currentBranch] ++ (map fst rest))
630 ------------------- 2) split the current branch and create a new phylomemetic network
631 phylomemeticNetwork :: [Branch]
632 phylomemeticNetwork = toPhylomemeticNetwork timescale periods similarity thr docs coocs roots (fst currentBranch)
634 --------- 3) change the new phylomemetic network into a tuple of new branches
635 --------- on the left : the long branches, on the right : the small ones
636 branches' :: ([Branch],[Branch])
637 branches' = partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
639 $ riseToMeta rise phylomemeticNetwork
641 -------- 4) compute again the quality by considering the new branches
642 quality' :: LocalQuality
643 quality' = toPhyloQuality fdt lambda frequency
644 ((map fst done) ++ (fst branches') ++ (snd branches') ++ (map fst rest))
648 -- perform the sea-level rise algorithm, browse the similarity ladder and check that we can try out the next step
650 seaLevelRise :: Double -> Similarity -> Double -> Int -> Map Int Double
651 -> [Double] -> Double
653 -> Map Date Double -> Map Date Cooc
654 -> Map Int [PhyloGroupId]
655 -> [(Branch,ShouldTry)]
656 -> ([(Branch,ShouldTry)],FinalQuality)
657 seaLevelRise fdt similarity lambda minBranch frequency ladder rise frame periods docs coocs roots branches =
658 -- if the ladder is empty or thr > 1 or there is no branch to break then stop
659 if (null ladder) || ((List.head ladder) > 1) || (stopRise branches)
660 then (branches, toPhyloQuality fdt lambda frequency (map fst branches))
662 -- start breaking up all the possible branches for the current similarity threshold
663 let thr = List.head ladder
664 branches' = trace ("threshold = " <> printf "%.3f" thr
665 <> " F(λ) = " <> printf "%.5f" (toPhyloQuality fdt lambda frequency (map fst branches))
666 <> " ξ = " <> printf "%.5f" (globalAccuracy frequency (map fst branches))
667 <> " ρ = " <> printf "%.5f" (globalRecall frequency (map fst branches))
668 <> " branches = " <> show(length branches))
669 $ separateBranches fdt similarity lambda frequency minBranch thr rise frame docs coocs roots periods
670 [] (List.head branches) (List.tail branches)
671 in seaLevelRise fdt similarity lambda minBranch frequency (List.tail ladder) (rise + 1) frame periods docs coocs roots branches'
674 stopRise :: [(Branch,ShouldTry)] -> Bool
675 stopRise bs = ((not . or) $ map snd bs)
679 -- start the temporal matching process up, recover the resulting branches and update the groups (at scale 1) consequently
681 temporalMatching :: [Double] -> Phylo -> Phylo
682 temporalMatching ladder phylo = updatePhyloGroups 1
683 (Map.fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
684 (updateQuality quality phylo)
687 quality :: FinalQuality
692 branches = map fst $ fst sea
694 --- 2) process the temporal matching by elevating the similarity ladder
695 sea :: ([(Branch,ShouldTry)],FinalQuality)
696 sea = seaLevelRise (fromIntegral $ Vector.length $ getRoots phylo)
697 (similarity $ getConfig phylo)
698 (_qua_granularity $ phyloQuality $ getConfig phylo)
699 (_qua_minBranch $ phyloQuality $ getConfig phylo)
700 (phylo ^. phylo_termFreq)
702 (getTimeFrame $ timeUnit $ getConfig phylo)
704 (phylo ^. phylo_timeDocs)
705 (phylo ^. phylo_timeCooc)
706 ((phylo ^. phylo_foundations) ^. foundations_rootsInGroups)
707 (reverse $ sortOn (length . fst) seabed)
709 ------ 1) for each group, process an initial temporal Matching and create a 'seabed'
710 ------ ShouldTry determines if you should apply the seaLevelRise function again within each branch
711 seabed :: [(Branch,ShouldTry)]
712 seabed = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
713 $ toPhylomemeticNetwork (getTimeFrame $ timeUnit $ getConfig phylo)
715 (similarity $ getConfig phylo)
717 (phylo ^. phylo_timeDocs)
718 (phylo ^. phylo_timeCooc)
719 ((phylo ^. phylo_foundations) ^. foundations_rootsInGroups)
720 (traceTemporalMatching $ getGroupsFromScale 1 phylo)