]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/TemporalMatching.hs
parameter change in the sensibility for WeightedLogSim and WeightedLogJaccard similar...
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
13
14 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
15 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
16
17 import Gargantext.Prelude
18 import Gargantext.Core.Viz.AdaptativePhylo
19 import Gargantext.Core.Viz.Phylo.PhyloTools
20
21 import Prelude (floor,tan,pi)
22 import Control.Lens hiding (Level)
23 import Control.Parallel.Strategies (parList, rdeepseq, using)
24 import Debug.Trace (trace)
25
26 import Text.Printf
27
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30 import qualified Data.Vector as Vector
31
32
33
34 -------------------
35 -- | Proximity | --
36 -------------------
37
38
39 -- | To compute a jaccard similarity between two lists
40 jaccard :: [Int] -> [Int] -> Double
41 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
42
43
44 -- | Process the inverse sumLog
45 sumInvLog' :: Double -> Double -> [Double] -> Double
46 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/s) / log (nb + 1/s)))) 0 diago
47
48
49 -- | Process the sumLog
50 sumLog' :: Double -> Double -> [Double] -> Double
51 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/s) / log (nb + 1/s))) 0 diago
52
53
54 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
55 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
60 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 diagoInter :: [Double]
70 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
71 --------------------------------------
72 diagoUnion :: [Double]
73 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
74 --------------------------------------
75
76 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
77 -- tests not conclusive
78 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
79 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
80 | null ngramsInter = 0
81 | ngramsInter == ngramsUnion = 1
82 | sens == 0 = jaccard ngramsInter ngramsUnion
83 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
84 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
85 where
86 --------------------------------------
87 ngramsInter :: [Int]
88 ngramsInter = intersect ego_ngrams target_ngrams
89 --------------------------------------
90 ngramsUnion :: [Int]
91 ngramsUnion = union ego_ngrams target_ngrams
92 --------------------------------------
93 diagoInter :: [Double]
94 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
95 --------------------------------------
96 diagoEgo :: [Double]
97 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
98 --------------------------------------
99 diagoTarget :: [Double]
100 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
101 --------------------------------------
102
103 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
104 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
105 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
106 case proximity of
107 WeightedLogJaccard sens ->
108 let pairNgrams = if targetNgrams == targetNgrams'
109 then targetNgrams
110 else union targetNgrams targetNgrams'
111 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
112 WeightedLogSim sens ->
113 let pairNgrams = if targetNgrams == targetNgrams'
114 then targetNgrams
115 else union targetNgrams targetNgrams'
116 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
117 Hamming -> undefined
118
119 ------------------------
120 -- | Local Matching | --
121 ------------------------
122
123 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
124 findLastPeriod fil periods = case fil of
125 ToParents -> head' "findLastPeriod" (sortOn fst periods)
126 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
127
128
129 -- | To filter pairs of candidates related to old pointers periods
130 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
131 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
132 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
133 removeOldPointers oldPointers fil thr prox prd pairs
134 | null oldPointers = pairs
135 | null (filterPointers prox thr oldPointers) =
136 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
137 in if lastMatchedPrd == prd
138 then []
139 else filter (\((id,_),(id',_)) ->
140 case fil of
141 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
142 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
143 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
144 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
145 | otherwise = []
146
147
148 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
149 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
150 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
151 if (null periods)
152 then []
153 else removeOldPointers oldPointers fil thr prox lastPrd
154 {- at least on of the pair candidates should be from the last added period -}
155 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
156 $ listToKeys
157 $ filter (\(id,ngrams) ->
158 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
159 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
160 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
161 ) candidates
162 where
163 lastPrd :: PhyloPeriodId
164 lastPrd = findLastPeriod fil periods
165
166
167 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
168 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
169
170 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
171 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
172
173
174 reduceDiagos :: Map Date Cooc -> Map Int Double
175 reduceDiagos diagos = mapKeys (\(k,_) -> k)
176 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
177
178 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
179 filterPointersByPeriod fil pts =
180 let pts' = sortOn (fst . fst . fst . fst) pts
181 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
182 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
183 in map fst
184 $ nubBy (\pt pt' -> snd pt == snd pt')
185 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
186 $ case fil of
187 ToParents -> reverse pts'
188 ToChilds -> pts'
189
190 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
191 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
192 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
193 if (null $ filterPointers proxi thr oldPointers)
194 {- let's find new pointers -}
195 then if null nextPointers
196 then []
197 else filterPointersByPeriod fil
198 $ head' "phyloGroupMatching"
199 -- Keep only the best set of pointers grouped by proximity
200 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
201 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
202 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
203 else oldPointers
204 where
205 nextPointers :: [[(Pointer,[Int])]]
206 nextPointers = take 1
207 $ dropWhile (null)
208 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
209 $ scanl (\acc groups ->
210 let periods = nub $ map (fst . fst . fst) $ concat groups
211 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
212 diago = reduceDiagos
213 $ filterDiago diagos ([(fst . fst) id] ++ periods)
214 {- important resize nbdocs et diago dans le make pairs -}
215 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
216 in acc ++ ( filterPointers' proxi thr
217 $ concat
218 $ map (\(c,c') ->
219 {- process the proximity between the current group and a pair of candidates -}
220 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
221 in if ((c == c') || (snd c == snd c'))
222 then [((fst c,proximity),snd c)]
223 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
224 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
225
226
227 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
228 filterDocs d pds = restrictKeys d $ periodsToYears pds
229
230 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
231 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
232
233
234 -----------------------------
235 -- | Matching Processing | --
236 -----------------------------
237
238
239 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
240 getNextPeriods fil max' pId pIds =
241 case fil of
242 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
243 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
244
245
246 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
247 getCandidates ego targets =
248 map (\groups' ->
249 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
250 ) groups') targets
251
252
253 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
254 matchGroupsToGroups frame periods proximity thr docs coocs groups =
255 let groups' = groupByField _phylo_groupPeriod groups
256 in foldl' (\acc prd ->
257 let -- 1) find the parents/childs matching periods
258 periodsPar = getNextPeriods ToParents frame prd periods
259 periodsChi = getNextPeriods ToChilds frame prd periods
260 -- 2) find the parents/childs matching candidates
261 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
262 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
263 -- 3) find the parents/child number of docs by years
264 docsPar = filterDocs docs ([prd] ++ periodsPar)
265 docsChi = filterDocs docs ([prd] ++ periodsChi)
266 -- 4) find the parents/child diago by years
267 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
268 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
269 -- 5) match in parallel all the groups (egos) to their possible candidates
270 egos = map (\ego ->
271 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
272 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
273 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
274 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
275 in addPointers ToChilds TemporalPointer pointersChi
276 $ addPointers ToParents TemporalPointer pointersPar ego)
277 $ findWithDefault [] prd groups'
278 egos' = egos `using` parList rdeepseq
279 in acc ++ egos'
280 ) [] periods
281
282
283 -----------------------
284 -- | Phylo Quality | --
285 -----------------------
286
287
288 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
289 relevantBranches term branches =
290 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
291
292 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
293 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
294 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
295 / (fromIntegral $ length bk'))
296 where
297 bk' :: [PhyloGroup]
298 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
299
300 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
301 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
302 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
303
304 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
305 fScore lambda x periods bk bx =
306 let rec = recall x bk bx
307 acc = accuracy x periods bk
308 in ((1 + lambda ** 2) * acc * rec)
309 / (((lambda ** 2) * acc + rec))
310
311
312 wk :: [PhyloGroup] -> Double
313 wk bk = fromIntegral $ length bk
314
315
316 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
317 toPhyloQuality' lambda freq branches =
318 if (null branches)
319 then 0
320 else sum
321 $ map (\i ->
322 let bks = relevantBranches i branches
323 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
324 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore lambda i periods bk bks)) bks))
325 $ keys freq
326
327 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
328 toRecall freq branches =
329 if (null branches)
330 then 0
331 else sum
332 $ map (\x ->
333 let px = freq ! x
334 bx = relevantBranches x branches
335 wks = sum $ map wk bx
336 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
337 $ keys freq
338 where
339 pys :: Double
340 pys = sum (elems freq)
341
342
343 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
344 toAccuracy freq branches =
345 if (null branches)
346 then 0
347 else sum
348 $ map (\x ->
349 let px = freq ! x
350 bx = relevantBranches x branches
351 -- | periods containing x
352 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
353 wks = sum $ map wk bx
354 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
355 $ keys freq
356 where
357 pys :: Double
358 pys = sum (elems freq)
359
360
361 -- | here we do the average of all the local f_scores
362 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
363 toPhyloQuality fdt lambda freq branches =
364 if (null branches)
365 then 0
366 else sum
367 $ map (\x ->
368 -- let px = freq ! x
369 let bx = relevantBranches x branches
370 -- | periods containing x
371 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
372 wks = sum $ map wk bx
373 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
374 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
375 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
376 $ keys freq
377 -- where
378 -- pys :: Double
379 -- pys = sum (elems freq)
380
381 -- 1 / nb de foundation
382
383 ------------------------------------
384 -- | Constant Temporal Matching | --
385 ------------------------------------
386
387
388 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
389 groupsToBranches' groups =
390 {- run the related component algorithm -}
391 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
392 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
393 $ map (\group -> [getGroupId group]
394 ++ (map fst $ group ^. phylo_groupPeriodParents)
395 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
396 -- first find the related components by inside each ego's period
397 -- a supprimer
398 graph' = map relatedComponents egos
399 -- then run it for the all the periods
400 graph = zip [1..]
401 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
402 -- update each group's branch id
403 in map (\(bId,ids) ->
404 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
405 $ elems $ restrictKeys groups (Set.fromList ids)
406 in groups' `using` parList rdeepseq ) graph
407
408
409 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
410 reduceFrequency frequency branches =
411 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
412
413 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
414 updateThr thr branches = map (\b -> map (\g ->
415 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
416
417
418 -- Sequentially break each branch of a phylo where
419 -- done = all the allready broken branches
420 -- ego = the current branch we want to break
421 -- rest = the branches we still have to break
422 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
423 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
424 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
425 -- 1) keep or not the new division of ego
426 let done' = done ++ (if snd ego
427 then
428 (if ((null (fst ego')) || (quality > quality'))
429 then
430 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
431 -- <> " | " <> show(length $ fst ego) <> " groups : "
432 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
433 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
434 [(fst ego,False)]
435 else
436 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
437 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
438 -- <> " | " <> show(length $ fst ego) <> " groups : "
439 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
440 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
441 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
442 else [ego])
443 in
444 -- 2) if there is no more branches in rest then return else continue
445 if null rest
446 then done'
447 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
448 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
449 where
450 --------------------------------------
451 quality :: Double
452 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
453 --------------------------------------
454 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
455 ego' =
456 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
457 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
458 branches' = branches `using` parList rdeepseq
459 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
460 $ thrToMeta thr
461 $ depthToMeta (elevation - depth) branches'
462 --------------------------------------
463 quality' :: Double
464 quality' = toPhyloQuality fdt lambda frequency
465 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
466
467
468 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
469 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
470 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
471 -- if there is no branch to break or if seaLvl level > 1 then end
472 if (thr >= 1) || ((not . or) $ map snd branches)
473 then branches
474 else
475 -- break all the possible branches at the current seaLvl level
476 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
477 acc = toAccuracy frequency (map fst branches)
478 rec = toRecall frequency (map fst branches)
479 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
480 <> " ξ = " <> printf "%.5f" acc
481 <> " ρ = " <> printf "%.5f" rec
482 <> " branches = " <> show(length branches) <> " ↴")
483 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
484 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
485 frequency' = reduceFrequency frequency (map fst branches')
486 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
487
488
489 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
490 constanteTemporalMatching start step phylo = updatePhyloGroups 1
491 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
492 (toPhyloHorizon phylo)
493 where
494 -- 2) process the temporal matching by elevating seaLvl level
495 branches :: [[PhyloGroup]]
496 branches = map fst
497 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
498 (phyloProximity $ getConfig phylo)
499 (_qua_granularity $ phyloQuality $ getConfig phylo)
500 (_qua_minBranch $ phyloQuality $ getConfig phylo)
501 (phylo ^. phylo_termFreq)
502 start step
503 ((((1 - start) / step) - 1))
504 (((1 - start) / step))
505 (getTimeFrame $ timeUnit $ getConfig phylo)
506 (getPeriodIds phylo)
507 (phylo ^. phylo_timeDocs)
508 (phylo ^. phylo_timeCooc)
509 (reverse $ sortOn (length . fst) groups)
510 -- 1) for each group process an initial temporal Matching
511 -- here we suppose that all the groups of level 1 are part of the same big branch
512 groups :: [([PhyloGroup],Bool)]
513 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
514 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
515 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
516 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
517 start
518 (phylo ^. phylo_timeDocs)
519 (phylo ^. phylo_timeCooc)
520 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
521
522 -----------------
523 -- | Horizon | --
524 -----------------
525
526 toPhyloHorizon :: Phylo -> Phylo
527 toPhyloHorizon phylo =
528 let t0 = take 1 (getPeriodIds phylo)
529 groups = getGroupsFromLevelPeriods 1 t0 phylo
530 sens = getSensibility (phyloProximity $ getConfig phylo)
531 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
532 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
533 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
534 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
535
536
537 --------------------------------------
538 -- | Adaptative Temporal Matching | --
539 --------------------------------------
540
541
542 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
543 thrToMeta thr branches =
544 map (\b ->
545 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
546
547 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
548 depthToMeta depth branches =
549 let break = length branches > 1
550 in map (\b ->
551 map (\g ->
552 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
553 else g) b) branches
554
555 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
556 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
557
558
559 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
560 getInTupleMap m k k'
561 | isJust (m !? ( k ,k')) = m ! ( k ,k')
562 | isJust (m !? ( k',k )) = m ! ( k',k )
563 | otherwise = 0
564
565
566 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
567 toThreshold lvl proxiGroups =
568 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
569 in if idx >= 0
570 then (sort $ elems proxiGroups) !! idx
571 else 1
572
573
574 -- done = all the allready broken branches
575 -- ego = the current branch we want to break
576 -- rest = the branches we still have to break
577 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
578 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
579 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
580 -> [([PhyloGroup],(Bool,[Double]))]
581 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
582 -- 1) keep or not the new division of ego
583 let done' = done ++ (if (fst . snd) ego
584 then (if ((null (fst ego')) || (quality > quality'))
585 then
586 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
587 else
588 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
589 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
590 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
591 in
592 -- uncomment let .. in for debugging
593 -- let part1 = partition (snd) done'
594 -- part2 = partition (snd) rest
595 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
596 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
597 -- ) $
598 -- 2) if there is no more branches in rest then return else continue
599 if null rest
600 then done'
601 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
602 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
603 where
604 --------------------------------------
605 thr :: Double
606 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
607 --------------------------------------
608 quality :: Double
609 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
610 --------------------------------------
611 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
612 ego' =
613 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
614 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
615 branches' = branches `using` parList rdeepseq
616 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
617 $ thrToMeta thr
618 $ depthToMeta (elevation - depth) branches'
619 --------------------------------------
620 quality' :: Double
621 quality' = toPhyloQuality fdt lambda frequency
622 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
623
624
625 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
626 -> Double -> Int -> Map Int Double
627 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
628 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
629 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
630 -- if there is no branch to break or if seaLvl level >= depth then end
631 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
632 then branches
633 else
634 -- break all the possible branches at the current seaLvl level
635 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
636 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
637 frequency' = reduceFrequency frequency (map fst branches')
638 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
639 -- thr = toThreshold depth groupsProxi
640 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
641 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
642 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
643 <> " thr = ")
644 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
645
646
647 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
648 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
649 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
650 (toPhyloHorizon phylo)
651 where
652 -- 2) process the temporal matching by elevating seaLvl level
653 branches :: [[PhyloGroup]]
654 branches = map fst
655 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
656 (phyloProximity $ getConfig phylo)
657 (elevation - 1)
658 elevation
659 (phylo ^. phylo_groupsProxi)
660 (_qua_granularity $ phyloQuality $ getConfig phylo)
661 (_qua_minBranch $ phyloQuality $ getConfig phylo)
662 (phylo ^. phylo_termFreq)
663 (getTimeFrame $ timeUnit $ getConfig phylo)
664 (getPeriodIds phylo)
665 (phylo ^. phylo_timeDocs)
666 (phylo ^. phylo_timeCooc)
667 groups
668 -- 1) for each group process an initial temporal Matching
669 -- here we suppose that all the groups of level 1 are part of the same big branch
670 groups :: [([PhyloGroup],(Bool,[Double]))]
671 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
672 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
673 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
674 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
675 thr
676 (phylo ^. phylo_timeDocs)
677 (phylo ^. phylo_timeCooc)
678 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
679 --------------------------------------
680 thr :: Double
681 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)