]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/TemporalMatching.hs
[phylo] Inversion of level range in re-parameterization
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
13
14 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
15 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
16
17 import Gargantext.Prelude
18 import Gargantext.Core.Viz.AdaptativePhylo
19 import Gargantext.Core.Viz.Phylo.PhyloTools
20
21 import Prelude (floor,tan,pi)
22 import Control.Lens hiding (Level)
23 import Control.Parallel.Strategies (parList, rdeepseq, using)
24 import Debug.Trace (trace)
25
26 import Text.Printf
27
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30 import qualified Data.Vector as Vector
31
32
33
34 -------------------
35 -- | Proximity | --
36 -------------------
37
38
39 -- | To compute a jaccard similarity between two lists
40 jaccard :: [Int] -> [Int] -> Double
41 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
42
43
44 -- | Process the inverse sumLog
45 sumInvLog' :: Double -> Double -> [Double] -> Double
46 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + s) / log (nb + s)))) 0 diago
47
48
49 -- | Process the sumLog
50 sumLog' :: Double -> Double -> [Double] -> Double
51 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + s) / log (nb + s))) 0 diago
52
53
54 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
55 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
60 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 diagoInter :: [Double]
70 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
71 --------------------------------------
72 diagoUnion :: [Double]
73 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
74 --------------------------------------
75
76
77 -- | To process the proximity between a current group and a pair of targets group
78 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
79 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
80 case proximity of
81 WeightedLogJaccard sens ->
82 let pairNgrams = if targetNgrams == targetNgrams'
83 then targetNgrams
84 else union targetNgrams targetNgrams'
85 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
86 Hamming -> undefined
87
88
89 ------------------------
90 -- | Local Matching | --
91 ------------------------
92
93 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
94 findLastPeriod fil periods = case fil of
95 ToParents -> head' "findLastPeriod" (sortOn fst periods)
96 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
97
98
99 -- | To filter pairs of candidates related to old pointers periods
100 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
101 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
102 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
103 removeOldPointers oldPointers fil thr prox prd pairs
104 | null oldPointers = pairs
105 | null (filterPointers prox thr oldPointers) =
106 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
107 in if lastMatchedPrd == prd
108 then []
109 else filter (\((id,_),(id',_)) ->
110 case fil of
111 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
112 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
113 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
114 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
115 | otherwise = []
116
117
118 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
119 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
120 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
121 if (null periods)
122 then []
123 else removeOldPointers oldPointers fil thr prox lastPrd
124 {- at least on of the pair candidates should be from the last added period -}
125 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
126 $ listToKeys
127 $ filter (\(id,ngrams) ->
128 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
129 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
130 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
131 ) candidates
132 where
133 lastPrd :: PhyloPeriodId
134 lastPrd = findLastPeriod fil periods
135
136
137 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
138 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
139
140 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
141 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
142
143
144 reduceDiagos :: Map Date Cooc -> Map Int Double
145 reduceDiagos diagos = mapKeys (\(k,_) -> k)
146 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
147
148 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
149 filterPointersByPeriod fil pts =
150 let pts' = sortOn (fst . fst . fst . fst) pts
151 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
152 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
153 in map fst
154 $ nubBy (\pt pt' -> snd pt == snd pt')
155 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
156 $ case fil of
157 ToParents -> reverse pts'
158 ToChilds -> pts'
159
160 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
161 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
162 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
163 if (null $ filterPointers proxi thr oldPointers)
164 {- let's find new pointers -}
165 then if null nextPointers
166 then []
167 else filterPointersByPeriod fil
168 $ head' "phyloGroupMatching"
169 -- Keep only the best set of pointers grouped by proximity
170 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
171 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
172 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
173 else oldPointers
174 where
175 nextPointers :: [[(Pointer,[Int])]]
176 nextPointers = take 1
177 $ dropWhile (null)
178 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
179 $ scanl (\acc groups ->
180 let periods = nub $ map (fst . fst . fst) $ concat groups
181 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
182 diago = reduceDiagos
183 $ filterDiago diagos ([(fst . fst) id] ++ periods)
184 {- important resize nbdocs et diago dans le make pairs -}
185 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
186 in acc ++ ( filterPointers' proxi thr
187 $ concat
188 $ map (\(c,c') ->
189 {- process the proximity between the current group and a pair of candidates -}
190 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
191 in if ((c == c') || (snd c == snd c'))
192 then [((fst c,proximity),snd c)]
193 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
194 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
195
196
197 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
198 filterDocs d pds = restrictKeys d $ periodsToYears pds
199
200 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
201 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
202
203
204 -----------------------------
205 -- | Matching Processing | --
206 -----------------------------
207
208
209 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
210 getNextPeriods fil max' pId pIds =
211 case fil of
212 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
213 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
214
215
216 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
217 getCandidates ego targets =
218 map (\groups' ->
219 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
220 ) groups') targets
221
222
223 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
224 matchGroupsToGroups frame periods proximity thr docs coocs groups =
225 let groups' = groupByField _phylo_groupPeriod groups
226 in foldl' (\acc prd ->
227 let -- 1) find the parents/childs matching periods
228 periodsPar = getNextPeriods ToParents frame prd periods
229 periodsChi = getNextPeriods ToChilds frame prd periods
230 -- 2) find the parents/childs matching candidates
231 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
232 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
233 -- 3) find the parents/child number of docs by years
234 docsPar = filterDocs docs ([prd] ++ periodsPar)
235 docsChi = filterDocs docs ([prd] ++ periodsChi)
236 -- 4) find the parents/child diago by years
237 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
238 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
239 -- 5) match in parallel all the groups (egos) to their possible candidates
240 egos = map (\ego ->
241 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
242 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
243 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
244 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
245 in addPointers ToChilds TemporalPointer pointersChi
246 $ addPointers ToParents TemporalPointer pointersPar ego)
247 $ findWithDefault [] prd groups'
248 egos' = egos `using` parList rdeepseq
249 in acc ++ egos'
250 ) [] periods
251
252
253 -----------------------
254 -- | Phylo Quality | --
255 -----------------------
256
257
258 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
259 relevantBranches term branches =
260 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
261
262 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
263 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
264 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
265 / (fromIntegral $ length bk'))
266 where
267 bk' :: [PhyloGroup]
268 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
269
270 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
271 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
272 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
273
274 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
275 fScore lambda x periods bk bx =
276 let rec = recall x bk bx
277 acc = accuracy x periods bk
278 in ((1 + lambda ** 2) * acc * rec)
279 / (((lambda ** 2) * acc + rec))
280
281
282 wk :: [PhyloGroup] -> Double
283 wk bk = fromIntegral $ length bk
284
285
286 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
287 toPhyloQuality' lambda freq branches =
288 if (null branches)
289 then 0
290 else sum
291 $ map (\i ->
292 let bks = relevantBranches i branches
293 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
294 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore lambda i periods bk bks)) bks))
295 $ keys freq
296
297 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
298 toRecall freq branches =
299 if (null branches)
300 then 0
301 else sum
302 $ map (\x ->
303 let px = freq ! x
304 bx = relevantBranches x branches
305 wks = sum $ map wk bx
306 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
307 $ keys freq
308 where
309 pys :: Double
310 pys = sum (elems freq)
311
312
313 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
314 toAccuracy freq branches =
315 if (null branches)
316 then 0
317 else sum
318 $ map (\x ->
319 let px = freq ! x
320 bx = relevantBranches x branches
321 -- | periods containing x
322 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
323 wks = sum $ map wk bx
324 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
325 $ keys freq
326 where
327 pys :: Double
328 pys = sum (elems freq)
329
330
331 -- | here we do the average of all the local f_scores
332 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
333 toPhyloQuality fdt lambda freq branches =
334 if (null branches)
335 then 0
336 else sum
337 $ map (\x ->
338 -- let px = freq ! x
339 let bx = relevantBranches x branches
340 -- | periods containing x
341 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
342 wks = sum $ map wk bx
343 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
344 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
345 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
346 $ keys freq
347 -- where
348 -- pys :: Double
349 -- pys = sum (elems freq)
350
351 -- 1 / nb de foundation
352
353 ------------------------------------
354 -- | Constant Temporal Matching | --
355 ------------------------------------
356
357
358 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
359 groupsToBranches' groups =
360 {- run the related component algorithm -}
361 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
362 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
363 $ map (\group -> [getGroupId group]
364 ++ (map fst $ group ^. phylo_groupPeriodParents)
365 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
366 -- first find the related components by inside each ego's period
367 -- a supprimer
368 graph' = map relatedComponents egos
369 -- then run it for the all the periods
370 graph = zip [1..]
371 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
372 -- update each group's branch id
373 in map (\(bId,ids) ->
374 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
375 $ elems $ restrictKeys groups (Set.fromList ids)
376 in groups' `using` parList rdeepseq ) graph
377
378
379 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
380 reduceFrequency frequency branches =
381 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
382
383 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
384 updateThr thr branches = map (\b -> map (\g ->
385 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
386
387
388 -- Sequentially break each branch of a phylo where
389 -- done = all the allready broken branches
390 -- ego = the current branch we want to break
391 -- rest = the branches we still have to break
392 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
393 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
394 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
395 -- 1) keep or not the new division of ego
396 let done' = done ++ (if snd ego
397 then
398 (if ((null (fst ego')) || (quality > quality'))
399 then
400 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
401 -- <> " | " <> show(length $ fst ego) <> " groups : "
402 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
403 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
404 [(fst ego,False)]
405 else
406 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
407 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
408 -- <> " | " <> show(length $ fst ego) <> " groups : "
409 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
410 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
411 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
412 else [ego])
413 in
414 -- 2) if there is no more branches in rest then return else continue
415 if null rest
416 then done'
417 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
418 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
419 where
420 --------------------------------------
421 quality :: Double
422 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
423 --------------------------------------
424 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
425 ego' =
426 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
427 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
428 branches' = branches `using` parList rdeepseq
429 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
430 $ thrToMeta thr
431 $ depthToMeta (elevation - depth) branches'
432 --------------------------------------
433 quality' :: Double
434 quality' = toPhyloQuality fdt lambda frequency
435 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
436
437
438 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
439 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
440 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
441 -- if there is no branch to break or if seaLvl level > 1 then end
442 if (thr >= 1) || ((not . or) $ map snd branches)
443 then branches
444 else
445 -- break all the possible branches at the current seaLvl level
446 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
447 acc = toAccuracy frequency (map fst branches)
448 rec = toRecall frequency (map fst branches)
449 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
450 <> " ξ = " <> printf "%.5f" acc
451 <> " ρ = " <> printf "%.5f" rec
452 <> " branches = " <> show(length branches) <> " ↴")
453 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
454 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
455 frequency' = reduceFrequency frequency (map fst branches')
456 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
457
458
459 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
460 constanteTemporalMatching start step phylo = updatePhyloGroups 1
461 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
462 (toPhyloHorizon phylo)
463 where
464 -- 2) process the temporal matching by elevating seaLvl level
465 branches :: [[PhyloGroup]]
466 branches = map fst
467 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
468 (phyloProximity $ getConfig phylo)
469 (_qua_granularity $ phyloQuality $ getConfig phylo)
470 (_qua_minBranch $ phyloQuality $ getConfig phylo)
471 (phylo ^. phylo_termFreq)
472 start step
473 ((((1 - start) / step) - 1))
474 (((1 - start) / step))
475 (getTimeFrame $ timeUnit $ getConfig phylo)
476 (getPeriodIds phylo)
477 (phylo ^. phylo_timeDocs)
478 (phylo ^. phylo_timeCooc)
479 (reverse $ sortOn (length . fst) groups)
480 -- 1) for each group process an initial temporal Matching
481 -- here we suppose that all the groups of level 1 are part of the same big branch
482 groups :: [([PhyloGroup],Bool)]
483 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
484 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
485 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
486 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
487 start
488 (phylo ^. phylo_timeDocs)
489 (phylo ^. phylo_timeCooc)
490 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
491
492 -----------------
493 -- | Horizon | --
494 -----------------
495
496 toPhyloHorizon :: Phylo -> Phylo
497 toPhyloHorizon phylo =
498 let t0 = take 1 (getPeriodIds phylo)
499 groups = getGroupsFromLevelPeriods 1 t0 phylo
500 sens = getSensibility (phyloProximity $ getConfig phylo)
501 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
502 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
503 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
504 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
505
506
507 --------------------------------------
508 -- | Adaptative Temporal Matching | --
509 --------------------------------------
510
511
512 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
513 thrToMeta thr branches =
514 map (\b ->
515 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
516
517 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
518 depthToMeta depth branches =
519 let break = length branches > 1
520 in map (\b ->
521 map (\g ->
522 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
523 else g) b) branches
524
525 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
526 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
527
528
529 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
530 getInTupleMap m k k'
531 | isJust (m !? ( k ,k')) = m ! ( k ,k')
532 | isJust (m !? ( k',k )) = m ! ( k',k )
533 | otherwise = 0
534
535
536 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
537 toThreshold lvl proxiGroups =
538 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
539 in if idx >= 0
540 then (sort $ elems proxiGroups) !! idx
541 else 1
542
543
544 -- done = all the allready broken branches
545 -- ego = the current branch we want to break
546 -- rest = the branches we still have to break
547 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
548 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
549 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
550 -> [([PhyloGroup],(Bool,[Double]))]
551 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
552 -- 1) keep or not the new division of ego
553 let done' = done ++ (if (fst . snd) ego
554 then (if ((null (fst ego')) || (quality > quality'))
555 then
556 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
557 else
558 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
559 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
560 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
561 in
562 -- uncomment let .. in for debugging
563 -- let part1 = partition (snd) done'
564 -- part2 = partition (snd) rest
565 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
566 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
567 -- ) $
568 -- 2) if there is no more branches in rest then return else continue
569 if null rest
570 then done'
571 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
572 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
573 where
574 --------------------------------------
575 thr :: Double
576 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
577 --------------------------------------
578 quality :: Double
579 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
580 --------------------------------------
581 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
582 ego' =
583 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
584 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
585 branches' = branches `using` parList rdeepseq
586 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
587 $ thrToMeta thr
588 $ depthToMeta (elevation - depth) branches'
589 --------------------------------------
590 quality' :: Double
591 quality' = toPhyloQuality fdt lambda frequency
592 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
593
594
595 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
596 -> Double -> Int -> Map Int Double
597 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
598 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
599 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
600 -- if there is no branch to break or if seaLvl level >= depth then end
601 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
602 then branches
603 else
604 -- break all the possible branches at the current seaLvl level
605 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
606 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
607 frequency' = reduceFrequency frequency (map fst branches')
608 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
609 -- thr = toThreshold depth groupsProxi
610 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
611 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
612 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
613 <> " thr = ")
614 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
615
616
617 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
618 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
619 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
620 (toPhyloHorizon phylo)
621 where
622 -- 2) process the temporal matching by elevating seaLvl level
623 branches :: [[PhyloGroup]]
624 branches = map fst
625 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
626 (phyloProximity $ getConfig phylo)
627 (elevation - 1)
628 elevation
629 (phylo ^. phylo_groupsProxi)
630 (_qua_granularity $ phyloQuality $ getConfig phylo)
631 (_qua_minBranch $ phyloQuality $ getConfig phylo)
632 (phylo ^. phylo_termFreq)
633 (getTimeFrame $ timeUnit $ getConfig phylo)
634 (getPeriodIds phylo)
635 (phylo ^. phylo_timeDocs)
636 (phylo ^. phylo_timeCooc)
637 groups
638 -- 1) for each group process an initial temporal Matching
639 -- here we suppose that all the groups of level 1 are part of the same big branch
640 groups :: [([PhyloGroup],(Bool,[Double]))]
641 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
642 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
643 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
644 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
645 thr
646 (phylo ^. phylo_timeDocs)
647 (phylo ^. phylo_timeCooc)
648 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
649 --------------------------------------
650 thr :: Double
651 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)