]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/TemporalMatching.hs
process the ancestors before the synchronic clustering
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
13
14 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
15 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
16
17 import Gargantext.Prelude
18 import Gargantext.Core.Viz.AdaptativePhylo
19 import Gargantext.Core.Viz.Phylo.PhyloTools
20
21 import Prelude (floor)
22 import Control.Lens hiding (Level)
23 import Control.Parallel.Strategies (parList, rdeepseq, using)
24 import Debug.Trace (trace)
25
26 import Text.Printf
27
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30 import qualified Data.Vector as Vector
31
32
33
34 -------------------
35 -- | Proximity | --
36 -------------------
37
38
39 -- | To compute a jaccard similarity between two lists
40 jaccard :: [Int] -> [Int] -> Double
41 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
42
43
44 -- | Process the inverse sumLog
45 sumInvLog' :: Double -> Double -> [Double] -> Double
46 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + s) / log (nb + s)))) 0 diago
47
48
49 -- | Process the sumLog
50 sumLog' :: Double -> Double -> [Double] -> Double
51 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + s) / log (nb + s))) 0 diago
52
53
54 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
55 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
60 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 diagoInter :: [Double]
70 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
71 --------------------------------------
72 diagoUnion :: [Double]
73 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
74 --------------------------------------
75
76
77 -- | To process the proximity between a current group and a pair of targets group
78 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
79 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
80 case proximity of
81 WeightedLogJaccard sens ->
82 let pairNgrams = if targetNgrams == targetNgrams'
83 then targetNgrams
84 else union targetNgrams targetNgrams'
85 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
86 Hamming -> undefined
87
88
89 ------------------------
90 -- | Local Matching | --
91 ------------------------
92
93 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
94 findLastPeriod fil periods = case fil of
95 ToParents -> head' "findLastPeriod" (sortOn fst periods)
96 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
97
98
99 -- | To filter pairs of candidates related to old pointers periods
100 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
101 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
102 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
103 removeOldPointers oldPointers fil thr prox prd pairs
104 | null oldPointers = pairs
105 | null (filterPointers prox thr oldPointers) =
106 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
107 in if lastMatchedPrd == prd
108 then []
109 else filter (\((id,_),(id',_)) ->
110 case fil of
111 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
112 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
113 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
114 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
115 | otherwise = []
116
117
118 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
119 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
120 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
121 if (null periods)
122 then []
123 else removeOldPointers oldPointers fil thr prox lastPrd
124 {- at least on of the pair candidates should be from the last added period -}
125 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
126 $ listToKeys
127 $ filter (\(id,ngrams) ->
128 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
129 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
130 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
131 ) candidates
132 where
133 lastPrd :: PhyloPeriodId
134 lastPrd = findLastPeriod fil periods
135
136
137 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
138 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
139
140 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
141 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
142
143
144 reduceDiagos :: Map Date Cooc -> Map Int Double
145 reduceDiagos diagos = mapKeys (\(k,_) -> k)
146 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
147
148 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
149 filterPointersByPeriod fil pts =
150 let pts' = sortOn (fst . fst . fst . fst) pts
151 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
152 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
153 in map fst
154 $ nubBy (\pt pt' -> snd pt == snd pt')
155 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
156 $ case fil of
157 ToParents -> reverse pts'
158 ToChilds -> pts'
159
160 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
161 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
162 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
163 if (null $ filterPointers proxi thr oldPointers)
164 {- let's find new pointers -}
165 then if null nextPointers
166 then []
167 else filterPointersByPeriod fil
168 $ head' "phyloGroupMatching"
169 -- Keep only the best set of pointers grouped by proximity
170 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
171 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
172 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
173 else oldPointers
174 where
175 nextPointers :: [[(Pointer,[Int])]]
176 nextPointers = take 1
177 $ dropWhile (null)
178 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
179 $ scanl (\acc groups ->
180 let periods = nub $ map (fst . fst . fst) $ concat groups
181 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
182 diago = reduceDiagos
183 $ filterDiago diagos ([(fst . fst) id] ++ periods)
184 {- important resize nbdocs et diago dans le make pairs -}
185 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
186 in acc ++ ( filterPointers' proxi thr
187 $ concat
188 $ map (\(c,c') ->
189 {- process the proximity between the current group and a pair of candidates -}
190 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
191 in if ((c == c') || (snd c == snd c'))
192 then [((fst c,proximity),snd c)]
193 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
194 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
195
196
197 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
198 filterDocs d pds = restrictKeys d $ periodsToYears pds
199
200 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
201 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
202
203
204 -----------------------------
205 -- | Matching Processing | --
206 -----------------------------
207
208
209 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
210 getNextPeriods fil max' pId pIds =
211 case fil of
212 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
213 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
214
215
216 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
217 getCandidates ego targets =
218 map (\groups' ->
219 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
220 ) groups') targets
221
222
223 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
224 matchGroupsToGroups frame periods proximity thr docs coocs groups =
225 let groups' = groupByField _phylo_groupPeriod groups
226 in foldl' (\acc prd ->
227 let -- 1) find the parents/childs matching periods
228 periodsPar = getNextPeriods ToParents frame prd periods
229 periodsChi = getNextPeriods ToChilds frame prd periods
230 -- 2) find the parents/childs matching candidates
231 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
232 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
233 -- 3) find the parents/child number of docs by years
234 docsPar = filterDocs docs ([prd] ++ periodsPar)
235 docsChi = filterDocs docs ([prd] ++ periodsChi)
236 -- 4) find the parents/child diago by years
237 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
238 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
239 -- 5) match in parallel all the groups (egos) to their possible candidates
240 egos = map (\ego ->
241 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
242 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
243 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
244 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
245 in addPointers ToChilds TemporalPointer pointersChi
246 $ addPointers ToParents TemporalPointer pointersPar ego)
247 $ findWithDefault [] prd groups'
248 egos' = egos `using` parList rdeepseq
249 in acc ++ egos'
250 ) [] periods
251
252
253 -----------------------
254 -- | Phylo Quality | --
255 -----------------------
256
257
258 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
259 relevantBranches term branches =
260 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
261
262 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
263 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
264 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
265 / (fromIntegral $ length bk'))
266 where
267 bk' :: [PhyloGroup]
268 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
269
270 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
271 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
272 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
273
274 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
275 fScore beta x periods bk bx =
276 let rec = recall x bk bx
277 acc = accuracy x periods bk
278 in ((1 + beta ** 2) * acc * rec)
279 / (((beta ** 2) * rec + acc))
280
281
282 wk :: [PhyloGroup] -> Double
283 wk bk = fromIntegral $ length bk
284
285
286 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
287 toPhyloQuality' beta freq branches =
288 if (null branches)
289 then 0
290 else sum
291 $ map (\i ->
292 let bks = relevantBranches i branches
293 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
294 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i periods bk bks)) bks))
295 $ keys freq
296
297 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
298 toRecall freq branches =
299 if (null branches)
300 then 0
301 else sum
302 $ map (\x ->
303 let px = freq ! x
304 bx = relevantBranches x branches
305 wks = sum $ map wk bx
306 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
307 $ keys freq
308 where
309 pys :: Double
310 pys = sum (elems freq)
311
312
313 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
314 toAccuracy freq branches =
315 if (null branches)
316 then 0
317 else sum
318 $ map (\x ->
319 let px = freq ! x
320 bx = relevantBranches x branches
321 -- | periods containing x
322 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
323 wks = sum $ map wk bx
324 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
325 $ keys freq
326 where
327 pys :: Double
328 pys = sum (elems freq)
329
330
331 -- | here we do the average of all the local f_scores
332 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
333 toPhyloQuality fdt beta freq branches =
334 if (null branches)
335 then 0
336 else sum
337 $ map (\x ->
338 -- let px = freq ! x
339 let bx = relevantBranches x branches
340 -- | periods containing x
341 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
342 wks = sum $ map wk bx
343 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
344 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
345 $ keys freq
346 -- where
347 -- pys :: Double
348 -- pys = sum (elems freq)
349
350 -- 1 / nb de foundation
351
352 ------------------------------------
353 -- | Constant Temporal Matching | --
354 ------------------------------------
355
356
357 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
358 groupsToBranches' groups =
359 {- run the related component algorithm -}
360 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
361 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
362 $ map (\group -> [getGroupId group]
363 ++ (map fst $ group ^. phylo_groupPeriodParents)
364 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
365 -- first find the related components by inside each ego's period
366 -- a supprimer
367 graph' = map relatedComponents egos
368 -- then run it for the all the periods
369 graph = zip [1..]
370 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
371 -- update each group's branch id
372 in map (\(bId,ids) ->
373 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
374 $ elems $ restrictKeys groups (Set.fromList ids)
375 in groups' `using` parList rdeepseq ) graph
376
377
378 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
379 reduceFrequency frequency branches =
380 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
381
382 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
383 updateThr thr branches = map (\b -> map (\g ->
384 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
385
386
387 -- Sequentially break each branch of a phylo where
388 -- done = all the allready broken branches
389 -- ego = the current branch we want to break
390 -- rest = the branches we still have to break
391 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
392 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
393 breakBranches fdt proximity beta frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
394 -- 1) keep or not the new division of ego
395 let done' = done ++ (if snd ego
396 then
397 (if ((null (fst ego')) || (quality > quality'))
398 then
399 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
400 -- <> " | " <> show(length $ fst ego) <> " groups : "
401 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
402 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
403 [(fst ego,False)]
404 else
405 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
406 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
407 -- <> " | " <> show(length $ fst ego) <> " groups : "
408 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
409 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
410 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
411 else [ego])
412 in
413 -- 2) if there is no more branches in rest then return else continue
414 if null rest
415 then done'
416 else breakBranches fdt proximity beta frequency minBranch thr depth elevation frame docs coocs periods
417 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
418 where
419 --------------------------------------
420 quality :: Double
421 quality = toPhyloQuality fdt beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
422 --------------------------------------
423 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
424 ego' =
425 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
426 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
427 branches' = branches `using` parList rdeepseq
428 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
429 $ thrToMeta thr
430 $ depthToMeta (elevation - depth) branches'
431 --------------------------------------
432 quality' :: Double
433 quality' = toPhyloQuality fdt beta frequency
434 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
435
436
437 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
438 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
439 seaLevelMatching fdt proximity beta minBranch frequency thr step depth elevation frame periods docs coocs branches =
440 -- if there is no branch to break or if seaLvl level > 1 then end
441 if (thr >= 1) || ((not . or) $ map snd branches)
442 then branches
443 else
444 -- break all the possible branches at the current seaLvl level
445 let quality = toPhyloQuality fdt beta frequency (map fst branches)
446 acc = toAccuracy frequency (map fst branches)
447 rec = toRecall frequency (map fst branches)
448 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(β) = " <> printf "%.5f" quality
449 <> " ξ = " <> printf "%.5f" acc
450 <> " ρ = " <> printf "%.5f" rec
451 <> " branches = " <> show(length branches) <> " ↴")
452 $ breakBranches fdt proximity beta frequency minBranch thr depth elevation frame docs coocs periods
453 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
454 frequency' = reduceFrequency frequency (map fst branches')
455 in seaLevelMatching fdt proximity beta minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
456
457
458 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
459 constanteTemporalMatching start step phylo = updatePhyloGroups 1
460 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
461 (toPhyloHorizon phylo)
462 where
463 -- 2) process the temporal matching by elevating seaLvl level
464 branches :: [[PhyloGroup]]
465 branches = map fst
466 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
467 (phyloProximity $ getConfig phylo)
468 (_qua_granularity $ phyloQuality $ getConfig phylo)
469 (_qua_minBranch $ phyloQuality $ getConfig phylo)
470 (phylo ^. phylo_termFreq)
471 start step
472 ((((1 - start) / step) - 1))
473 (((1 - start) / step))
474 (getTimeFrame $ timeUnit $ getConfig phylo)
475 (getPeriodIds phylo)
476 (phylo ^. phylo_timeDocs)
477 (phylo ^. phylo_timeCooc)
478 groups
479 -- 1) for each group process an initial temporal Matching
480 -- here we suppose that all the groups of level 1 are part of the same big branch
481 groups :: [([PhyloGroup],Bool)]
482 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
483 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
484 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
485 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
486 start
487 (phylo ^. phylo_timeDocs)
488 (phylo ^. phylo_timeCooc)
489 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
490
491 -----------------
492 -- | Horizon | --
493 -----------------
494
495 toPhyloHorizon :: Phylo -> Phylo
496 toPhyloHorizon phylo =
497 let t0 = take 1 (getPeriodIds phylo)
498 groups = getGroupsFromLevelPeriods 1 t0 phylo
499 sens = getSensibility (phyloProximity $ getConfig phylo)
500 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
501 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
502 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
503 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
504
505
506 --------------------------------------
507 -- | Adaptative Temporal Matching | --
508 --------------------------------------
509
510
511 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
512 thrToMeta thr branches =
513 map (\b ->
514 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
515
516 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
517 depthToMeta depth branches =
518 let break = length branches > 1
519 in map (\b ->
520 map (\g ->
521 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
522 else g) b) branches
523
524 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
525 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
526
527
528 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
529 getInTupleMap m k k'
530 | isJust (m !? ( k ,k')) = m ! ( k ,k')
531 | isJust (m !? ( k',k )) = m ! ( k',k )
532 | otherwise = 0
533
534
535 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
536 toThreshold lvl proxiGroups =
537 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
538 in if idx >= 0
539 then (sort $ elems proxiGroups) !! idx
540 else 1
541
542
543 -- done = all the allready broken branches
544 -- ego = the current branch we want to break
545 -- rest = the branches we still have to break
546 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
547 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
548 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
549 -> [([PhyloGroup],(Bool,[Double]))]
550 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods done ego rest =
551 -- 1) keep or not the new division of ego
552 let done' = done ++ (if (fst . snd) ego
553 then (if ((null (fst ego')) || (quality > quality'))
554 then
555 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
556 else
557 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
558 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
559 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
560 in
561 -- uncomment let .. in for debugging
562 -- let part1 = partition (snd) done'
563 -- part2 = partition (snd) rest
564 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
565 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
566 -- ) $
567 -- 2) if there is no more branches in rest then return else continue
568 if null rest
569 then done'
570 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
571 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
572 where
573 --------------------------------------
574 thr :: Double
575 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
576 --------------------------------------
577 quality :: Double
578 quality = toPhyloQuality fdt beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
579 --------------------------------------
580 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
581 ego' =
582 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
583 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
584 branches' = branches `using` parList rdeepseq
585 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
586 $ thrToMeta thr
587 $ depthToMeta (elevation - depth) branches'
588 --------------------------------------
589 quality' :: Double
590 quality' = toPhyloQuality fdt beta frequency
591 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
592
593
594 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
595 -> Double -> Int -> Map Int Double
596 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
597 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
598 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi beta minBranch frequency frame periods docs coocs branches =
599 -- if there is no branch to break or if seaLvl level >= depth then end
600 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
601 then branches
602 else
603 -- break all the possible branches at the current seaLvl level
604 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
605 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
606 frequency' = reduceFrequency frequency (map fst branches')
607 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
608 -- thr = toThreshold depth groupsProxi
609 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
610 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
611 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
612 <> " thr = ")
613 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' beta minBranch frequency' frame periods docs coocs branches'
614
615
616 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
617 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
618 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
619 (toPhyloHorizon phylo)
620 where
621 -- 2) process the temporal matching by elevating seaLvl level
622 branches :: [[PhyloGroup]]
623 branches = map fst
624 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
625 (phyloProximity $ getConfig phylo)
626 (elevation - 1)
627 elevation
628 (phylo ^. phylo_groupsProxi)
629 (_qua_granularity $ phyloQuality $ getConfig phylo)
630 (_qua_minBranch $ phyloQuality $ getConfig phylo)
631 (phylo ^. phylo_termFreq)
632 (getTimeFrame $ timeUnit $ getConfig phylo)
633 (getPeriodIds phylo)
634 (phylo ^. phylo_timeDocs)
635 (phylo ^. phylo_timeCooc)
636 groups
637 -- 1) for each group process an initial temporal Matching
638 -- here we suppose that all the groups of level 1 are part of the same big branch
639 groups :: [([PhyloGroup],(Bool,[Double]))]
640 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
641 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
642 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
643 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
644 thr
645 (phylo ^. phylo_timeDocs)
646 (phylo ^. phylo_timeCooc)
647 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
648 --------------------------------------
649 thr :: Double
650 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)