]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
correct bugs when double pointers
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 import Prelude (floor)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 import Debug.Trace (trace)
29
30 import Text.Printf
31
32 import qualified Data.Map as Map
33 import qualified Data.Set as Set
34
35
36 -------------------
37 -- | Proximity | --
38 -------------------
39
40
41 -- | To compute a jaccard similarity between two lists
42 jaccard :: [Int] -> [Int] -> Double
43 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
44
45
46 -- | Process the inverse sumLog
47 sumInvLog' :: Double -> Double -> [Double] -> Double
48 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + s) / log (nb + s)))) 0 diago
49
50
51 -- | Process the sumLog
52 sumLog' :: Double -> Double -> [Double] -> Double
53 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + s) / log (nb + s))) 0 diago
54
55
56 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
57 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
58 | null ngramsInter = 0
59 | ngramsInter == ngramsUnion = 1
60 | sens == 0 = jaccard ngramsInter ngramsUnion
61 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
62 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
63 where
64 --------------------------------------
65 ngramsInter :: [Int]
66 ngramsInter = intersect ngrams ngrams'
67 --------------------------------------
68 ngramsUnion :: [Int]
69 ngramsUnion = union ngrams ngrams'
70 --------------------------------------
71 diagoInter :: [Double]
72 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
73 --------------------------------------
74 diagoUnion :: [Double]
75 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
76 --------------------------------------
77
78
79 -- | To process the proximity between a current group and a pair of targets group
80 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
81 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
82 case proximity of
83 WeightedLogJaccard sens ->
84 let pairNgrams = if targetNgrams == targetNgrams'
85 then targetNgrams
86 else union targetNgrams targetNgrams'
87 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
88 Hamming -> undefined
89
90
91 ------------------------
92 -- | Local Matching | --
93 ------------------------
94
95 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
96 findLastPeriod fil periods = case fil of
97 ToParents -> head' "findLastPeriod" (sortOn fst periods)
98 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
99
100
101 -- | To filter pairs of candidates related to old pointers periods
102 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
103 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
104 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
105 removeOldPointers oldPointers fil thr prox prd pairs
106 | null oldPointers = pairs
107 | null (filterPointers prox thr oldPointers) =
108 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
109 in if lastMatchedPrd == prd
110 then []
111 else filter (\((id,_),(id',_)) ->
112 case fil of
113 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
114 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
115 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
116 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
117 | otherwise = []
118
119
120 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
121 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
122 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
123 if (null periods)
124 then []
125 else removeOldPointers oldPointers fil thr prox lastPrd
126 -- | at least on of the pair candidates should be from the last added period
127 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
128 $ listToKeys
129 $ filter (\(id,ngrams) ->
130 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
131 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
132 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
133 ) candidates
134 where
135 lastPrd :: PhyloPeriodId
136 lastPrd = findLastPeriod fil periods
137
138
139 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
140 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
141
142 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
143 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
144
145
146 reduceDiagos :: Map Date Cooc -> Map Int Double
147 reduceDiagos diagos = mapKeys (\(k,_) -> k)
148 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
149
150 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
151 filterPointersByPeriod fil pts =
152 let pts' = sortOn (fst . fst . fst . fst) pts
153 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
154 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
155 in map fst
156 $ nubBy (\pt pt' -> snd pt == snd pt')
157 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
158 $ case fil of
159 ToParents -> reverse pts'
160 ToChilds -> pts'
161
162 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
163 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
164 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
165 if (null $ filterPointers proxi thr oldPointers)
166 -- | let's find new pointers
167 then if null nextPointers
168 then []
169 else filterPointersByPeriod fil
170 $ head' "phyloGroupMatching"
171 -- | Keep only the best set of pointers grouped by proximity
172 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
173 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
174 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
175 else oldPointers
176 where
177 nextPointers :: [[(Pointer,[Int])]]
178 nextPointers = take 1
179 $ dropWhile (null)
180 -- | for each time frame, process the proximity on relevant pairs of targeted groups
181 $ scanl (\acc groups ->
182 let periods = nub $ map (fst . fst . fst) $ concat groups
183 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
184 diago = reduceDiagos
185 $ filterDiago diagos ([(fst . fst) id] ++ periods)
186 -- | important resize nbdocs et diago dans le make pairs
187 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
188 in acc ++ ( filterPointers' proxi thr
189 $ concat
190 $ map (\(c,c') ->
191 -- | process the proximity between the current group and a pair of candidates
192 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
193 in if ((c == c') || (snd c == snd c'))
194 then [((fst c,proximity),snd c)]
195 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
196 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
197
198
199 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
200 filterDocs d pds = restrictKeys d $ periodsToYears pds
201
202 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
203 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
204
205
206 -----------------------------
207 -- | Matching Processing | --
208 -----------------------------
209
210
211 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
212 getNextPeriods fil max' pId pIds =
213 case fil of
214 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
215 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
216
217
218 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
219 getCandidates ego targets =
220 map (\groups' ->
221 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
222 ) groups') targets
223
224
225 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
226 matchGroupsToGroups frame periods proximity thr docs coocs groups =
227 let groups' = groupByField _phylo_groupPeriod groups
228 in foldl' (\acc prd ->
229 let -- | 1) find the parents/childs matching periods
230 periodsPar = getNextPeriods ToParents frame prd periods
231 periodsChi = getNextPeriods ToChilds frame prd periods
232 -- | 2) find the parents/childs matching candidates
233 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
234 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
235 -- | 3) find the parents/child number of docs by years
236 docsPar = filterDocs docs ([prd] ++ periodsPar)
237 docsChi = filterDocs docs ([prd] ++ periodsChi)
238 -- | 4) find the parents/child diago by years
239 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
240 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
241 -- | 5) match in parallel all the groups (egos) to their possible candidates
242 egos = map (\ego ->
243 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
244 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
245 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
246 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
247 in addPointers ToChilds TemporalPointer pointersChi
248 $ addPointers ToParents TemporalPointer pointersPar ego)
249 $ findWithDefault [] prd groups'
250 egos' = egos `using` parList rdeepseq
251 in acc ++ egos'
252 ) [] periods
253
254
255 -----------------------
256 -- | Phylo Quality | --
257 -----------------------
258
259
260 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
261 relevantBranches term branches =
262 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
263
264 accuracy :: Int -> [PhyloGroup] -> Double
265 accuracy x bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
266 / (fromIntegral $ length bk))
267
268 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
269 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
270 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
271
272 fScore :: Double -> Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
273 fScore beta x bk bx =
274 let rec = recall x bk bx
275 acc = accuracy x bk
276 in ((1 + beta ** 2) * acc * rec)
277 / (((beta ** 2) * rec + acc))
278
279
280 wk :: [PhyloGroup] -> Double
281 wk bk = fromIntegral $ length bk
282
283
284 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
285 toPhyloQuality' beta freq branches =
286 if (null branches)
287 then 0
288 else sum
289 $ map (\i ->
290 let bks = relevantBranches i branches
291 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i bk bks)) bks))
292 $ keys freq
293
294 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
295 toRecall freq branches =
296 if (null branches)
297 then 0
298 else sum
299 $ map (\x ->
300 let px = freq ! x
301 bx = relevantBranches x branches
302 wks = sum $ map wk bx
303 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
304 $ keys freq
305 where
306 pys :: Double
307 pys = sum (elems freq)
308
309
310 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
311 toAccuracy freq branches =
312 if (null branches)
313 then 0
314 else sum
315 $ map (\x ->
316 let px = freq ! x
317 bx = relevantBranches x branches
318 wks = sum $ map wk bx
319 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x bk)) bx))
320 $ keys freq
321 where
322 pys :: Double
323 pys = sum (elems freq)
324
325
326 -- | here we do the average of all the local f_scores
327 toPhyloQuality :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
328 toPhyloQuality beta freq branches =
329 if (null branches)
330 then 0
331 else sum
332 $ map (\x ->
333 let px = freq ! x
334 bx = relevantBranches x branches
335 wks = sum $ map wk bx
336 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
337 $ keys freq
338 where
339 pys :: Double
340 pys = sum (elems freq)
341
342
343 ------------------------------------
344 -- | Constant Temporal Matching | --
345 ------------------------------------
346
347
348 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
349 groupsToBranches groups =
350 -- | run the related component algorithm
351 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
352 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
353 $ map (\group -> [getGroupId group]
354 ++ (map fst $ group ^. phylo_groupPeriodParents)
355 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
356 -- | first find the related components by inside each ego's period
357 -- | a supprimer
358 graph' = map relatedComponents egos
359 -- | then run it for the all the periods
360 graph = zip [1..]
361 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
362 -- | update each group's branch id
363 in map (\(bId,ids) ->
364 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
365 $ elems $ restrictKeys groups (Set.fromList ids)
366 in groups' `using` parList rdeepseq ) graph
367
368
369 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
370 reduceFrequency frequency branches =
371 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
372
373 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
374 updateThr thr branches = map (\b -> map (\g ->
375 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
376
377
378 -- | Sequentially break each branch of a phylo where
379 -- done = all the allready broken branches
380 -- ego = the current branch we want to break
381 -- rest = the branches we still have to break
382 breakBranches :: Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
383 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
384 breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
385 -- | 1) keep or not the new division of ego
386 let done' = done ++ (if snd ego
387 then
388 (if ((null (fst ego')) || (quality > quality'))
389 then
390 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
391 -- <> " | " <> show(length $ fst ego) <> " groups : "
392 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
393 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
394 [(fst ego,False)]
395 else
396 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
397 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
398 -- <> " | " <> show(length $ fst ego) <> " groups : "
399 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
400 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
401 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
402 else [ego])
403 in
404 -- | 2) if there is no more branches in rest then return else continue
405 if null rest
406 then done'
407 else breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
408 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
409 where
410 --------------------------------------
411 quality :: Double
412 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
413 --------------------------------------
414 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
415 ego' =
416 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
417 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
418 branches' = branches `using` parList rdeepseq
419 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
420 $ thrToMeta thr
421 $ depthToMeta (elevation - depth) branches'
422 --------------------------------------
423 quality' :: Double
424 quality' = toPhyloQuality beta frequency
425 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
426
427
428 seaLevelMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
429 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
430 seaLevelMatching proximity beta minBranch frequency thr step depth elevation frame periods docs coocs branches =
431 -- | if there is no branch to break or if seaLvl level > 1 then end
432 if (thr >= 1) || ((not . or) $ map snd branches)
433 then branches
434 else
435 -- | break all the possible branches at the current seaLvl level
436 let quality = toPhyloQuality beta frequency (map fst branches)
437 acc = toAccuracy frequency (map fst branches)
438 rec = toRecall frequency (map fst branches)
439 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(β) = " <> printf "%.5f" quality
440 <> " ξ = " <> printf "%.5f" acc
441 <> " ρ = " <> printf "%.5f" rec
442 <> " branches = " <> show(length branches) <> " ↴")
443 $ breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
444 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
445 frequency' = reduceFrequency frequency (map fst branches')
446 in seaLevelMatching proximity beta minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
447
448
449 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
450 constanteTemporalMatching start step phylo = updatePhyloGroups 1
451 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
452 (toPhyloHorizon phylo)
453 where
454 -- | 2) process the temporal matching by elevating seaLvl level
455 branches :: [[PhyloGroup]]
456 branches = map fst
457 $ seaLevelMatching (phyloProximity $ getConfig phylo)
458 (_qua_granularity $ phyloQuality $ getConfig phylo)
459 (_qua_minBranch $ phyloQuality $ getConfig phylo)
460 (phylo ^. phylo_termFreq)
461 start step
462 ((((1 - start) / step) - 1))
463 (((1 - start) / step))
464 (getTimeFrame $ timeUnit $ getConfig phylo)
465 (getPeriodIds phylo)
466 (phylo ^. phylo_timeDocs)
467 (phylo ^. phylo_timeCooc)
468 groups
469 -- | 1) for each group process an initial temporal Matching
470 -- | here we suppose that all the groups of level 1 are part of the same big branch
471 groups :: [([PhyloGroup],Bool)]
472 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
473 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
474 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
475 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
476 start
477 (phylo ^. phylo_timeDocs)
478 (phylo ^. phylo_timeCooc)
479 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
480
481 -----------------
482 -- | Horizon | --
483 -----------------
484
485 toPhyloHorizon :: Phylo -> Phylo
486 toPhyloHorizon phylo =
487 let t0 = take 1 (getPeriodIds phylo)
488 groups = getGroupsFromLevelPeriods 1 t0 phylo
489 sens = getSensibility (phyloProximity $ getConfig phylo)
490 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
491 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
492 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
493 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
494
495
496 --------------------------------------
497 -- | Adaptative Temporal Matching | --
498 --------------------------------------
499
500
501 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
502 thrToMeta thr branches =
503 map (\b ->
504 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
505
506 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
507 depthToMeta depth branches =
508 let break = length branches > 1
509 in map (\b ->
510 map (\g ->
511 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
512 else g) b) branches
513
514 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
515 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
516
517
518 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
519 getInTupleMap m k k'
520 | isJust (m !? ( k ,k')) = m ! ( k ,k')
521 | isJust (m !? ( k',k )) = m ! ( k',k )
522 | otherwise = 0
523
524
525 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
526 toThreshold lvl proxiGroups =
527 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
528 in if idx >= 0
529 then (sort $ elems proxiGroups) !! idx
530 else 1
531
532
533 -- done = all the allready broken branches
534 -- ego = the current branch we want to break
535 -- rest = the branches we still have to break
536 adaptativeBreakBranches :: Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
537 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
538 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
539 -> [([PhyloGroup],(Bool,[Double]))]
540 adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods done ego rest =
541 -- | 1) keep or not the new division of ego
542 let done' = done ++ (if (fst . snd) ego
543 then (if ((null (fst ego')) || (quality > quality'))
544 then
545 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
546 else
547 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
548 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
549 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
550 in
551 -- | uncomment let .. in for debugging
552 -- let part1 = partition (snd) done'
553 -- part2 = partition (snd) rest
554 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
555 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
556 -- ) $
557 -- | 2) if there is no more branches in rest then return else continue
558 if null rest
559 then done'
560 else adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
561 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
562 where
563 --------------------------------------
564 thr :: Double
565 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
566 --------------------------------------
567 quality :: Double
568 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
569 --------------------------------------
570 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
571 ego' =
572 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
573 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
574 branches' = branches `using` parList rdeepseq
575 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
576 $ thrToMeta thr
577 $ depthToMeta (elevation - depth) branches'
578 --------------------------------------
579 quality' :: Double
580 quality' = toPhyloQuality beta frequency
581 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
582
583
584 adaptativeSeaLevelMatching :: Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
585 -> Double -> Int -> Map Int Double
586 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
587 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
588 adaptativeSeaLevelMatching proxiConf depth elevation groupsProxi beta minBranch frequency frame periods docs coocs branches =
589 -- | if there is no branch to break or if seaLvl level >= depth then end
590 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
591 then branches
592 else
593 -- | break all the possible branches at the current seaLvl level
594 let branches' = adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
595 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
596 frequency' = reduceFrequency frequency (map fst branches')
597 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
598 -- thr = toThreshold depth groupsProxi
599 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
600 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
601 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
602 <> " thr = ")
603 $ adaptativeSeaLevelMatching proxiConf (depth - 1) elevation groupsProxi' beta minBranch frequency' frame periods docs coocs branches'
604
605
606 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
607 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
608 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
609 (toPhyloHorizon phylo)
610 where
611 -- | 2) process the temporal matching by elevating seaLvl level
612 branches :: [[PhyloGroup]]
613 branches = map fst
614 $ adaptativeSeaLevelMatching (phyloProximity $ getConfig phylo)
615 (elevation - 1)
616 elevation
617 (phylo ^. phylo_groupsProxi)
618 (_qua_granularity $ phyloQuality $ getConfig phylo)
619 (_qua_minBranch $ phyloQuality $ getConfig phylo)
620 (phylo ^. phylo_termFreq)
621 (getTimeFrame $ timeUnit $ getConfig phylo)
622 (getPeriodIds phylo)
623 (phylo ^. phylo_timeDocs)
624 (phylo ^. phylo_timeCooc)
625 groups
626 -- | 1) for each group process an initial temporal Matching
627 -- | here we suppose that all the groups of level 1 are part of the same big branch
628 groups :: [([PhyloGroup],(Bool,[Double]))]
629 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
630 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
631 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
632 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
633 thr
634 (phylo ^. phylo_timeDocs)
635 (phylo ^. phylo_timeCooc)
636 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
637 --------------------------------------
638 thr :: Double
639 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)