]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
changement du seuil
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, or, sort, (!!))
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 import Prelude (floor)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 import Debug.Trace (trace)
29
30 import Text.Printf
31
32 import qualified Data.Map as Map
33 import qualified Data.Set as Set
34
35
36 -------------------
37 -- | Proximity | --
38 -------------------
39
40
41 -- | To compute a jaccard similarity between two lists
42 jaccard :: [Int] -> [Int] -> Double
43 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
44
45
46 -- | Process the inverse sumLog
47 sumInvLog' :: Double -> Double -> [Double] -> Double
48 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + s) / log (nb + s)))) 0 diago
49
50
51 -- | Process the sumLog
52 sumLog' :: Double -> Double -> [Double] -> Double
53 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + s) / log (nb + s))) 0 diago
54
55
56 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
57 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
58 | null ngramsInter = 0
59 | ngramsInter == ngramsUnion = 1
60 | sens == 0 = jaccard ngramsInter ngramsUnion
61 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
62 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
63 where
64 --------------------------------------
65 ngramsInter :: [Int]
66 ngramsInter = intersect ngrams ngrams'
67 --------------------------------------
68 ngramsUnion :: [Int]
69 ngramsUnion = union ngrams ngrams'
70 --------------------------------------
71 diagoInter :: [Double]
72 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
73 --------------------------------------
74 diagoUnion :: [Double]
75 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
76 --------------------------------------
77
78
79 -- | To process the proximity between a current group and a pair of targets group
80 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
81 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
82 case proximity of
83 WeightedLogJaccard sens ->
84 let pairNgrams = if targetNgrams == targetNgrams'
85 then targetNgrams
86 else union targetNgrams targetNgrams'
87 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
88 Hamming -> undefined
89
90
91 ------------------------
92 -- | Local Matching | --
93 ------------------------
94
95 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
96 findLastPeriod fil periods = case fil of
97 ToParents -> head' "findLastPeriod" (sortOn fst periods)
98 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
99
100
101 -- | To filter pairs of candidates related to old pointers periods
102 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
103 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
104 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
105 removeOldPointers oldPointers fil thr prox prd pairs
106 | null oldPointers = pairs
107 | null (filterPointers prox thr oldPointers) =
108 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
109 in if lastMatchedPrd == prd
110 then []
111 else filter (\((id,_),(id',_)) ->
112 case fil of
113 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
114 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
115 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
116 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
117 | otherwise = []
118
119
120 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
121 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
122 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
123 if (null periods)
124 then []
125 else removeOldPointers oldPointers fil thr prox lastPrd
126 -- | at least on of the pair candidates should be from the last added period
127 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
128 $ listToKeys
129 $ filter (\(id,ngrams) ->
130 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
131 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
132 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
133 ) candidates
134 where
135 lastPrd :: PhyloPeriodId
136 lastPrd = findLastPeriod fil periods
137
138
139 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
140 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
141
142
143 reduceDiagos :: Map Date Cooc -> Map Int Double
144 reduceDiagos diagos = mapKeys (\(k,_) -> k)
145 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
146
147 filterPointersByPeriod :: [Pointer] -> [Pointer]
148 filterPointersByPeriod pts =
149 let pts' = sortOn (fst . fst . fst) pts
150 inf = (fst . fst . fst) $ head' "filterPointersByPeriod" pts'
151 sup = (fst . fst . fst) $ last' "filterPointersByPeriod" pts'
152 in nub
153 $ filter (\pt -> ((fst . fst . fst) pt == inf) || ((fst . fst . fst) pt == sup)) pts'
154
155 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
156 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
157 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
158 if (null $ filterPointers proxi thr oldPointers)
159 -- | let's find new pointers
160 then if null nextPointers
161 then []
162 else filterPointersByPeriod
163 $ head' "phyloGroupMatching"
164 -- | Keep only the best set of pointers grouped by proximity
165 $ groupBy (\pt pt' -> snd pt == snd pt')
166 $ reverse $ sortOn snd $ head' "pointers" nextPointers
167 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
168 else oldPointers
169 where
170 nextPointers :: [[Pointer]]
171 nextPointers = take 1
172 $ dropWhile (null)
173 -- | for each time frame, process the proximity on relevant pairs of targeted groups
174 $ scanl (\acc groups ->
175 let periods = nub $ map (fst . fst . fst) $ concat groups
176 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
177 diago = reduceDiagos
178 $ filterDiago diagos ([(fst . fst) id] ++ periods)
179 -- | important resize nbdocs et diago dans le make pairs
180 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
181 in acc ++ ( filterPointers proxi thr
182 $ concat
183 $ map (\(c,c') ->
184 -- | process the proximity between the current group and a pair of candidates
185 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
186 in if (c == c')
187 then [(fst c,proximity)]
188 else [(fst c,proximity),(fst c',proximity)] ) pairs )) []
189 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
190
191
192 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
193 filterDocs d pds = restrictKeys d $ periodsToYears pds
194
195 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
196 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
197
198
199 -----------------------------
200 -- | Matching Processing | --
201 -----------------------------
202
203
204 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
205 getNextPeriods fil max' pId pIds =
206 case fil of
207 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
208 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
209
210
211 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
212 getCandidates ego targets =
213 map (\groups' ->
214 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
215 ) groups') targets
216
217
218 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
219 matchGroupsToGroups frame periods proximity thr docs coocs groups =
220 let groups' = groupByField _phylo_groupPeriod groups
221 in foldl' (\acc prd ->
222 let -- | 1) find the parents/childs matching periods
223 periodsPar = getNextPeriods ToParents frame prd periods
224 periodsChi = getNextPeriods ToChilds frame prd periods
225 -- | 2) find the parents/childs matching candidates
226 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
227 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
228 -- | 3) find the parents/child number of docs by years
229 docsPar = filterDocs docs ([prd] ++ periodsPar)
230 docsChi = filterDocs docs ([prd] ++ periodsChi)
231 -- | 4) find the parents/child diago by years
232 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
233 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
234 -- | 5) match in parallel all the groups (egos) to their possible candidates
235 egos = map (\ego ->
236 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
237 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
238 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
239 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
240 in addPointers ToChilds TemporalPointer pointersChi
241 $ addPointers ToParents TemporalPointer pointersPar ego)
242 $ findWithDefault [] prd groups'
243 egos' = egos `using` parList rdeepseq
244 in acc ++ egos'
245 ) [] periods
246
247
248 -----------------------
249 -- | Phylo Quality | --
250 -----------------------
251
252
253 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
254 relevantBranches term branches =
255 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
256
257 accuracy :: Int -> [PhyloGroup] -> Double
258 accuracy x bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
259 / (fromIntegral $ length bk))
260
261 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
262 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
263 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
264
265 fScore :: Double -> Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
266 fScore beta x bk bx =
267 let rec = recall x bk bx
268 acc = accuracy x bk
269 in ((1 + beta ** 2) * acc * rec)
270 / (((beta ** 2) * rec + acc))
271
272
273 wk :: [PhyloGroup] -> Double
274 wk bk = fromIntegral $ length bk
275
276
277 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
278 toPhyloQuality' beta freq branches =
279 if (null branches)
280 then 0
281 else sum
282 $ map (\i ->
283 let bks = relevantBranches i branches
284 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i bk bks)) bks))
285 $ keys freq
286
287 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
288 toRecall freq branches =
289 if (null branches)
290 then 0
291 else sum
292 $ map (\x ->
293 let px = freq ! x
294 bx = relevantBranches x branches
295 wks = sum $ map wk bx
296 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
297 $ keys freq
298 where
299 pys :: Double
300 pys = sum (elems freq)
301
302
303 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
304 toAccuracy freq branches =
305 if (null branches)
306 then 0
307 else sum
308 $ map (\x ->
309 let px = freq ! x
310 bx = relevantBranches x branches
311 wks = sum $ map wk bx
312 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x bk)) bx))
313 $ keys freq
314 where
315 pys :: Double
316 pys = sum (elems freq)
317
318
319 -- | here we do the average of all the local f_scores
320 toPhyloQuality :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
321 toPhyloQuality beta freq branches =
322 if (null branches)
323 then 0
324 else sum
325 $ map (\x ->
326 let px = freq ! x
327 bx = relevantBranches x branches
328 wks = sum $ map wk bx
329 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
330 $ keys freq
331 where
332 pys :: Double
333 pys = sum (elems freq)
334
335
336 ------------------------------------
337 -- | Constant Temporal Matching | --
338 ------------------------------------
339
340
341 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
342 groupsToBranches groups =
343 -- | run the related component algorithm
344 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
345 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
346 $ map (\group -> [getGroupId group]
347 ++ (map fst $ group ^. phylo_groupPeriodParents)
348 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
349 -- | first find the related components by inside each ego's period
350 -- | a supprimer
351 graph' = map relatedComponents egos
352 -- | then run it for the all the periods
353 graph = zip [1..]
354 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
355 -- | update each group's branch id
356 in map (\(bId,ids) ->
357 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
358 $ elems $ restrictKeys groups (Set.fromList ids)
359 in groups' `using` parList rdeepseq ) graph
360
361
362 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
363 reduceFrequency frequency branches =
364 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
365
366 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
367 updateThr thr branches = map (\b -> map (\g ->
368 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
369
370
371 -- | Sequentially break each branch of a phylo where
372 -- done = all the allready broken branches
373 -- ego = the current branch we want to break
374 -- rest = the branches we still have to break
375 breakBranches :: Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
376 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
377 breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
378 -- | 1) keep or not the new division of ego
379 let done' = done ++ (if snd ego
380 then
381 (if ((null (fst ego')) || (quality > quality'))
382 then
383 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
384 -- <> " | " <> show(length $ fst ego) <> " groups : "
385 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
386 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
387 [(fst ego,False)]
388 else
389 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
390 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
391 -- <> " | " <> show(length $ fst ego) <> " groups : "
392 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
393 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
394 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
395 else [ego])
396 in
397 -- | 2) if there is no more branches in rest then return else continue
398 if null rest
399 then done'
400 else breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
401 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
402 where
403 --------------------------------------
404 quality :: Double
405 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
406 --------------------------------------
407 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
408 ego' =
409 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
410 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
411 branches' = branches `using` parList rdeepseq
412 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
413 $ thrToMeta thr
414 $ depthToMeta (elevation - depth) branches'
415 --------------------------------------
416 quality' :: Double
417 quality' = toPhyloQuality beta frequency
418 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
419
420
421 seaLevelMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
422 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
423 seaLevelMatching proximity beta minBranch frequency thr step depth elevation frame periods docs coocs branches =
424 -- | if there is no branch to break or if seaLvl level > 1 then end
425 if (thr >= 1) || ((not . or) $ map snd branches)
426 then branches
427 else
428 -- | break all the possible branches at the current seaLvl level
429 let quality = toPhyloQuality beta frequency (map fst branches)
430 acc = toAccuracy frequency (map fst branches)
431 rec = toRecall frequency (map fst branches)
432 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(β) = " <> printf "%.5f" quality
433 <> " ξ = " <> printf "%.5f" acc
434 <> " ρ = " <> printf "%.5f" rec
435 <> " branches = " <> show(length branches) <> " ↴")
436 $ breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
437 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
438 frequency' = reduceFrequency frequency (map fst branches')
439 in seaLevelMatching proximity beta minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
440
441
442 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
443 constanteTemporalMatching start step phylo = updatePhyloGroups 1
444 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
445 (toPhyloHorizon phylo)
446 where
447 -- | 2) process the temporal matching by elevating seaLvl level
448 branches :: [[PhyloGroup]]
449 branches = map fst
450 $ seaLevelMatching (phyloProximity $ getConfig phylo)
451 (_qua_granularity $ phyloQuality $ getConfig phylo)
452 (_qua_minBranch $ phyloQuality $ getConfig phylo)
453 (phylo ^. phylo_termFreq)
454 start step
455 ((((1 - start) / step) - 1))
456 (((1 - start) / step))
457 (getTimeFrame $ timeUnit $ getConfig phylo)
458 (getPeriodIds phylo)
459 (phylo ^. phylo_timeDocs)
460 (phylo ^. phylo_timeCooc)
461 groups
462 -- | 1) for each group process an initial temporal Matching
463 -- | here we suppose that all the groups of level 1 are part of the same big branch
464 groups :: [([PhyloGroup],Bool)]
465 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
466 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
467 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
468 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
469 start
470 (phylo ^. phylo_timeDocs)
471 (phylo ^. phylo_timeCooc)
472 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
473
474 -----------------
475 -- | Horizon | --
476 -----------------
477
478 toPhyloHorizon :: Phylo -> Phylo
479 toPhyloHorizon phylo =
480 let t0 = take 1 (getPeriodIds phylo)
481 groups = getGroupsFromLevelPeriods 1 t0 phylo
482 sens = getSensibility (phyloProximity $ getConfig phylo)
483 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
484 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
485 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
486 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
487
488
489 --------------------------------------
490 -- | Adaptative Temporal Matching | --
491 --------------------------------------
492
493
494 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
495 thrToMeta thr branches =
496 map (\b ->
497 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
498
499 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
500 depthToMeta depth branches =
501 let break = length branches > 1
502 in map (\b ->
503 map (\g ->
504 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
505 else g) b) branches
506
507 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
508 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
509
510
511 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
512 getInTupleMap m k k'
513 | isJust (m !? ( k ,k')) = m ! ( k ,k')
514 | isJust (m !? ( k',k )) = m ! ( k',k )
515 | otherwise = 0
516
517
518 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
519 toThreshold lvl proxiGroups =
520 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
521 in if idx >= 0
522 then (sort $ elems proxiGroups) !! idx
523 else 1
524
525
526 -- done = all the allready broken branches
527 -- ego = the current branch we want to break
528 -- rest = the branches we still have to break
529 adaptativeBreakBranches :: Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
530 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
531 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
532 -> [([PhyloGroup],(Bool,[Double]))]
533 adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods done ego rest =
534 -- | 1) keep or not the new division of ego
535 let done' = done ++ (if (fst . snd) ego
536 then (if ((null (fst ego')) || (quality > quality'))
537 then
538 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
539 else
540 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
541 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
542 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
543 in
544 -- | uncomment let .. in for debugging
545 -- let part1 = partition (snd) done'
546 -- part2 = partition (snd) rest
547 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
548 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
549 -- ) $
550 -- | 2) if there is no more branches in rest then return else continue
551 if null rest
552 then done'
553 else adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
554 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
555 where
556 --------------------------------------
557 thr :: Double
558 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
559 --------------------------------------
560 quality :: Double
561 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
562 --------------------------------------
563 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
564 ego' =
565 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
566 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
567 branches' = branches `using` parList rdeepseq
568 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
569 $ thrToMeta thr
570 $ depthToMeta (elevation - depth) branches'
571 --------------------------------------
572 quality' :: Double
573 quality' = toPhyloQuality beta frequency
574 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
575
576
577 adaptativeSeaLevelMatching :: Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
578 -> Double -> Int -> Map Int Double
579 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
580 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
581 adaptativeSeaLevelMatching proxiConf depth elevation groupsProxi beta minBranch frequency frame periods docs coocs branches =
582 -- | if there is no branch to break or if seaLvl level >= depth then end
583 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
584 then branches
585 else
586 -- | break all the possible branches at the current seaLvl level
587 let branches' = adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
588 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
589 frequency' = reduceFrequency frequency (map fst branches')
590 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
591 -- thr = toThreshold depth groupsProxi
592 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
593 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
594 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
595 <> " thr = ")
596 $ adaptativeSeaLevelMatching proxiConf (depth - 1) elevation groupsProxi' beta minBranch frequency' frame periods docs coocs branches'
597
598
599 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
600 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
601 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
602 (toPhyloHorizon phylo)
603 where
604 -- | 2) process the temporal matching by elevating seaLvl level
605 branches :: [[PhyloGroup]]
606 branches = map fst
607 $ adaptativeSeaLevelMatching (phyloProximity $ getConfig phylo)
608 (elevation - 1)
609 elevation
610 (phylo ^. phylo_groupsProxi)
611 (_qua_granularity $ phyloQuality $ getConfig phylo)
612 (_qua_minBranch $ phyloQuality $ getConfig phylo)
613 (phylo ^. phylo_termFreq)
614 (getTimeFrame $ timeUnit $ getConfig phylo)
615 (getPeriodIds phylo)
616 (phylo ^. phylo_timeDocs)
617 (phylo ^. phylo_timeCooc)
618 groups
619 -- | 1) for each group process an initial temporal Matching
620 -- | here we suppose that all the groups of level 1 are part of the same big branch
621 groups :: [([PhyloGroup],(Bool,[Double]))]
622 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
623 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
624 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
625 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
626 thr
627 (phylo ^. phylo_timeDocs)
628 (phylo ^. phylo_timeCooc)
629 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
630 --------------------------------------
631 thr :: Double
632 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)