]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/TemporalMatching.hs
first commit
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
13
14 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
15 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
16
17 import Gargantext.Prelude
18 import Gargantext.Core.Viz.AdaptativePhylo
19 import Gargantext.Core.Viz.Phylo.PhyloTools
20
21 import Prelude (floor)
22 import Control.Lens hiding (Level)
23 import Control.Parallel.Strategies (parList, rdeepseq, using)
24 import Debug.Trace (trace)
25
26 import Text.Printf
27
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30
31
32 -------------------
33 -- | Proximity | --
34 -------------------
35
36
37 -- | To compute a jaccard similarity between two lists
38 jaccard :: [Int] -> [Int] -> Double
39 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
40
41
42 -- | Process the inverse sumLog
43 sumInvLog' :: Double -> Double -> [Double] -> Double
44 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + s) / log (nb + s)))) 0 diago
45
46
47 -- | Process the sumLog
48 sumLog' :: Double -> Double -> [Double] -> Double
49 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + s) / log (nb + s))) 0 diago
50
51
52 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
53 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
54 | null ngramsInter = 0
55 | ngramsInter == ngramsUnion = 1
56 | sens == 0 = jaccard ngramsInter ngramsUnion
57 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
58 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
59 where
60 --------------------------------------
61 ngramsInter :: [Int]
62 ngramsInter = intersect ngrams ngrams'
63 --------------------------------------
64 ngramsUnion :: [Int]
65 ngramsUnion = union ngrams ngrams'
66 --------------------------------------
67 diagoInter :: [Double]
68 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
69 --------------------------------------
70 diagoUnion :: [Double]
71 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
72 --------------------------------------
73
74
75 -- | To process the proximity between a current group and a pair of targets group
76 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
77 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
78 case proximity of
79 WeightedLogJaccard sens ->
80 let pairNgrams = if targetNgrams == targetNgrams'
81 then targetNgrams
82 else union targetNgrams targetNgrams'
83 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
84 Hamming -> undefined
85
86
87 ------------------------
88 -- | Local Matching | --
89 ------------------------
90
91 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
92 findLastPeriod fil periods = case fil of
93 ToParents -> head' "findLastPeriod" (sortOn fst periods)
94 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
95
96
97 -- | To filter pairs of candidates related to old pointers periods
98 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
99 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
100 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
101 removeOldPointers oldPointers fil thr prox prd pairs
102 | null oldPointers = pairs
103 | null (filterPointers prox thr oldPointers) =
104 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
105 in if lastMatchedPrd == prd
106 then []
107 else filter (\((id,_),(id',_)) ->
108 case fil of
109 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
110 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
111 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
112 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
113 | otherwise = []
114
115
116 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
117 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
118 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
119 if (null periods)
120 then []
121 else removeOldPointers oldPointers fil thr prox lastPrd
122 {- at least on of the pair candidates should be from the last added period -}
123 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
124 $ listToKeys
125 $ filter (\(id,ngrams) ->
126 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
127 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
128 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
129 ) candidates
130 where
131 lastPrd :: PhyloPeriodId
132 lastPrd = findLastPeriod fil periods
133
134
135 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
136 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
137
138 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
139 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
140
141
142 reduceDiagos :: Map Date Cooc -> Map Int Double
143 reduceDiagos diagos = mapKeys (\(k,_) -> k)
144 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
145
146 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
147 filterPointersByPeriod fil pts =
148 let pts' = sortOn (fst . fst . fst . fst) pts
149 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
150 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
151 in map fst
152 $ nubBy (\pt pt' -> snd pt == snd pt')
153 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
154 $ case fil of
155 ToParents -> reverse pts'
156 ToChilds -> pts'
157
158 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
159 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
160 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
161 if (null $ filterPointers proxi thr oldPointers)
162 {- let's find new pointers -}
163 then if null nextPointers
164 then []
165 else filterPointersByPeriod fil
166 $ head' "phyloGroupMatching"
167 -- Keep only the best set of pointers grouped by proximity
168 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
169 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
170 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
171 else oldPointers
172 where
173 nextPointers :: [[(Pointer,[Int])]]
174 nextPointers = take 1
175 $ dropWhile (null)
176 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
177 $ scanl (\acc groups ->
178 let periods = nub $ map (fst . fst . fst) $ concat groups
179 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
180 diago = reduceDiagos
181 $ filterDiago diagos ([(fst . fst) id] ++ periods)
182 {- important resize nbdocs et diago dans le make pairs -}
183 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
184 in acc ++ ( filterPointers' proxi thr
185 $ concat
186 $ map (\(c,c') ->
187 {- process the proximity between the current group and a pair of candidates -}
188 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
189 in if ((c == c') || (snd c == snd c'))
190 then [((fst c,proximity),snd c)]
191 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
192 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
193
194
195 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
196 filterDocs d pds = restrictKeys d $ periodsToYears pds
197
198 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
199 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
200
201
202 -----------------------------
203 -- | Matching Processing | --
204 -----------------------------
205
206
207 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
208 getNextPeriods fil max' pId pIds =
209 case fil of
210 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
211 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
212
213
214 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
215 getCandidates ego targets =
216 map (\groups' ->
217 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
218 ) groups') targets
219
220
221 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
222 matchGroupsToGroups frame periods proximity thr docs coocs groups =
223 let groups' = groupByField _phylo_groupPeriod groups
224 in foldl' (\acc prd ->
225 let -- 1) find the parents/childs matching periods
226 periodsPar = getNextPeriods ToParents frame prd periods
227 periodsChi = getNextPeriods ToChilds frame prd periods
228 -- 2) find the parents/childs matching candidates
229 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
230 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
231 -- 3) find the parents/child number of docs by years
232 docsPar = filterDocs docs ([prd] ++ periodsPar)
233 docsChi = filterDocs docs ([prd] ++ periodsChi)
234 -- 4) find the parents/child diago by years
235 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
236 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
237 -- 5) match in parallel all the groups (egos) to their possible candidates
238 egos = map (\ego ->
239 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
240 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
241 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
242 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
243 in addPointers ToChilds TemporalPointer pointersChi
244 $ addPointers ToParents TemporalPointer pointersPar ego)
245 $ findWithDefault [] prd groups'
246 egos' = egos `using` parList rdeepseq
247 in acc ++ egos'
248 ) [] periods
249
250
251 -----------------------
252 -- | Phylo Quality | --
253 -----------------------
254
255
256 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
257 relevantBranches term branches =
258 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
259
260 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
261 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
262 / (fromIntegral $ length bk'))
263 where
264 bk' :: [PhyloGroup]
265 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
266
267 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
268 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
269 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
270
271 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
272 fScore beta x periods bk bx =
273 let rec = recall x bk bx
274 acc = accuracy x periods bk
275 in ((1 + beta ** 2) * acc * rec)
276 / (((beta ** 2) * rec + acc))
277
278
279 wk :: [PhyloGroup] -> Double
280 wk bk = fromIntegral $ length bk
281
282
283 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
284 toPhyloQuality' beta freq branches =
285 if (null branches)
286 then 0
287 else sum
288 $ map (\i ->
289 let bks = relevantBranches i branches
290 periods = nub $ map _phylo_groupPeriod $ concat bks
291 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i periods bk bks)) bks))
292 $ keys freq
293
294 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
295 toRecall freq branches =
296 if (null branches)
297 then 0
298 else sum
299 $ map (\x ->
300 let px = freq ! x
301 bx = relevantBranches x branches
302 wks = sum $ map wk bx
303 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
304 $ keys freq
305 where
306 pys :: Double
307 pys = sum (elems freq)
308
309
310 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
311 toAccuracy freq branches =
312 if (null branches)
313 then 0
314 else sum
315 $ map (\x ->
316 let px = freq ! x
317 bx = relevantBranches x branches
318 -- | periods containing x
319 periods = nub $ map _phylo_groupPeriod $ concat bx
320 wks = sum $ map wk bx
321 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
322 $ keys freq
323 where
324 pys :: Double
325 pys = sum (elems freq)
326
327
328 -- | here we do the average of all the local f_scores
329 toPhyloQuality :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
330 toPhyloQuality beta freq branches =
331 if (null branches)
332 then 0
333 else sum
334 $ map (\x ->
335 let px = freq ! x
336 bx = relevantBranches x branches
337 -- | periods containing x
338 periods = nub $ map _phylo_groupPeriod $ concat bx
339 wks = sum $ map wk bx
340 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
341 $ keys freq
342 where
343 pys :: Double
344 pys = sum (elems freq)
345
346 -- 1 / nb de foundation
347
348 ------------------------------------
349 -- | Constant Temporal Matching | --
350 ------------------------------------
351
352
353 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
354 groupsToBranches' groups =
355 {- run the related component algorithm -}
356 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
357 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
358 $ map (\group -> [getGroupId group]
359 ++ (map fst $ group ^. phylo_groupPeriodParents)
360 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
361 -- first find the related components by inside each ego's period
362 -- a supprimer
363 graph' = map relatedComponents egos
364 -- then run it for the all the periods
365 graph = zip [1..]
366 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
367 -- update each group's branch id
368 in map (\(bId,ids) ->
369 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
370 $ elems $ restrictKeys groups (Set.fromList ids)
371 in groups' `using` parList rdeepseq ) graph
372
373
374 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
375 reduceFrequency frequency branches =
376 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
377
378 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
379 updateThr thr branches = map (\b -> map (\g ->
380 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
381
382
383 -- Sequentially break each branch of a phylo where
384 -- done = all the allready broken branches
385 -- ego = the current branch we want to break
386 -- rest = the branches we still have to break
387 breakBranches :: Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
388 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
389 breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
390 -- 1) keep or not the new division of ego
391 let done' = done ++ (if snd ego
392 then
393 (if ((null (fst ego')) || (quality > quality'))
394 then
395 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
396 -- <> " | " <> show(length $ fst ego) <> " groups : "
397 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
398 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
399 [(fst ego,False)]
400 else
401 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
402 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
403 -- <> " | " <> show(length $ fst ego) <> " groups : "
404 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
405 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
406 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
407 else [ego])
408 in
409 -- 2) if there is no more branches in rest then return else continue
410 if null rest
411 then done'
412 else breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
413 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
414 where
415 --------------------------------------
416 quality :: Double
417 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
418 --------------------------------------
419 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
420 ego' =
421 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
422 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
423 branches' = branches `using` parList rdeepseq
424 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
425 $ thrToMeta thr
426 $ depthToMeta (elevation - depth) branches'
427 --------------------------------------
428 quality' :: Double
429 quality' = toPhyloQuality beta frequency
430 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
431
432
433 seaLevelMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
434 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
435 seaLevelMatching proximity beta minBranch frequency thr step depth elevation frame periods docs coocs branches =
436 -- if there is no branch to break or if seaLvl level > 1 then end
437 if (thr >= 1) || ((not . or) $ map snd branches)
438 then branches
439 else
440 -- break all the possible branches at the current seaLvl level
441 let quality = toPhyloQuality beta frequency (map fst branches)
442 acc = toAccuracy frequency (map fst branches)
443 rec = toRecall frequency (map fst branches)
444 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(β) = " <> printf "%.5f" quality
445 <> " ξ = " <> printf "%.5f" acc
446 <> " ρ = " <> printf "%.5f" rec
447 <> " branches = " <> show(length branches) <> " ↴")
448 $ breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
449 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
450 frequency' = reduceFrequency frequency (map fst branches')
451 in seaLevelMatching proximity beta minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
452
453
454 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
455 constanteTemporalMatching start step phylo = updatePhyloGroups 1
456 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
457 (toPhyloHorizon phylo)
458 where
459 -- 2) process the temporal matching by elevating seaLvl level
460 branches :: [[PhyloGroup]]
461 branches = map fst
462 $ seaLevelMatching (phyloProximity $ getConfig phylo)
463 (_qua_granularity $ phyloQuality $ getConfig phylo)
464 (_qua_minBranch $ phyloQuality $ getConfig phylo)
465 (phylo ^. phylo_termFreq)
466 start step
467 ((((1 - start) / step) - 1))
468 (((1 - start) / step))
469 (getTimeFrame $ timeUnit $ getConfig phylo)
470 (getPeriodIds phylo)
471 (phylo ^. phylo_timeDocs)
472 (phylo ^. phylo_timeCooc)
473 groups
474 -- 1) for each group process an initial temporal Matching
475 -- here we suppose that all the groups of level 1 are part of the same big branch
476 groups :: [([PhyloGroup],Bool)]
477 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
478 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
479 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
480 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
481 start
482 (phylo ^. phylo_timeDocs)
483 (phylo ^. phylo_timeCooc)
484 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
485
486 -----------------
487 -- | Horizon | --
488 -----------------
489
490 toPhyloHorizon :: Phylo -> Phylo
491 toPhyloHorizon phylo =
492 let t0 = take 1 (getPeriodIds phylo)
493 groups = getGroupsFromLevelPeriods 1 t0 phylo
494 sens = getSensibility (phyloProximity $ getConfig phylo)
495 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
496 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
497 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
498 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
499
500
501 --------------------------------------
502 -- | Adaptative Temporal Matching | --
503 --------------------------------------
504
505
506 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
507 thrToMeta thr branches =
508 map (\b ->
509 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
510
511 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
512 depthToMeta depth branches =
513 let break = length branches > 1
514 in map (\b ->
515 map (\g ->
516 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
517 else g) b) branches
518
519 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
520 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
521
522
523 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
524 getInTupleMap m k k'
525 | isJust (m !? ( k ,k')) = m ! ( k ,k')
526 | isJust (m !? ( k',k )) = m ! ( k',k )
527 | otherwise = 0
528
529
530 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
531 toThreshold lvl proxiGroups =
532 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
533 in if idx >= 0
534 then (sort $ elems proxiGroups) !! idx
535 else 1
536
537
538 -- done = all the allready broken branches
539 -- ego = the current branch we want to break
540 -- rest = the branches we still have to break
541 adaptativeBreakBranches :: Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
542 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
543 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
544 -> [([PhyloGroup],(Bool,[Double]))]
545 adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods done ego rest =
546 -- 1) keep or not the new division of ego
547 let done' = done ++ (if (fst . snd) ego
548 then (if ((null (fst ego')) || (quality > quality'))
549 then
550 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
551 else
552 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
553 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
554 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
555 in
556 -- uncomment let .. in for debugging
557 -- let part1 = partition (snd) done'
558 -- part2 = partition (snd) rest
559 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
560 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
561 -- ) $
562 -- 2) if there is no more branches in rest then return else continue
563 if null rest
564 then done'
565 else adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
566 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
567 where
568 --------------------------------------
569 thr :: Double
570 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
571 --------------------------------------
572 quality :: Double
573 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
574 --------------------------------------
575 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
576 ego' =
577 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
578 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
579 branches' = branches `using` parList rdeepseq
580 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
581 $ thrToMeta thr
582 $ depthToMeta (elevation - depth) branches'
583 --------------------------------------
584 quality' :: Double
585 quality' = toPhyloQuality beta frequency
586 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
587
588
589 adaptativeSeaLevelMatching :: Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
590 -> Double -> Int -> Map Int Double
591 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
592 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
593 adaptativeSeaLevelMatching proxiConf depth elevation groupsProxi beta minBranch frequency frame periods docs coocs branches =
594 -- if there is no branch to break or if seaLvl level >= depth then end
595 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
596 then branches
597 else
598 -- break all the possible branches at the current seaLvl level
599 let branches' = adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
600 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
601 frequency' = reduceFrequency frequency (map fst branches')
602 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
603 -- thr = toThreshold depth groupsProxi
604 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
605 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
606 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
607 <> " thr = ")
608 $ adaptativeSeaLevelMatching proxiConf (depth - 1) elevation groupsProxi' beta minBranch frequency' frame periods docs coocs branches'
609
610
611 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
612 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
613 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
614 (toPhyloHorizon phylo)
615 where
616 -- 2) process the temporal matching by elevating seaLvl level
617 branches :: [[PhyloGroup]]
618 branches = map fst
619 $ adaptativeSeaLevelMatching (phyloProximity $ getConfig phylo)
620 (elevation - 1)
621 elevation
622 (phylo ^. phylo_groupsProxi)
623 (_qua_granularity $ phyloQuality $ getConfig phylo)
624 (_qua_minBranch $ phyloQuality $ getConfig phylo)
625 (phylo ^. phylo_termFreq)
626 (getTimeFrame $ timeUnit $ getConfig phylo)
627 (getPeriodIds phylo)
628 (phylo ^. phylo_timeDocs)
629 (phylo ^. phylo_timeCooc)
630 groups
631 -- 1) for each group process an initial temporal Matching
632 -- here we suppose that all the groups of level 1 are part of the same big branch
633 groups :: [([PhyloGroup],(Bool,[Double]))]
634 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
635 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
636 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
637 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
638 thr
639 (phylo ^. phylo_timeDocs)
640 (phylo ^. phylo_timeCooc)
641 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
642 --------------------------------------
643 thr :: Double
644 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)