]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
Merge branch 'dev-public' into dev
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Viz.Phylo.TemporalMatching where
13
14 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
15 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
16
17 import Gargantext.Prelude
18 import Gargantext.Viz.AdaptativePhylo
19 import Gargantext.Viz.Phylo.PhyloTools
20
21 import Prelude (floor)
22 import Control.Lens hiding (Level)
23 import Control.Parallel.Strategies (parList, rdeepseq, using)
24 import Debug.Trace (trace)
25
26 import Text.Printf
27
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30
31
32 -------------------
33 -- | Proximity | --
34 -------------------
35
36
37 -- | To compute a jaccard similarity between two lists
38 jaccard :: [Int] -> [Int] -> Double
39 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
40
41
42 -- | Process the inverse sumLog
43 sumInvLog' :: Double -> Double -> [Double] -> Double
44 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + s) / log (nb + s)))) 0 diago
45
46
47 -- | Process the sumLog
48 sumLog' :: Double -> Double -> [Double] -> Double
49 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + s) / log (nb + s))) 0 diago
50
51
52 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
53 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
54 | null ngramsInter = 0
55 | ngramsInter == ngramsUnion = 1
56 | sens == 0 = jaccard ngramsInter ngramsUnion
57 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
58 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
59 where
60 --------------------------------------
61 ngramsInter :: [Int]
62 ngramsInter = intersect ngrams ngrams'
63 --------------------------------------
64 ngramsUnion :: [Int]
65 ngramsUnion = union ngrams ngrams'
66 --------------------------------------
67 diagoInter :: [Double]
68 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
69 --------------------------------------
70 diagoUnion :: [Double]
71 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
72 --------------------------------------
73
74
75 -- | To process the proximity between a current group and a pair of targets group
76 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
77 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
78 case proximity of
79 WeightedLogJaccard sens ->
80 let pairNgrams = if targetNgrams == targetNgrams'
81 then targetNgrams
82 else union targetNgrams targetNgrams'
83 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
84 Hamming -> undefined
85
86
87 ------------------------
88 -- | Local Matching | --
89 ------------------------
90
91 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
92 findLastPeriod fil periods = case fil of
93 ToParents -> head' "findLastPeriod" (sortOn fst periods)
94 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
95
96
97 -- | To filter pairs of candidates related to old pointers periods
98 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
99 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
100 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
101 removeOldPointers oldPointers fil thr prox prd pairs
102 | null oldPointers = pairs
103 | null (filterPointers prox thr oldPointers) =
104 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
105 in if lastMatchedPrd == prd
106 then []
107 else filter (\((id,_),(id',_)) ->
108 case fil of
109 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
110 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
111 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
112 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
113 | otherwise = []
114
115
116 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
117 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
118 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
119 if (null periods)
120 then []
121 else removeOldPointers oldPointers fil thr prox lastPrd
122 {- at least on of the pair candidates should be from the last added period -}
123 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
124 $ listToKeys
125 $ filter (\(id,ngrams) ->
126 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
127 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
128 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
129 ) candidates
130 where
131 lastPrd :: PhyloPeriodId
132 lastPrd = findLastPeriod fil periods
133
134
135 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
136 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
137
138 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
139 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
140
141
142 reduceDiagos :: Map Date Cooc -> Map Int Double
143 reduceDiagos diagos = mapKeys (\(k,_) -> k)
144 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
145
146 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
147 filterPointersByPeriod fil pts =
148 let pts' = sortOn (fst . fst . fst . fst) pts
149 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
150 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
151 in map fst
152 $ nubBy (\pt pt' -> snd pt == snd pt')
153 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
154 $ case fil of
155 ToParents -> reverse pts'
156 ToChilds -> pts'
157
158 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
159 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
160 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
161 if (null $ filterPointers proxi thr oldPointers)
162 {- let's find new pointers -}
163 then if null nextPointers
164 then []
165 else filterPointersByPeriod fil
166 $ head' "phyloGroupMatching"
167 -- Keep only the best set of pointers grouped by proximity
168 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
169 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
170 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
171 else oldPointers
172 where
173 nextPointers :: [[(Pointer,[Int])]]
174 nextPointers = take 1
175 $ dropWhile (null)
176 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
177 $ scanl (\acc groups ->
178 let periods = nub $ map (fst . fst . fst) $ concat groups
179 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
180 diago = reduceDiagos
181 $ filterDiago diagos ([(fst . fst) id] ++ periods)
182 {- important resize nbdocs et diago dans le make pairs -}
183 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
184 in acc ++ ( filterPointers' proxi thr
185 $ concat
186 $ map (\(c,c') ->
187 {- process the proximity between the current group and a pair of candidates -}
188 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
189 in if ((c == c') || (snd c == snd c'))
190 then [((fst c,proximity),snd c)]
191 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
192 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
193
194
195 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
196 filterDocs d pds = restrictKeys d $ periodsToYears pds
197
198 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
199 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
200
201
202 -----------------------------
203 -- | Matching Processing | --
204 -----------------------------
205
206
207 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
208 getNextPeriods fil max' pId pIds =
209 case fil of
210 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
211 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
212
213
214 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
215 getCandidates ego targets =
216 map (\groups' ->
217 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
218 ) groups') targets
219
220
221 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
222 matchGroupsToGroups frame periods proximity thr docs coocs groups =
223 let groups' = groupByField _phylo_groupPeriod groups
224 in foldl' (\acc prd ->
225 let -- 1) find the parents/childs matching periods
226 periodsPar = getNextPeriods ToParents frame prd periods
227 periodsChi = getNextPeriods ToChilds frame prd periods
228 -- 2) find the parents/childs matching candidates
229 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
230 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
231 -- 3) find the parents/child number of docs by years
232 docsPar = filterDocs docs ([prd] ++ periodsPar)
233 docsChi = filterDocs docs ([prd] ++ periodsChi)
234 -- 4) find the parents/child diago by years
235 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
236 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
237 -- 5) match in parallel all the groups (egos) to their possible candidates
238 egos = map (\ego ->
239 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
240 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
241 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
242 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
243 in addPointers ToChilds TemporalPointer pointersChi
244 $ addPointers ToParents TemporalPointer pointersPar ego)
245 $ findWithDefault [] prd groups'
246 egos' = egos `using` parList rdeepseq
247 in acc ++ egos'
248 ) [] periods
249
250
251 -----------------------
252 -- | Phylo Quality | --
253 -----------------------
254
255
256 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
257 relevantBranches term branches =
258 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
259
260 accuracy :: Int -> [PhyloGroup] -> Double
261 accuracy x bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
262 / (fromIntegral $ length bk))
263
264 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
265 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
266 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
267
268 fScore :: Double -> Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
269 fScore beta x bk bx =
270 let rec = recall x bk bx
271 acc = accuracy x bk
272 in ((1 + beta ** 2) * acc * rec)
273 / (((beta ** 2) * rec + acc))
274
275
276 wk :: [PhyloGroup] -> Double
277 wk bk = fromIntegral $ length bk
278
279
280 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
281 toPhyloQuality' beta freq branches =
282 if (null branches)
283 then 0
284 else sum
285 $ map (\i ->
286 let bks = relevantBranches i branches
287 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i bk bks)) bks))
288 $ keys freq
289
290 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
291 toRecall freq branches =
292 if (null branches)
293 then 0
294 else sum
295 $ map (\x ->
296 let px = freq ! x
297 bx = relevantBranches x branches
298 wks = sum $ map wk bx
299 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
300 $ keys freq
301 where
302 pys :: Double
303 pys = sum (elems freq)
304
305
306 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
307 toAccuracy freq branches =
308 if (null branches)
309 then 0
310 else sum
311 $ map (\x ->
312 let px = freq ! x
313 bx = relevantBranches x branches
314 wks = sum $ map wk bx
315 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x bk)) bx))
316 $ keys freq
317 where
318 pys :: Double
319 pys = sum (elems freq)
320
321
322 -- | here we do the average of all the local f_scores
323 toPhyloQuality :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
324 toPhyloQuality beta freq branches =
325 if (null branches)
326 then 0
327 else sum
328 $ map (\x ->
329 let px = freq ! x
330 bx = relevantBranches x branches
331 wks = sum $ map wk bx
332 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
333 $ keys freq
334 where
335 pys :: Double
336 pys = sum (elems freq)
337
338
339 ------------------------------------
340 -- | Constant Temporal Matching | --
341 ------------------------------------
342
343
344 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
345 groupsToBranches groups =
346 -- run the related component algorithm
347 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
348 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
349 $ map (\group -> [getGroupId group]
350 ++ (map fst $ group ^. phylo_groupPeriodParents)
351 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
352 -- first find the related components by inside each ego's period
353 -- a supprimer
354 graph' = map relatedComponents egos
355 -- then run it for the all the periods
356 graph = zip [1..]
357 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
358 -- update each group's branch id
359 in map (\(bId,ids) ->
360 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
361 $ elems $ restrictKeys groups (Set.fromList ids)
362 in groups' `using` parList rdeepseq ) graph
363
364
365 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
366 reduceFrequency frequency branches =
367 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
368
369 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
370 updateThr thr branches = map (\b -> map (\g ->
371 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
372
373
374 -- Sequentially break each branch of a phylo where
375 -- done = all the allready broken branches
376 -- ego = the current branch we want to break
377 -- rest = the branches we still have to break
378 breakBranches :: Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
379 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
380 breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
381 -- 1) keep or not the new division of ego
382 let done' = done ++ (if snd ego
383 then
384 (if ((null (fst ego')) || (quality > quality'))
385 then
386 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
387 -- <> " | " <> show(length $ fst ego) <> " groups : "
388 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
389 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
390 [(fst ego,False)]
391 else
392 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
393 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
394 -- <> " | " <> show(length $ fst ego) <> " groups : "
395 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
396 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
397 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
398 else [ego])
399 in
400 -- 2) if there is no more branches in rest then return else continue
401 if null rest
402 then done'
403 else breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
404 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
405 where
406 --------------------------------------
407 quality :: Double
408 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
409 --------------------------------------
410 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
411 ego' =
412 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
413 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
414 branches' = branches `using` parList rdeepseq
415 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
416 $ thrToMeta thr
417 $ depthToMeta (elevation - depth) branches'
418 --------------------------------------
419 quality' :: Double
420 quality' = toPhyloQuality beta frequency
421 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
422
423
424 seaLevelMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
425 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
426 seaLevelMatching proximity beta minBranch frequency thr step depth elevation frame periods docs coocs branches =
427 -- if there is no branch to break or if seaLvl level > 1 then end
428 if (thr >= 1) || ((not . or) $ map snd branches)
429 then branches
430 else
431 -- break all the possible branches at the current seaLvl level
432 let quality = toPhyloQuality beta frequency (map fst branches)
433 acc = toAccuracy frequency (map fst branches)
434 rec = toRecall frequency (map fst branches)
435 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(β) = " <> printf "%.5f" quality
436 <> " ξ = " <> printf "%.5f" acc
437 <> " ρ = " <> printf "%.5f" rec
438 <> " branches = " <> show(length branches) <> " ↴")
439 $ breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
440 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
441 frequency' = reduceFrequency frequency (map fst branches')
442 in seaLevelMatching proximity beta minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
443
444
445 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
446 constanteTemporalMatching start step phylo = updatePhyloGroups 1
447 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
448 (toPhyloHorizon phylo)
449 where
450 -- 2) process the temporal matching by elevating seaLvl level
451 branches :: [[PhyloGroup]]
452 branches = map fst
453 $ seaLevelMatching (phyloProximity $ getConfig phylo)
454 (_qua_granularity $ phyloQuality $ getConfig phylo)
455 (_qua_minBranch $ phyloQuality $ getConfig phylo)
456 (phylo ^. phylo_termFreq)
457 start step
458 ((((1 - start) / step) - 1))
459 (((1 - start) / step))
460 (getTimeFrame $ timeUnit $ getConfig phylo)
461 (getPeriodIds phylo)
462 (phylo ^. phylo_timeDocs)
463 (phylo ^. phylo_timeCooc)
464 groups
465 -- 1) for each group process an initial temporal Matching
466 -- here we suppose that all the groups of level 1 are part of the same big branch
467 groups :: [([PhyloGroup],Bool)]
468 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
469 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
470 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
471 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
472 start
473 (phylo ^. phylo_timeDocs)
474 (phylo ^. phylo_timeCooc)
475 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
476
477 -----------------
478 -- | Horizon | --
479 -----------------
480
481 toPhyloHorizon :: Phylo -> Phylo
482 toPhyloHorizon phylo =
483 let t0 = take 1 (getPeriodIds phylo)
484 groups = getGroupsFromLevelPeriods 1 t0 phylo
485 sens = getSensibility (phyloProximity $ getConfig phylo)
486 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
487 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
488 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
489 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
490
491
492 --------------------------------------
493 -- | Adaptative Temporal Matching | --
494 --------------------------------------
495
496
497 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
498 thrToMeta thr branches =
499 map (\b ->
500 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
501
502 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
503 depthToMeta depth branches =
504 let break = length branches > 1
505 in map (\b ->
506 map (\g ->
507 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
508 else g) b) branches
509
510 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
511 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
512
513
514 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
515 getInTupleMap m k k'
516 | isJust (m !? ( k ,k')) = m ! ( k ,k')
517 | isJust (m !? ( k',k )) = m ! ( k',k )
518 | otherwise = 0
519
520
521 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
522 toThreshold lvl proxiGroups =
523 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
524 in if idx >= 0
525 then (sort $ elems proxiGroups) !! idx
526 else 1
527
528
529 -- done = all the allready broken branches
530 -- ego = the current branch we want to break
531 -- rest = the branches we still have to break
532 adaptativeBreakBranches :: Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
533 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
534 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
535 -> [([PhyloGroup],(Bool,[Double]))]
536 adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods done ego rest =
537 -- 1) keep or not the new division of ego
538 let done' = done ++ (if (fst . snd) ego
539 then (if ((null (fst ego')) || (quality > quality'))
540 then
541 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
542 else
543 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
544 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
545 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
546 in
547 -- uncomment let .. in for debugging
548 -- let part1 = partition (snd) done'
549 -- part2 = partition (snd) rest
550 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
551 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
552 -- ) $
553 -- 2) if there is no more branches in rest then return else continue
554 if null rest
555 then done'
556 else adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
557 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
558 where
559 --------------------------------------
560 thr :: Double
561 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
562 --------------------------------------
563 quality :: Double
564 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
565 --------------------------------------
566 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
567 ego' =
568 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
569 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
570 branches' = branches `using` parList rdeepseq
571 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
572 $ thrToMeta thr
573 $ depthToMeta (elevation - depth) branches'
574 --------------------------------------
575 quality' :: Double
576 quality' = toPhyloQuality beta frequency
577 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
578
579
580 adaptativeSeaLevelMatching :: Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
581 -> Double -> Int -> Map Int Double
582 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
583 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
584 adaptativeSeaLevelMatching proxiConf depth elevation groupsProxi beta minBranch frequency frame periods docs coocs branches =
585 -- if there is no branch to break or if seaLvl level >= depth then end
586 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
587 then branches
588 else
589 -- break all the possible branches at the current seaLvl level
590 let branches' = adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
591 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
592 frequency' = reduceFrequency frequency (map fst branches')
593 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
594 -- thr = toThreshold depth groupsProxi
595 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
596 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
597 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
598 <> " thr = ")
599 $ adaptativeSeaLevelMatching proxiConf (depth - 1) elevation groupsProxi' beta minBranch frequency' frame periods docs coocs branches'
600
601
602 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
603 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
604 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
605 (toPhyloHorizon phylo)
606 where
607 -- 2) process the temporal matching by elevating seaLvl level
608 branches :: [[PhyloGroup]]
609 branches = map fst
610 $ adaptativeSeaLevelMatching (phyloProximity $ getConfig phylo)
611 (elevation - 1)
612 elevation
613 (phylo ^. phylo_groupsProxi)
614 (_qua_granularity $ phyloQuality $ getConfig phylo)
615 (_qua_minBranch $ phyloQuality $ getConfig phylo)
616 (phylo ^. phylo_termFreq)
617 (getTimeFrame $ timeUnit $ getConfig phylo)
618 (getPeriodIds phylo)
619 (phylo ^. phylo_timeDocs)
620 (phylo ^. phylo_timeCooc)
621 groups
622 -- 1) for each group process an initial temporal Matching
623 -- here we suppose that all the groups of level 1 are part of the same big branch
624 groups :: [([PhyloGroup],(Bool,[Double]))]
625 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
626 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
627 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
628 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
629 thr
630 (phylo ^. phylo_timeDocs)
631 (phylo ^. phylo_timeCooc)
632 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
633 --------------------------------------
634 thr :: Double
635 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)