]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
add recall and accuracy as logs
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, or, sort, (!!))
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 import Prelude (floor)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 import Debug.Trace (trace)
29
30 import Text.Printf
31
32 import qualified Data.Map as Map
33 import qualified Data.Set as Set
34
35
36 -------------------
37 -- | Proximity | --
38 -------------------
39
40
41 -- | To compute a jaccard similarity between two lists
42 jaccard :: [Int] -> [Int] -> Double
43 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
44
45
46 -- | Process the inverse sumLog
47 sumInvLog' :: Double -> Double -> [Double] -> Double
48 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + s) / log (nb + s)))) 0 diago
49
50
51 -- | Process the sumLog
52 sumLog' :: Double -> Double -> [Double] -> Double
53 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + s) / log (nb + s))) 0 diago
54
55
56 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
57 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
58 | null ngramsInter = 0
59 | ngramsInter == ngramsUnion = 1
60 | sens == 0 = jaccard ngramsInter ngramsUnion
61 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
62 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
63 where
64 --------------------------------------
65 ngramsInter :: [Int]
66 ngramsInter = intersect ngrams ngrams'
67 --------------------------------------
68 ngramsUnion :: [Int]
69 ngramsUnion = union ngrams ngrams'
70 --------------------------------------
71 diagoInter :: [Double]
72 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
73 --------------------------------------
74 diagoUnion :: [Double]
75 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
76 --------------------------------------
77
78
79 -- | To process the proximity between a current group and a pair of targets group
80 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
81 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
82 case proximity of
83 WeightedLogJaccard sens ->
84 let pairNgrams = if targetNgrams == targetNgrams'
85 then targetNgrams
86 else union targetNgrams targetNgrams'
87 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
88 Hamming -> undefined
89
90
91 ------------------------
92 -- | Local Matching | --
93 ------------------------
94
95 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
96 findLastPeriod fil periods = case fil of
97 ToParents -> head' "findLastPeriod" (sortOn fst periods)
98 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
99
100
101 -- | To filter pairs of candidates related to old pointers periods
102 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
103 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
104 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
105 removeOldPointers oldPointers fil thr prox prd pairs
106 | null oldPointers = pairs
107 | null (filterPointers prox thr oldPointers) =
108 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
109 in if lastMatchedPrd == prd
110 then []
111 else filter (\((id,_),(id',_)) ->
112 case fil of
113 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
114 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
115 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
116 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
117 | otherwise = []
118
119
120 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
121 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
122 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
123 if (null periods)
124 then []
125 else removeOldPointers oldPointers fil thr prox lastPrd
126 -- | at least on of the pair candidates should be from the last added period
127 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
128 $ listToKeys
129 $ filter (\(id,ngrams) ->
130 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
131 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
132 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
133 ) candidates
134 where
135 lastPrd :: PhyloPeriodId
136 lastPrd = findLastPeriod fil periods
137
138
139 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
140 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
141
142
143 reduceDiagos :: Map Date Cooc -> Map Int Double
144 reduceDiagos diagos = mapKeys (\(k,_) -> k)
145 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
146
147
148 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
149 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
150 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
151 if (null $ filterPointers proxi thr oldPointers)
152 -- | let's find new pointers
153 then if null nextPointers
154 then []
155 else head' "phyloGroupMatching"
156 -- | Keep only the best set of pointers grouped by proximity
157 $ groupBy (\pt pt' -> snd pt == snd pt')
158 $ reverse $ sortOn snd $ head' "pointers" nextPointers
159 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
160 else oldPointers
161 where
162 nextPointers :: [[Pointer]]
163 nextPointers = take 1
164 $ dropWhile (null)
165 -- | for each time frame, process the proximity on relevant pairs of targeted groups
166 $ scanl (\acc groups ->
167 let periods = nub $ map (fst . fst . fst) $ concat groups
168 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
169 diago = reduceDiagos
170 $ filterDiago diagos ([(fst . fst) id] ++ periods)
171 -- | important resize nbdocs et diago dans le make pairs
172 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
173 in acc ++ ( filterPointers proxi thr
174 $ concat
175 $ map (\(c,c') ->
176 -- | process the proximity between the current group and a pair of candidates
177 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
178 in if (c == c')
179 then [(fst c,proximity)]
180 else [(fst c,proximity),(fst c',proximity)] ) pairs )) []
181 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
182
183
184 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
185 filterDocs d pds = restrictKeys d $ periodsToYears pds
186
187 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
188 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
189
190
191 -----------------------------
192 -- | Matching Processing | --
193 -----------------------------
194
195
196 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
197 getNextPeriods fil max' pId pIds =
198 case fil of
199 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
200 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
201
202
203 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
204 getCandidates ego targets =
205 map (\groups' ->
206 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
207 ) groups') targets
208
209
210 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
211 matchGroupsToGroups frame periods proximity thr docs coocs groups =
212 let groups' = groupByField _phylo_groupPeriod groups
213 in foldl' (\acc prd ->
214 let -- | 1) find the parents/childs matching periods
215 periodsPar = getNextPeriods ToParents frame prd periods
216 periodsChi = getNextPeriods ToChilds frame prd periods
217 -- | 2) find the parents/childs matching candidates
218 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
219 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
220 -- | 3) find the parents/child number of docs by years
221 docsPar = filterDocs docs ([prd] ++ periodsPar)
222 docsChi = filterDocs docs ([prd] ++ periodsChi)
223 -- | 4) find the parents/child diago by years
224 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
225 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
226 -- | 5) match in parallel all the groups (egos) to their possible candidates
227 egos = map (\ego ->
228 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
229 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
230 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
231 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
232 in addPointers ToChilds TemporalPointer pointersChi
233 $ addPointers ToParents TemporalPointer pointersPar ego)
234 $ findWithDefault [] prd groups'
235 egos' = egos `using` parList rdeepseq
236 in acc ++ egos'
237 ) [] periods
238
239
240 -----------------------
241 -- | Phylo Quality | --
242 -----------------------
243
244
245 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
246 relevantBranches term branches =
247 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
248
249 accuracy :: Int -> [PhyloGroup] -> Double
250 accuracy x bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
251 / (fromIntegral $ length bk))
252
253 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
254 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
255 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
256
257 fScore :: Double -> Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
258 fScore beta x bk bx =
259 let rec = recall x bk bx
260 acc = accuracy x bk
261 in ((1 + beta ** 2) * acc * rec)
262 / (((beta ** 2) * rec + acc))
263
264
265 wk :: [PhyloGroup] -> Double
266 wk bk = fromIntegral $ length bk
267
268
269 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
270 toPhyloQuality' beta freq branches =
271 if (null branches)
272 then 0
273 else sum
274 $ map (\i ->
275 let bks = relevantBranches i branches
276 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i bk bks)) bks))
277 $ keys freq
278
279 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
280 toRecall freq branches =
281 if (null branches)
282 then 0
283 else sum
284 $ map (\x ->
285 let px = freq ! x
286 bx = relevantBranches x branches
287 wks = sum $ map wk bx
288 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
289 $ keys freq
290 where
291 pys :: Double
292 pys = sum (elems freq)
293
294
295 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
296 toAccuracy freq branches =
297 if (null branches)
298 then 0
299 else sum
300 $ map (\x ->
301 let px = freq ! x
302 bx = relevantBranches x branches
303 wks = sum $ map wk bx
304 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x bk)) bx))
305 $ keys freq
306 where
307 pys :: Double
308 pys = sum (elems freq)
309
310
311 -- | here we do the average of all the local f_scores
312 toPhyloQuality :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
313 toPhyloQuality beta freq branches =
314 if (null branches)
315 then 0
316 else sum
317 $ map (\x ->
318 let px = freq ! x
319 bx = relevantBranches x branches
320 wks = sum $ map wk bx
321 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
322 $ keys freq
323 where
324 pys :: Double
325 pys = sum (elems freq)
326
327
328 ------------------------------------
329 -- | Constant Temporal Matching | --
330 ------------------------------------
331
332
333 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
334 groupsToBranches groups =
335 -- | run the related component algorithm
336 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
337 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
338 $ map (\group -> [getGroupId group]
339 ++ (map fst $ group ^. phylo_groupPeriodParents)
340 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
341 -- | first find the related components by inside each ego's period
342 -- | a supprimer
343 graph' = map relatedComponents egos
344 -- | then run it for the all the periods
345 graph = zip [1..]
346 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
347 -- | update each group's branch id
348 in map (\(bId,ids) ->
349 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
350 $ elems $ restrictKeys groups (Set.fromList ids)
351 in groups' `using` parList rdeepseq ) graph
352
353
354 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
355 reduceFrequency frequency branches =
356 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
357
358 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
359 updateThr thr branches = map (\b -> map (\g ->
360 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
361
362
363 -- | Sequentially break each branch of a phylo where
364 -- done = all the allready broken branches
365 -- ego = the current branch we want to break
366 -- rest = the branches we still have to break
367 breakBranches :: Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
368 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
369 breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
370 -- | 1) keep or not the new division of ego
371 let done' = done ++ (if snd ego
372 then
373 (if ((null (fst ego')) || (quality > quality'))
374 then
375 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
376 -- <> " | " <> show(length $ fst ego) <> " groups : "
377 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
378 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
379 [(fst ego,False)]
380 else
381 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
382 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
383 -- <> " | " <> show(length $ fst ego) <> " groups : "
384 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
385 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
386 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
387 else [ego])
388 in
389 -- | 2) if there is no more branches in rest then return else continue
390 if null rest
391 then done'
392 else breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
393 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
394 where
395 --------------------------------------
396 quality :: Double
397 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
398 --------------------------------------
399 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
400 ego' =
401 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
402 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
403 branches' = branches `using` parList rdeepseq
404 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
405 $ thrToMeta thr
406 $ depthToMeta (elevation - depth) branches'
407 --------------------------------------
408 quality' :: Double
409 quality' = toPhyloQuality beta frequency
410 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
411
412
413 seaLevelMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
414 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
415 seaLevelMatching proximity beta minBranch frequency thr step depth elevation frame periods docs coocs branches =
416 -- | if there is no branch to break or if seaLvl level > 1 then end
417 if (thr >= 1) || ((not . or) $ map snd branches)
418 then branches
419 else
420 -- | break all the possible branches at the current seaLvl level
421 let quality = toPhyloQuality beta frequency (map fst branches)
422 acc = toAccuracy frequency (map fst branches)
423 rec = toRecall frequency (map fst branches)
424 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(β) = " <> printf "%.5f" quality
425 <> " ξ = " <> printf "%.5f" acc
426 <> " ρ = " <> printf "%.5f" rec
427 <> " branches = " <> show(length branches) <> " ↴")
428 $ breakBranches proximity beta frequency minBranch thr depth elevation frame docs coocs periods
429 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
430 frequency' = reduceFrequency frequency (map fst branches')
431 in seaLevelMatching proximity beta minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
432
433
434 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
435 constanteTemporalMatching start step phylo = updatePhyloGroups 1
436 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
437 (toPhyloHorizon phylo)
438 where
439 -- | 2) process the temporal matching by elevating seaLvl level
440 branches :: [[PhyloGroup]]
441 branches = map fst
442 $ seaLevelMatching (phyloProximity $ getConfig phylo)
443 (_qua_granularity $ phyloQuality $ getConfig phylo)
444 (_qua_minBranch $ phyloQuality $ getConfig phylo)
445 (phylo ^. phylo_termFreq)
446 start step
447 ((((1 - start) / step) - 1))
448 (((1 - start) / step))
449 (getTimeFrame $ timeUnit $ getConfig phylo)
450 (getPeriodIds phylo)
451 (phylo ^. phylo_timeDocs)
452 (phylo ^. phylo_timeCooc)
453 groups
454 -- | 1) for each group process an initial temporal Matching
455 -- | here we suppose that all the groups of level 1 are part of the same big branch
456 groups :: [([PhyloGroup],Bool)]
457 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
458 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
459 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
460 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
461 start
462 (phylo ^. phylo_timeDocs)
463 (phylo ^. phylo_timeCooc)
464 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
465
466 -----------------
467 -- | Horizon | --
468 -----------------
469
470 toPhyloHorizon :: Phylo -> Phylo
471 toPhyloHorizon phylo =
472 let t0 = take 1 (getPeriodIds phylo)
473 groups = getGroupsFromLevelPeriods 1 t0 phylo
474 sens = getSensibility (phyloProximity $ getConfig phylo)
475 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
476 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
477 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
478 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
479
480
481 --------------------------------------
482 -- | Adaptative Temporal Matching | --
483 --------------------------------------
484
485
486 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
487 thrToMeta thr branches =
488 map (\b ->
489 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
490
491 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
492 depthToMeta depth branches =
493 let break = length branches > 1
494 in map (\b ->
495 map (\g ->
496 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
497 else g) b) branches
498
499 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
500 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
501
502
503 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
504 getInTupleMap m k k'
505 | isJust (m !? ( k ,k')) = m ! ( k ,k')
506 | isJust (m !? ( k',k )) = m ! ( k',k )
507 | otherwise = 0
508
509
510 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
511 toThreshold lvl proxiGroups =
512 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
513 in if idx >= 0
514 then (sort $ elems proxiGroups) !! idx
515 else 1
516
517
518 -- done = all the allready broken branches
519 -- ego = the current branch we want to break
520 -- rest = the branches we still have to break
521 adaptativeBreakBranches :: Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
522 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
523 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
524 -> [([PhyloGroup],(Bool,[Double]))]
525 adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods done ego rest =
526 -- | 1) keep or not the new division of ego
527 let done' = done ++ (if (fst . snd) ego
528 then (if ((null (fst ego')) || (quality > quality'))
529 then
530 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
531 else
532 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
533 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
534 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
535 in
536 -- | uncomment let .. in for debugging
537 -- let part1 = partition (snd) done'
538 -- part2 = partition (snd) rest
539 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
540 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
541 -- ) $
542 -- | 2) if there is no more branches in rest then return else continue
543 if null rest
544 then done'
545 else adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
546 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
547 where
548 --------------------------------------
549 thr :: Double
550 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
551 --------------------------------------
552 quality :: Double
553 quality = toPhyloQuality beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
554 --------------------------------------
555 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
556 ego' =
557 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
558 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
559 branches' = branches `using` parList rdeepseq
560 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
561 $ thrToMeta thr
562 $ depthToMeta (elevation - depth) branches'
563 --------------------------------------
564 quality' :: Double
565 quality' = toPhyloQuality beta frequency
566 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
567
568
569 adaptativeSeaLevelMatching :: Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
570 -> Double -> Int -> Map Int Double
571 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
572 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
573 adaptativeSeaLevelMatching proxiConf depth elevation groupsProxi beta minBranch frequency frame periods docs coocs branches =
574 -- | if there is no branch to break or if seaLvl level >= depth then end
575 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
576 then branches
577 else
578 -- | break all the possible branches at the current seaLvl level
579 let branches' = adaptativeBreakBranches proxiConf depth elevation groupsProxi beta frequency minBranch frame docs coocs periods
580 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
581 frequency' = reduceFrequency frequency (map fst branches')
582 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
583 -- thr = toThreshold depth groupsProxi
584 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
585 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
586 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
587 <> " thr = ")
588 $ adaptativeSeaLevelMatching proxiConf (depth - 1) elevation groupsProxi' beta minBranch frequency' frame periods docs coocs branches'
589
590
591 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
592 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
593 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
594 (toPhyloHorizon phylo)
595 where
596 -- | 2) process the temporal matching by elevating seaLvl level
597 branches :: [[PhyloGroup]]
598 branches = map fst
599 $ adaptativeSeaLevelMatching (phyloProximity $ getConfig phylo)
600 (elevation - 1)
601 elevation
602 (phylo ^. phylo_groupsProxi)
603 (_qua_granularity $ phyloQuality $ getConfig phylo)
604 (_qua_minBranch $ phyloQuality $ getConfig phylo)
605 (phylo ^. phylo_termFreq)
606 (getTimeFrame $ timeUnit $ getConfig phylo)
607 (getPeriodIds phylo)
608 (phylo ^. phylo_timeDocs)
609 (phylo ^. phylo_timeCooc)
610 groups
611 -- | 1) for each group process an initial temporal Matching
612 -- | here we suppose that all the groups of level 1 are part of the same big branch
613 groups :: [([PhyloGroup],(Bool,[Double]))]
614 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
615 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
616 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
617 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
618 thr
619 (phylo ^. phylo_timeDocs)
620 (phylo ^. phylo_timeCooc)
621 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
622 --------------------------------------
623 thr :: Double
624 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)