]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
fix recursive matching
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, delete, and)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey, toList)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 -- import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 import Debug.Trace (trace)
29
30 import qualified Data.Set as Set
31 import qualified Data.Map as Map
32
33
34 -------------------
35 -- | Proximity | --
36 -------------------
37
38
39 -- | Process the inverse sumLog
40 sumInvLog :: Double -> [Double] -> Double
41 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
42
43
44 -- | Process the sumLog
45 sumLog :: Double -> [Double] -> Double
46 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
47
48
49 -- | To compute a jaccard similarity between two lists
50 jaccard :: [Int] -> [Int] -> Double
51 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
52
53
54 -- | To process a WeighedLogJaccard distance between to coocurency matrix
55 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
56 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
57 | null ngramsInter = 0
58 | ngramsInter == ngramsUnion = 1
59 | sens == 0 = jaccard ngramsInter ngramsUnion
60 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
61 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
62 where
63 --------------------------------------
64 ngramsInter :: [Int]
65 ngramsInter = intersect ngrams ngrams'
66 --------------------------------------
67 ngramsUnion :: [Int]
68 ngramsUnion = union ngrams ngrams'
69 --------------------------------------
70 coocInter :: [Double]
71 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
72 -- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
73 --------------------------------------
74 coocUnion :: [Double]
75 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
76 --------------------------------------
77
78
79 -- | To choose a proximity function
80 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
81 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
82 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
83 Hamming -> undefined
84
85
86 -- | To process the proximity between a current group and a pair of targets group
87 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
88 toProximity docs proximity ego target target' =
89 let docs' = sum $ elems docs
90 cooc = if target == target'
91 then (target ^. phylo_groupCooc)
92 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
93 ngrams = if target == target'
94 then (target ^. phylo_groupNgrams)
95 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
96 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
97
98
99 ------------------------
100 -- | Local Matching | --
101 ------------------------
102
103 toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
104 toLastPeriod fil periods = case fil of
105 ToParents -> head' "toLastPeriod" (sortOn fst periods)
106 ToChilds -> last' "toLastPeriod" (sortOn fst periods)
107
108
109 toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
110 toLazyPairs pointers fil thr prox prd pairs =
111 if null pointers then pairs
112 else let rest = filterPointers prox thr pointers
113 in if null rest
114 then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
115 in if prd' == prd
116 then []
117 else filter (\(g,g') ->
118 case fil of
119 ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
120 || ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
121 ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
122 || ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
123 else []
124
125
126 -- | Find pairs of valuable candidates to be matched
127 makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
128 makePairs' ego candidates periods pointers fil thr prox docs =
129 case null periods of
130 True -> []
131 False -> toLazyPairs pointers fil thr prox lastPrd
132 -- | at least on of the pair candidates should be from the last added period
133 $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
134 || ((g' ^. phylo_groupPeriod) == lastPrd))
135 $ listToKeys
136 $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
137 || ((toProximity docs prox ego ego g) >= thr)) candidates
138 where
139 lastPrd :: PhyloPeriodId
140 lastPrd = toLastPeriod fil periods
141
142
143 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
144 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
145
146
147 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
148 phyloGroupMatching candidates fil proxi docs thr ego =
149 case null nextPointers of
150 -- | let's find new pointers
151 True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
152 then addPointers ego fil TemporalPointer []
153 -- | or keep the old ones
154 else addPointers ego fil TemporalPointer
155 $ filterPointers proxi thr $ getPeriodPointers fil ego
156 False -> addPointers ego fil TemporalPointer
157 $ head' "phyloGroupMatching"
158 -- | Keep only the best set of pointers grouped by proximity
159 $ groupBy (\pt pt' -> snd pt == snd pt')
160 $ reverse $ sortOn snd $ head' "pointers"
161 $ nextPointers
162 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
163 where
164 nextPointers :: [[Pointer]]
165 nextPointers = take 1
166 $ dropWhile (null)
167 -- | for each time frame, process the proximity on relevant pairs of targeted groups
168 $ scanl (\acc groups ->
169 let periods = nub $ map _phylo_groupPeriod $ concat groups
170 docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
171 pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
172 in acc ++ ( filterPointers proxi thr
173 $ concat
174 $ map (\(c,c') ->
175 -- | process the proximity between the current group and a pair of candidates
176 let proximity = toProximity docs' proxi ego c c'
177 in if (c == c')
178 then [(getGroupId c,proximity)]
179 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
180 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
181
182
183 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
184 filterDocs d pds = restrictKeys d $ periodsToYears pds
185
186
187 -----------------------------
188 -- | Matching Processing | --
189 -----------------------------
190
191
192 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
193 getNextPeriods fil max' pId pIds =
194 case fil of
195 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
196 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
197
198
199 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
200 getCandidates fil ego targets =
201 case fil of
202 ToChilds -> targets'
203 ToParents -> reverse targets'
204 where
205 targets' :: [[PhyloGroup]]
206 targets' =
207 map (\groups' ->
208 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
209 ) groups') targets
210
211
212 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
213 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
214 $ matchByPeriods
215 $ groupByField _phylo_groupPeriod branch
216 where
217 --------------------------------------
218 matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
219 matchByPeriods branch' = foldl' (\acc prd ->
220 let periodsPar = getNextPeriods ToParents frame prd periods
221 periodsChi = getNextPeriods ToChilds frame prd periods
222 candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
223 candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
224 docsPar = filterDocs docs ([prd] ++ periodsPar)
225 docsChi = filterDocs docs ([prd] ++ periodsChi)
226 egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
227 $ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
228 $ findWithDefault [] prd branch'
229 egos' = egos `using` parList rdeepseq
230 in acc ++ egos' ) [] periods
231
232
233 -----------------------
234 -- | Phylo Quality | --
235 -----------------------
236
237
238 count :: Eq a => a -> [a] -> Int
239 count x = length . filter (== x)
240
241 termFreq' :: Int -> [PhyloGroup] -> Double
242 termFreq' term groups =
243 let ngrams = concat $ map _phylo_groupNgrams groups
244 in log ((fromIntegral $ count term ngrams)
245 / (fromIntegral $ length ngrams))
246
247 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
248 relevantBranches term branches =
249 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
250
251 branchCov' :: [PhyloGroup] -> [[PhyloGroup]] -> Double
252 branchCov' branch branches =
253 (fromIntegral $ length branch) / (fromIntegral $ length $ concat branches)
254
255
256 toRecall :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
257 toRecall freq term border branches =
258 -- | given a random term in a phylo
259 freq
260 -- | for each relevant branches
261 * (sum $ map (\branch ->
262 -- | given its local coverage
263 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
264 -- | compute the local recall
265 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
266 / ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) $ concat branches')
267 -- | with a ponderation from border branches
268 + (fromIntegral border)) )) branches')
269 where
270 branches' :: [[PhyloGroup]]
271 branches' = relevantBranches term branches
272
273
274 toAccuracy :: Double -> Int -> [[PhyloGroup]] -> Double
275 toAccuracy freq term branches =
276 if (null branches)
277 then 0
278 else
279 -- | given a random term in a phylo
280 freq
281 -- | for each relevant branches
282 * (sum $ map (\branch ->
283 -- | given its local coverage
284 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
285 -- | compute the local accuracy
286 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
287 / (fromIntegral $ length branch))) branches')
288 where
289 branches' :: [[PhyloGroup]]
290 branches' = relevantBranches term branches
291
292
293 fScore :: Double -> Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
294 fScore beta i bk bks =
295 let recall = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
296 / (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks))
297 accuracy = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
298 / (fromIntegral $ length bk))
299 in ((1 + beta ** 2) * accuracy * recall)
300 / (((beta ** 2) * accuracy + recall))
301
302
303 wk :: [PhyloGroup] -> Double
304 wk bk = fromIntegral $ length bk
305
306
307 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
308 toPhyloQuality' beta freq branches =
309 if (null branches)
310 then 0
311 else sum
312 $ map (\i ->
313 let bks = relevantBranches i branches
314 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i bk bks)) bks))
315 $ keys freq
316
317
318
319 toPhyloQuality :: Double -> Map Int Double -> Int -> Double -> [[PhyloGroup]] -> Double
320 toPhyloQuality beta frequency border oldAcc branches =
321 -- trace (" rec : " <> show(recall)) $
322 -- trace (" acc : " <> show(accuracy)) $
323 if (null branches)
324 then 0
325 else ((1 + beta ** 2) * accuracy * recall)
326 / (((beta ** 2) * accuracy + recall))
327 where
328 -- | for each term compute the global accuracy
329 accuracy :: Double
330 accuracy = oldAcc + (sum $ map (\term -> toAccuracy (frequency ! term) term branches) $ keys frequency)
331 -- | for each term compute the global recall
332 recall :: Double
333 recall = sum $ map (\term -> toRecall (frequency ! term) term border branches) $ keys frequency
334
335
336 toBorderAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
337 toBorderAccuracy freq branches = sum $ map (\t -> toAccuracy (freq ! t) t branches) $ keys freq
338
339
340 -----------------------------
341 -- | Adaptative Matching | --
342 -----------------------------
343
344
345 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
346 groupsToBranches groups =
347 -- | run the related component algorithm
348 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
349 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
350 $ map (\group -> [getGroupId group]
351 ++ (map fst $ group ^. phylo_groupPeriodParents)
352 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
353 -- | first find the related components by inside each ego's period
354 graph' = map relatedComponents egos
355 -- | then run it for the all the periods
356 graph = zip [1..]
357 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
358 -- | update each group's branch id
359 in map (\(bId,ids) ->
360 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
361 $ elems $ restrictKeys groups (Set.fromList ids)
362 in groups' `using` parList rdeepseq ) graph
363
364
365 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
366 reduceFrequency frequency branches =
367 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
368
369
370 alterBorder :: Int -> [[PhyloGroup]] -> [PhyloGroup] -> Int
371 alterBorder border branches branch = border + (length $ concat branches) - (length branch)
372
373
374 seqMatching :: Proximity -> Double -> Map Int Double -> Int -> Double -> Int -> Map Date Double -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
375 seqMatching proximity beta frequency minBranch egoThr frame docs periods done ego rest =
376 -- | 1) keep or not the new division of ego
377 let done' = done ++ (if snd ego
378 then (if ((null ego') || (quality > quality'))
379 then trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
380 <> " | " <> show((length done) + (length ego') + (length rest))
381 <> "["
382 <> show((length $ concat $ map fst done) + (length $ concat ego') + (length $ concat $ map fst rest))
383 <> "]")
384 $ [(fst ego,False)]
385 else trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
386 <> " | " <> show((length done) + (length ego') + (length rest))
387 <> "["
388 <> show((length $ concat $ map fst done) + (length $ concat ego') + (length $ concat $ map fst rest))
389 <> "]")
390 $ (map (\e -> (e,True)) ego'))
391 else [ego])
392 in
393 -- | 2) if there is no more branches in rest then return else continue
394 if null rest
395 then done'
396 else seqMatching proximity beta frequency minBranch egoThr frame docs periods
397 done' (head' "seqMatching" rest) (tail' "seqMatching" rest)
398 where
399 --------------------------------------
400 quality :: Double
401 quality = toPhyloQuality' beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
402 --------------------------------------
403 ego' :: [[PhyloGroup]]
404 ego' =
405 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
406 $ phyloBranchMatching frame periods proximity egoThr docs (fst ego)
407 branches' = branches `using` parList rdeepseq
408 in filter (\b -> length b >= minBranch) branches'
409 --------------------------------------
410 quality' :: Double
411 quality' = toPhyloQuality' beta (reduceFrequency frequency ((map fst done) ++ ego' ++ (map fst rest)))
412 ((map fst done) ++ ego' ++ (map fst rest))
413
414
415 recursiveMatching' :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
416 recursiveMatching' proximity beta minBranch frequency egoThr frame periods docs branches =
417 if (egoThr >= 1) || ((not . and) $ map snd branches)
418 then branches
419 else
420 let branches' = seqMatching proximity beta frequency minBranch egoThr frame docs periods
421 [] (head' "recursiveMatching" branches) (tail' "recursiveMatching" branches)
422 frequency' = reduceFrequency frequency (map fst branches')
423 in recursiveMatching' proximity beta minBranch frequency' (egoThr + (getThresholdStep proximity)) frame periods docs branches'
424
425
426 recursiveMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> Int -> Double -> [PhyloGroup] -> [PhyloGroup]
427 recursiveMatching proximity beta minBranch frequency egoThr frame periods docs quality border oldAcc groups =
428 if ((egoThr >= 1) || (quality > quality') || ((length $ concat $ snd branches') == (length groups)))
429 then
430 trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
431 <> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
432 <> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
433 groups
434 else
435 let next = map (\b -> recursiveMatching proximity beta minBranch
436 (reduceFrequency frequency (fst branches'))
437 (egoThr + (getThresholdStep proximity))
438 frame periods docs quality'
439 (alterBorder border (fst branches') b)
440 (oldAcc + (toBorderAccuracy frequency (delete b ((fst branches') ++ (snd branches')))))
441 b ) (fst branches')
442 in trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
443 <> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
444 <> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
445 concat (next ++ (snd branches'))
446 where
447 -- | 2) for each of the possible next branches process the phyloQuality score
448 quality' :: Double
449 quality' = toPhyloQuality beta frequency border oldAcc ((fst branches') ++ (snd branches'))
450 -- | 1) for each local branch process a temporal matching then find the resulting branches
451 branches' :: ([[PhyloGroup]],[[PhyloGroup]])
452 branches' =
453 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
454 $ phyloBranchMatching frame periods proximity egoThr docs groups
455 in partition (\b -> length b >= minBranch) (branches `using` parList rdeepseq)
456
457
458 temporalMatching :: Phylo -> Phylo
459 temporalMatching phylo = updatePhyloGroups 1
460 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
461 phylo
462 where
463 branches :: [[PhyloGroup]]
464 branches = map fst
465 $ recursiveMatching' (phyloProximity $ getConfig phylo)
466 (_qua_relevance $ phyloQuality $ getConfig phylo)
467 (_qua_minBranch $ phyloQuality $ getConfig phylo)
468 frequency
469 (getThresholdInit $ phyloProximity $ getConfig phylo)
470 (getTimeFrame $ timeUnit $ getConfig phylo)
471 (getPeriodIds phylo)
472 (phylo ^. phylo_timeDocs)
473 [(groups,True)]
474 -- | 2) process the constants of the quality score
475 frequency :: Map Int Double
476 frequency =
477 let terms = ngramsInBranches [groups]
478 freqs = map (\t -> termFreq' t groups) terms
479 in fromList $ map (\(t,freq) -> (t,freq/(sum freqs))) $ zip terms freqs
480 -- | 1) for each group process an initial temporal Matching
481 groups :: [PhyloGroup]
482 groups = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
483 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
484 (phylo ^. phylo_timeDocs)
485 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
486
487
488
489 temporalMatching' :: Phylo -> Phylo
490 temporalMatching' phylo = updatePhyloGroups 1 branches' phylo
491 where
492 -- | 5) apply the recursive matching
493 branches' :: Map PhyloGroupId PhyloGroup
494 branches' =
495 let next = trace (" ✓ F(β) = " <> show(quality)
496 <> " |✓ " <> show(length $ fst branches) <> show(map length $ fst branches)
497 <> " |✗ " <> show(length $ snd branches) <> "[" <> show(length $ concat $ snd branches) <> "]")
498 $ map (\branch -> recursiveMatching (phyloProximity $ getConfig phylo)
499 (_qua_relevance $ phyloQuality $ getConfig phylo)
500 (_qua_minBranch $ phyloQuality $ getConfig phylo)
501 (reduceFrequency frequency (fst branches))
502 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
503 + (getThresholdStep $ phyloProximity $ getConfig phylo))
504 (getTimeFrame $ timeUnit $ getConfig phylo)
505 (getPeriodIds phylo)
506 (phylo ^. phylo_timeDocs) quality (alterBorder 0 (fst branches) branch)
507 (toBorderAccuracy frequency (delete branch ((fst branches) ++ (snd branches))))
508 branch
509 ) (fst branches)
510 in fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd (concat (next ++ (snd branches)))
511 -- | 4) process the quality score
512 quality :: Double
513 quality = toPhyloQuality (_qua_relevance $ phyloQuality $ getConfig phylo) frequency 0 0 ((fst branches) ++ (snd branches))
514 -- | 3) process the constants of the quality score
515 frequency :: Map Int Double
516 frequency =
517 let terms = ngramsInBranches ((fst branches) ++ (snd branches))
518 freqs = map (\t -> termFreq' t $ concat ((fst branches) ++ (snd branches))) terms
519 in fromList $ map (\(t,freq) -> (t,freq/(sum freqs))) $ zip terms freqs
520 -- | 2) group into branches
521 branches :: ([[PhyloGroup]],[[PhyloGroup]])
522 branches = partition (\b -> length b >= (_qua_minBranch $ phyloQuality $ getConfig phylo))
523 $ groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
524 -- | 1) for each group process an initial temporal Matching
525 groups' :: [PhyloGroup]
526 groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
527 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
528 (phylo ^. phylo_timeDocs)
529 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)