]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
fix the freq
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, delete, or)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 -- import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 import Debug.Trace (trace)
29
30 import qualified Data.Set as Set
31
32
33 -------------------
34 -- | Proximity | --
35 -------------------
36
37
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41
42
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46
47
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51
52
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 coocInter :: [Double]
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 -- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
72 --------------------------------------
73 coocUnion :: [Double]
74 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
75 --------------------------------------
76
77
78 -- | To choose a proximity function
79 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
80 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
81 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
82 Hamming -> undefined
83
84
85 -- | To process the proximity between a current group and a pair of targets group
86 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
87 toProximity docs proximity ego target target' =
88 let docs' = sum $ elems docs
89 cooc = if target == target'
90 then (target ^. phylo_groupCooc)
91 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
92 ngrams = if target == target'
93 then (target ^. phylo_groupNgrams)
94 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
95 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
96
97
98 ------------------------
99 -- | Local Matching | --
100 ------------------------
101
102 toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
103 toLastPeriod fil periods = case fil of
104 ToParents -> head' "toLastPeriod" (sortOn fst periods)
105 ToChilds -> last' "toLastPeriod" (sortOn fst periods)
106
107
108 toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
109 toLazyPairs pointers fil thr prox prd pairs =
110 if null pointers then pairs
111 else let rest = filterPointers prox thr pointers
112 in if null rest
113 then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
114 in if prd' == prd
115 then []
116 else filter (\(g,g') ->
117 case fil of
118 ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
119 || ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
120 ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
121 || ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
122 else []
123
124
125 -- | Find pairs of valuable candidates to be matched
126 makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
127 makePairs' ego candidates periods pointers fil thr prox docs =
128 case null periods of
129 True -> []
130 False -> toLazyPairs pointers fil thr prox lastPrd
131 -- | at least on of the pair candidates should be from the last added period
132 $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
133 || ((g' ^. phylo_groupPeriod) == lastPrd))
134 $ listToKeys
135 $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
136 || ((toProximity docs prox ego ego g) >= thr)) candidates
137 where
138 lastPrd :: PhyloPeriodId
139 lastPrd = toLastPeriod fil periods
140
141
142 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
143 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
144
145
146 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
147 phyloGroupMatching candidates fil proxi docs thr ego =
148 case null nextPointers of
149 -- | let's find new pointers
150 True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
151 then addPointers ego fil TemporalPointer []
152 -- | or keep the old ones
153 else addPointers ego fil TemporalPointer
154 $ filterPointers proxi thr $ getPeriodPointers fil ego
155 False -> addPointers ego fil TemporalPointer
156 $ head' "phyloGroupMatching"
157 -- | Keep only the best set of pointers grouped by proximity
158 $ groupBy (\pt pt' -> snd pt == snd pt')
159 $ reverse $ sortOn snd $ head' "pointers"
160 $ nextPointers
161 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
162 where
163 nextPointers :: [[Pointer]]
164 nextPointers = take 1
165 $ dropWhile (null)
166 -- | for each time frame, process the proximity on relevant pairs of targeted groups
167 $ scanl (\acc groups ->
168 let periods = nub $ map _phylo_groupPeriod $ concat groups
169 docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
170 pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
171 in acc ++ ( filterPointers proxi thr
172 $ concat
173 $ map (\(c,c') ->
174 -- | process the proximity between the current group and a pair of candidates
175 let proximity = toProximity docs' proxi ego c c'
176 in if (c == c')
177 then [(getGroupId c,proximity)]
178 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
179 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
180
181
182 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
183 filterDocs d pds = restrictKeys d $ periodsToYears pds
184
185
186 -----------------------------
187 -- | Matching Processing | --
188 -----------------------------
189
190
191 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
192 getNextPeriods fil max' pId pIds =
193 case fil of
194 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
195 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
196
197
198 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
199 getCandidates fil ego targets =
200 case fil of
201 ToChilds -> targets'
202 ToParents -> reverse targets'
203 where
204 targets' :: [[PhyloGroup]]
205 targets' =
206 map (\groups' ->
207 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
208 ) groups') targets
209
210
211 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
212 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
213 $ matchByPeriods
214 $ groupByField _phylo_groupPeriod branch
215 where
216 --------------------------------------
217 matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
218 matchByPeriods branch' = foldl' (\acc prd ->
219 let periodsPar = getNextPeriods ToParents frame prd periods
220 periodsChi = getNextPeriods ToChilds frame prd periods
221 candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
222 candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
223 docsPar = filterDocs docs ([prd] ++ periodsPar)
224 docsChi = filterDocs docs ([prd] ++ periodsChi)
225 egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
226 $ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
227 $ findWithDefault [] prd branch'
228 egos' = egos `using` parList rdeepseq
229 in acc ++ egos' ) [] periods
230
231
232 -----------------------
233 -- | Phylo Quality | --
234 -----------------------
235
236
237 count :: Eq a => a -> [a] -> Int
238 count x = length . filter (== x)
239
240 termFreq' :: Int -> [PhyloGroup] -> Double
241 termFreq' term groups =
242 let ngrams = concat $ map _phylo_groupNgrams groups
243 in log ((fromIntegral $ count term ngrams)
244 / (fromIntegral $ length ngrams))
245
246 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
247 relevantBranches term branches =
248 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
249
250 branchCov' :: [PhyloGroup] -> [[PhyloGroup]] -> Double
251 branchCov' branch branches =
252 (fromIntegral $ length branch) / (fromIntegral $ length $ concat branches)
253
254
255 toRecall :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
256 toRecall freq term border branches =
257 -- | given a random term in a phylo
258 freq
259 -- | for each relevant branches
260 * (sum $ map (\branch ->
261 -- | given its local coverage
262 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
263 -- | compute the local recall
264 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
265 / ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) $ concat branches')
266 -- | with a ponderation from border branches
267 + (fromIntegral border)) )) branches')
268 where
269 branches' :: [[PhyloGroup]]
270 branches' = relevantBranches term branches
271
272
273 toAccuracy :: Double -> Int -> [[PhyloGroup]] -> Double
274 toAccuracy freq term branches =
275 if (null branches)
276 then 0
277 else
278 -- | given a random term in a phylo
279 freq
280 -- | for each relevant branches
281 * (sum $ map (\branch ->
282 -- | given its local coverage
283 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
284 -- | compute the local accuracy
285 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
286 / (fromIntegral $ length branch))) branches')
287 where
288 branches' :: [[PhyloGroup]]
289 branches' = relevantBranches term branches
290
291
292 fScore :: Double -> Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
293 fScore beta i bk bks =
294 let recall = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
295 / (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks))
296 accuracy = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
297 / (fromIntegral $ length bk))
298 in ((1 + beta ** 2) * accuracy * recall)
299 / (((beta ** 2) * accuracy + recall))
300
301
302 wk :: [PhyloGroup] -> Double
303 wk bk = fromIntegral $ length bk
304
305
306 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
307 toPhyloQuality' beta freq branches =
308 if (null branches)
309 then 0
310 else sum
311 $ map (\i ->
312 let bks = relevantBranches i branches
313 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i bk bks)) bks))
314 $ keys freq
315
316
317
318 toPhyloQuality :: Double -> Map Int Double -> Int -> Double -> [[PhyloGroup]] -> Double
319 toPhyloQuality beta frequency border oldAcc branches =
320 -- trace (" rec : " <> show(recall)) $
321 -- trace (" acc : " <> show(accuracy)) $
322 if (null branches)
323 then 0
324 else ((1 + beta ** 2) * accuracy * recall)
325 / (((beta ** 2) * accuracy + recall))
326 where
327 -- | for each term compute the global accuracy
328 accuracy :: Double
329 accuracy = oldAcc + (sum $ map (\term -> toAccuracy (frequency ! term) term branches) $ keys frequency)
330 -- | for each term compute the global recall
331 recall :: Double
332 recall = sum $ map (\term -> toRecall (frequency ! term) term border branches) $ keys frequency
333
334
335 toBorderAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
336 toBorderAccuracy freq branches = sum $ map (\t -> toAccuracy (freq ! t) t branches) $ keys freq
337
338
339 -----------------------------
340 -- | Adaptative Matching | --
341 -----------------------------
342
343
344 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
345 groupsToBranches groups =
346 -- | run the related component algorithm
347 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
348 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
349 $ map (\group -> [getGroupId group]
350 ++ (map fst $ group ^. phylo_groupPeriodParents)
351 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
352 -- | first find the related components by inside each ego's period
353 graph' = map relatedComponents egos
354 -- | then run it for the all the periods
355 graph = zip [1..]
356 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
357 -- | update each group's branch id
358 in map (\(bId,ids) ->
359 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
360 $ elems $ restrictKeys groups (Set.fromList ids)
361 in groups' `using` parList rdeepseq ) graph
362
363
364 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
365 reduceFrequency frequency branches =
366 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
367
368
369 alterBorder :: Int -> [[PhyloGroup]] -> [PhyloGroup] -> Int
370 alterBorder border branches branch = border + (length $ concat branches) - (length branch)
371
372
373 -- | Important ne pas virer les filtree mais les mettre en false
374
375 seqMatching :: Proximity -> Double -> Map Int Double -> Int -> Double -> Int -> Map Date Double -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
376 seqMatching proximity beta frequency minBranch egoThr frame docs periods done ego rest =
377 -- | 1) keep or not the new division of ego
378 let done' = done ++ (if snd ego
379 then (if ((null (fst ego')) || (quality > quality'))
380 then trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
381 <> " | " <> show(length $ fst ego) <> " groups : "
382 <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
383 <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
384 $ [(fst ego,False)]
385 else trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
386 <> " | " <> show(length $ fst ego) <> " groups : "
387 <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
388 <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
389 $ ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
390 else [ego])
391 in
392 -- | 2) if there is no more branches in rest then return else continue
393 if null rest
394 then done'
395 else seqMatching proximity beta frequency minBranch egoThr frame docs periods
396 done' (head' "seqMatching" rest) (tail' "seqMatching" rest)
397 where
398 --------------------------------------
399 quality :: Double
400 quality = toPhyloQuality' beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
401 --------------------------------------
402 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
403 ego' =
404 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
405 $ phyloBranchMatching frame periods proximity egoThr docs (fst ego)
406 branches' = branches `using` parList rdeepseq
407 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch) branches'
408 --------------------------------------
409 quality' :: Double
410 quality' = toPhyloQuality' beta frequency
411 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
412
413
414 recursiveMatching' :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
415 recursiveMatching' proximity beta minBranch frequency egoThr frame periods docs branches =
416 if (egoThr >= 1) || ((not . or) $ map snd branches)
417 then branches
418 else
419 let branches' = seqMatching proximity beta frequency minBranch egoThr frame docs periods
420 [] (head' "recursiveMatching" branches) (tail' "recursiveMatching" branches)
421 frequency' = reduceFrequency frequency (map fst branches')
422 in recursiveMatching' proximity beta minBranch frequency' (egoThr + (getThresholdStep proximity)) frame periods docs branches'
423
424
425 recursiveMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> Int -> Double -> [PhyloGroup] -> [PhyloGroup]
426 recursiveMatching proximity beta minBranch frequency egoThr frame periods docs quality border oldAcc groups =
427 if ((egoThr >= 1) || (quality > quality') || ((length $ concat $ snd branches') == (length groups)))
428 then
429 trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
430 <> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
431 <> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
432 groups
433 else
434 let next = map (\b -> recursiveMatching proximity beta minBranch
435 (reduceFrequency frequency (fst branches'))
436 (egoThr + (getThresholdStep proximity))
437 frame periods docs quality'
438 (alterBorder border (fst branches') b)
439 (oldAcc + (toBorderAccuracy frequency (delete b ((fst branches') ++ (snd branches')))))
440 b ) (fst branches')
441 in trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
442 <> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
443 <> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
444 concat (next ++ (snd branches'))
445 where
446 -- | 2) for each of the possible next branches process the phyloQuality score
447 quality' :: Double
448 quality' = toPhyloQuality beta frequency border oldAcc ((fst branches') ++ (snd branches'))
449 -- | 1) for each local branch process a temporal matching then find the resulting branches
450 branches' :: ([[PhyloGroup]],[[PhyloGroup]])
451 branches' =
452 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
453 $ phyloBranchMatching frame periods proximity egoThr docs groups
454 in partition (\b -> length b >= minBranch) (branches `using` parList rdeepseq)
455
456
457 temporalMatching :: Phylo -> Phylo
458 temporalMatching phylo = updatePhyloGroups 1
459 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
460 phylo
461 where
462 -- | 2) init the recursiveMatching
463 branches :: [[PhyloGroup]]
464 branches = map fst
465 $ recursiveMatching' (phyloProximity $ getConfig phylo)
466 (_qua_relevance $ phyloQuality $ getConfig phylo)
467 (_qua_minBranch $ phyloQuality $ getConfig phylo)
468 (phylo ^. phylo_termFreq)
469 (getThresholdInit $ phyloProximity $ getConfig phylo)
470 (getTimeFrame $ timeUnit $ getConfig phylo)
471 (getPeriodIds phylo)
472 (phylo ^. phylo_timeDocs)
473 [(groups,True)]
474 -- | 1) for each group process an initial temporal Matching
475 groups :: [PhyloGroup]
476 groups = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
477 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
478 (phylo ^. phylo_timeDocs)
479 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
480
481
482
483 temporalMatching' :: Phylo -> Phylo
484 temporalMatching' phylo = updatePhyloGroups 1 branches' phylo
485 where
486 -- | 5) apply the recursive matching
487 branches' :: Map PhyloGroupId PhyloGroup
488 branches' =
489 let next = trace (" ✓ F(β) = " <> show(quality)
490 <> " |✓ " <> show(length $ fst branches) <> show(map length $ fst branches)
491 <> " |✗ " <> show(length $ snd branches) <> "[" <> show(length $ concat $ snd branches) <> "]")
492 $ map (\branch -> recursiveMatching (phyloProximity $ getConfig phylo)
493 (_qua_relevance $ phyloQuality $ getConfig phylo)
494 (_qua_minBranch $ phyloQuality $ getConfig phylo)
495 (reduceFrequency frequency (fst branches))
496 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
497 + (getThresholdStep $ phyloProximity $ getConfig phylo))
498 (getTimeFrame $ timeUnit $ getConfig phylo)
499 (getPeriodIds phylo)
500 (phylo ^. phylo_timeDocs) quality (alterBorder 0 (fst branches) branch)
501 (toBorderAccuracy frequency (delete branch ((fst branches) ++ (snd branches))))
502 branch
503 ) (fst branches)
504 in fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd (concat (next ++ (snd branches)))
505 -- | 4) process the quality score
506 quality :: Double
507 quality = toPhyloQuality (_qua_relevance $ phyloQuality $ getConfig phylo) frequency 0 0 ((fst branches) ++ (snd branches))
508 -- | 3) process the constants of the quality score
509 frequency :: Map Int Double
510 frequency =
511 let terms = ngramsInBranches ((fst branches) ++ (snd branches))
512 freqs = map (\t -> termFreq' t $ concat ((fst branches) ++ (snd branches))) terms
513 in fromList $ map (\(t,freq) -> (t,freq/(sum freqs))) $ zip terms freqs
514 -- | 2) group into branches
515 branches :: ([[PhyloGroup]],[[PhyloGroup]])
516 branches = partition (\b -> length b >= (_qua_minBranch $ phyloQuality $ getConfig phylo))
517 $ groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
518 -- | 1) for each group process an initial temporal Matching
519 groups' :: [PhyloGroup]
520 groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
521 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
522 (phylo ^. phylo_timeDocs)
523 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)