]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/TemporalMatching.hs
[FEAT][PHYLO] corpusId to Documents function
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
13
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
17 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
18 import Debug.Trace (trace)
19 import Gargantext.Core.Viz.Phylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Prelude
22 import Prelude (floor,tan,pi)
23 import Text.Printf
24 import qualified Data.Map as Map
25 import qualified Data.Set as Set
26 import qualified Data.Vector as Vector
27
28
29 -------------------
30 -- | Proximity | --
31 -------------------
32
33
34 -- | To compute a jaccard similarity between two lists
35 jaccard :: [Int] -> [Int] -> Double
36 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
37
38
39 -- | Process the inverse sumLog
40 sumInvLog' :: Double -> Double -> [Double] -> Double
41 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
42
43
44 -- | Process the sumLog
45 sumLog' :: Double -> Double -> [Double] -> Double
46 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
47
48
49 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
50 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
51 | null ngramsInter = 0
52 | ngramsInter == ngramsUnion = 1
53 | sens == 0 = jaccard ngramsInter ngramsUnion
54 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
55 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
56 where
57 --------------------------------------
58 ngramsInter :: [Int]
59 ngramsInter = intersect ngrams ngrams'
60 --------------------------------------
61 ngramsUnion :: [Int]
62 ngramsUnion = union ngrams ngrams'
63 --------------------------------------
64 diagoInter :: [Double]
65 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
66 --------------------------------------
67 diagoUnion :: [Double]
68 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
69 --------------------------------------
70
71 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
72 -- tests not conclusive
73 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
74 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
75 | null ngramsInter = 0
76 | ngramsInter == ngramsUnion = 1
77 | sens == 0 = jaccard ngramsInter ngramsUnion
78 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
79 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
80 where
81 --------------------------------------
82 ngramsInter :: [Int]
83 ngramsInter = intersect ego_ngrams target_ngrams
84 --------------------------------------
85 ngramsUnion :: [Int]
86 ngramsUnion = union ego_ngrams target_ngrams
87 --------------------------------------
88 diagoInter :: [Double]
89 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
90 --------------------------------------
91 diagoEgo :: [Double]
92 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
93 --------------------------------------
94 diagoTarget :: [Double]
95 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
96 --------------------------------------
97
98 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
99 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
100 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
101 case proximity of
102 WeightedLogJaccard sens ->
103 let pairNgrams = if targetNgrams == targetNgrams'
104 then targetNgrams
105 else union targetNgrams targetNgrams'
106 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
107 WeightedLogSim sens ->
108 let pairNgrams = if targetNgrams == targetNgrams'
109 then targetNgrams
110 else union targetNgrams targetNgrams'
111 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
112 Hamming -> undefined
113
114 ------------------------
115 -- | Local Matching | --
116 ------------------------
117
118 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
119 findLastPeriod fil periods = case fil of
120 ToParents -> head' "findLastPeriod" (sortOn fst periods)
121 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
122
123
124 -- | To filter pairs of candidates related to old pointers periods
125 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
126 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
127 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
128 removeOldPointers oldPointers fil thr prox prd pairs
129 | null oldPointers = pairs
130 | null (filterPointers prox thr oldPointers) =
131 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
132 in if lastMatchedPrd == prd
133 then []
134 else filter (\((id,_),(id',_)) ->
135 case fil of
136 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
137 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
138 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
139 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
140 | otherwise = []
141
142
143 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
144 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
145 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
146 if (null periods)
147 then []
148 else removeOldPointers oldPointers fil thr prox lastPrd
149 {- at least on of the pair candidates should be from the last added period -}
150 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
151 $ listToKeys
152 $ filter (\(id,ngrams) ->
153 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
154 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
155 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
156 ) candidates
157 where
158 lastPrd :: PhyloPeriodId
159 lastPrd = findLastPeriod fil periods
160
161
162 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
163 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
164
165 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
166 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
167
168
169 reduceDiagos :: Map Date Cooc -> Map Int Double
170 reduceDiagos diagos = mapKeys (\(k,_) -> k)
171 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
172
173 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
174 filterPointersByPeriod fil pts =
175 let pts' = sortOn (fst . fst . fst . fst) pts
176 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
177 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
178 in map fst
179 $ nubBy (\pt pt' -> snd pt == snd pt')
180 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
181 $ case fil of
182 ToParents -> reverse pts'
183 ToChilds -> pts'
184
185 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
186 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
187 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
188 if (null $ filterPointers proxi thr oldPointers)
189 {- let's find new pointers -}
190 then if null nextPointers
191 then []
192 else filterPointersByPeriod fil
193 $ head' "phyloGroupMatching"
194 -- Keep only the best set of pointers grouped by proximity
195 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
196 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
197 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
198 else oldPointers
199 where
200 nextPointers :: [[(Pointer,[Int])]]
201 nextPointers = take 1
202 $ dropWhile (null)
203 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
204 $ scanl (\acc groups ->
205 let periods = nub $ map (fst . fst . fst) $ concat groups
206 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
207 diago = reduceDiagos
208 $ filterDiago diagos ([(fst . fst) id] ++ periods)
209 {- important resize nbdocs et diago dans le make pairs -}
210 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
211 in acc ++ ( filterPointers' proxi thr
212 $ concat
213 $ map (\(c,c') ->
214 {- process the proximity between the current group and a pair of candidates -}
215 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
216 in if ((c == c') || (snd c == snd c'))
217 then [((fst c,proximity),snd c)]
218 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
219 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
220
221
222 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
223 filterDocs d pds = restrictKeys d $ periodsToYears pds
224
225 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
226 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
227
228
229 -----------------------------
230 -- | Matching Processing | --
231 -----------------------------
232
233
234 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
235 getNextPeriods fil max' pId pIds =
236 case fil of
237 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
238 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
239
240
241 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
242 getCandidates ego targets =
243 map (\groups' ->
244 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
245 ) groups') targets
246
247
248 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
249 matchGroupsToGroups frame periods proximity thr docs coocs groups =
250 let groups' = groupByField _phylo_groupPeriod groups
251 in foldl' (\acc prd ->
252 let -- 1) find the parents/childs matching periods
253 periodsPar = getNextPeriods ToParents frame prd periods
254 periodsChi = getNextPeriods ToChilds frame prd periods
255 -- 2) find the parents/childs matching candidates
256 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
257 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
258 -- 3) find the parents/child number of docs by years
259 docsPar = filterDocs docs ([prd] ++ periodsPar)
260 docsChi = filterDocs docs ([prd] ++ periodsChi)
261 -- 4) find the parents/child diago by years
262 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
263 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
264 -- 5) match in parallel all the groups (egos) to their possible candidates
265 egos = map (\ego ->
266 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
267 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
268 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
269 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
270 in addPointers ToChilds TemporalPointer pointersChi
271 $ addPointers ToParents TemporalPointer pointersPar ego)
272 $ findWithDefault [] prd groups'
273 egos' = egos `using` parList rdeepseq
274 in acc ++ egos'
275 ) [] periods
276
277
278 -----------------------
279 -- | Phylo Quality | --
280 -----------------------
281
282
283 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
284 relevantBranches term branches =
285 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
286
287 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
288 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
289 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
290 / (fromIntegral $ length bk'))
291 where
292 bk' :: [PhyloGroup]
293 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
294
295 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
296 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
297 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
298
299 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
300 fScore lambda x periods bk bx =
301 let rec = recall x bk bx
302 acc = accuracy x periods bk
303 in ((1 + lambda ** 2) * acc * rec)
304 / (((lambda ** 2) * acc + rec))
305
306
307 wk :: [PhyloGroup] -> Double
308 wk bk = fromIntegral $ length bk
309
310
311 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
312 toPhyloQuality' lambda freq branches =
313 if (null branches)
314 then 0
315 else sum
316 $ map (\i ->
317 let bks = relevantBranches i branches
318 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
319 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore lambda i periods bk bks)) bks))
320 $ keys freq
321
322 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
323 toRecall freq branches =
324 if (null branches)
325 then 0
326 else sum
327 $ map (\x ->
328 let px = freq ! x
329 bx = relevantBranches x branches
330 wks = sum $ map wk bx
331 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
332 $ keys freq
333 where
334 pys :: Double
335 pys = sum (elems freq)
336
337
338 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
339 toAccuracy freq branches =
340 if (null branches)
341 then 0
342 else sum
343 $ map (\x ->
344 let px = freq ! x
345 bx = relevantBranches x branches
346 -- | periods containing x
347 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
348 wks = sum $ map wk bx
349 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
350 $ keys freq
351 where
352 pys :: Double
353 pys = sum (elems freq)
354
355
356 -- | here we do the average of all the local f_scores
357 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
358 toPhyloQuality fdt lambda freq branches =
359 if (null branches)
360 then 0
361 else sum
362 $ map (\x ->
363 -- let px = freq ! x
364 let bx = relevantBranches x branches
365 -- | periods containing x
366 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
367 wks = sum $ map wk bx
368 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
369 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
370 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
371 $ keys freq
372 -- where
373 -- pys :: Double
374 -- pys = sum (elems freq)
375
376 -- 1 / nb de foundation
377
378 ------------------------------------
379 -- | Constant Temporal Matching | --
380 ------------------------------------
381
382
383 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
384 groupsToBranches' groups =
385 {- run the related component algorithm -}
386 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
387 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
388 $ map (\group -> [getGroupId group]
389 ++ (map fst $ group ^. phylo_groupPeriodParents)
390 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
391 -- first find the related components by inside each ego's period
392 -- a supprimer
393 graph' = map relatedComponents egos
394 -- then run it for the all the periods
395 graph = zip [1..]
396 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
397 -- update each group's branch id
398 in map (\(bId,ids) ->
399 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
400 $ elems $ restrictKeys groups (Set.fromList ids)
401 in groups' `using` parList rdeepseq ) graph
402
403
404 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
405 reduceFrequency frequency branches =
406 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
407
408 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
409 updateThr thr branches = map (\b -> map (\g ->
410 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
411
412
413 -- Sequentially break each branch of a phylo where
414 -- done = all the allready broken branches
415 -- ego = the current branch we want to break
416 -- rest = the branches we still have to break
417 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
418 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
419 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
420 -- 1) keep or not the new division of ego
421 let done' = done ++ (if snd ego
422 then
423 (if ((null (fst ego')) || (quality > quality'))
424 then
425 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
426 -- <> " | " <> show(length $ fst ego) <> " groups : "
427 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
428 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
429 [(fst ego,False)]
430 else
431 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
432 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
433 -- <> " | " <> show(length $ fst ego) <> " groups : "
434 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
435 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
436 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
437 else [ego])
438 in
439 -- 2) if there is no more branches in rest then return else continue
440 if null rest
441 then done'
442 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
443 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
444 where
445 --------------------------------------
446 quality :: Double
447 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
448 --------------------------------------
449 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
450 ego' =
451 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
452 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
453 branches' = branches `using` parList rdeepseq
454 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
455 $ thrToMeta thr
456 $ depthToMeta (elevation - depth) branches'
457 --------------------------------------
458 quality' :: Double
459 quality' = toPhyloQuality fdt lambda frequency
460 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
461
462
463 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
464 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
465 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
466 -- if there is no branch to break or if seaLvl level > 1 then end
467 if (thr >= 1) || ((not . or) $ map snd branches)
468 then branches
469 else
470 -- break all the possible branches at the current seaLvl level
471 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
472 acc = toAccuracy frequency (map fst branches)
473 rec = toRecall frequency (map fst branches)
474 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
475 <> " ξ = " <> printf "%.5f" acc
476 <> " ρ = " <> printf "%.5f" rec
477 <> " branches = " <> show(length branches) <> " ↴")
478 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
479 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
480 frequency' = reduceFrequency frequency (map fst branches')
481 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
482
483
484 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
485 constanteTemporalMatching start step phylo = updatePhyloGroups 1
486 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
487 (toPhyloHorizon phylo)
488 where
489 -- 2) process the temporal matching by elevating seaLvl level
490 branches :: [[PhyloGroup]]
491 branches = map fst
492 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
493 (phyloProximity $ getConfig phylo)
494 (_qua_granularity $ phyloQuality $ getConfig phylo)
495 (_qua_minBranch $ phyloQuality $ getConfig phylo)
496 (phylo ^. phylo_termFreq)
497 start step
498 ((((1 - start) / step) - 1))
499 (((1 - start) / step))
500 (getTimeFrame $ timeUnit $ getConfig phylo)
501 (getPeriodIds phylo)
502 (phylo ^. phylo_timeDocs)
503 (phylo ^. phylo_timeCooc)
504 (reverse $ sortOn (length . fst) groups)
505 -- 1) for each group process an initial temporal Matching
506 -- here we suppose that all the groups of level 1 are part of the same big branch
507 groups :: [([PhyloGroup],Bool)]
508 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
509 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
510 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
511 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
512 start
513 (phylo ^. phylo_timeDocs)
514 (phylo ^. phylo_timeCooc)
515 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
516
517 -----------------
518 -- | Horizon | --
519 -----------------
520
521 toPhyloHorizon :: Phylo -> Phylo
522 toPhyloHorizon phylo =
523 let t0 = take 1 (getPeriodIds phylo)
524 groups = getGroupsFromLevelPeriods 1 t0 phylo
525 sens = getSensibility (phyloProximity $ getConfig phylo)
526 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
527 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
528 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
529 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
530
531
532 --------------------------------------
533 -- | Adaptative Temporal Matching | --
534 --------------------------------------
535
536
537 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
538 thrToMeta thr branches =
539 map (\b ->
540 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
541
542 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
543 depthToMeta depth branches =
544 let break = length branches > 1
545 in map (\b ->
546 map (\g ->
547 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
548 else g) b) branches
549
550 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
551 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
552
553
554 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
555 getInTupleMap m k k'
556 | isJust (m !? ( k ,k')) = m ! ( k ,k')
557 | isJust (m !? ( k',k )) = m ! ( k',k )
558 | otherwise = 0
559
560
561 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
562 toThreshold lvl proxiGroups =
563 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
564 in if idx >= 0
565 then (sort $ elems proxiGroups) !! idx
566 else 1
567
568
569 -- done = all the allready broken branches
570 -- ego = the current branch we want to break
571 -- rest = the branches we still have to break
572 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
573 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
574 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
575 -> [([PhyloGroup],(Bool,[Double]))]
576 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
577 -- 1) keep or not the new division of ego
578 let done' = done ++ (if (fst . snd) ego
579 then (if ((null (fst ego')) || (quality > quality'))
580 then
581 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
582 else
583 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
584 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
585 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
586 in
587 -- uncomment let .. in for debugging
588 -- let part1 = partition (snd) done'
589 -- part2 = partition (snd) rest
590 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
591 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
592 -- ) $
593 -- 2) if there is no more branches in rest then return else continue
594 if null rest
595 then done'
596 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
597 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
598 where
599 --------------------------------------
600 thr :: Double
601 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
602 --------------------------------------
603 quality :: Double
604 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
605 --------------------------------------
606 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
607 ego' =
608 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
609 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
610 branches' = branches `using` parList rdeepseq
611 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
612 $ thrToMeta thr
613 $ depthToMeta (elevation - depth) branches'
614 --------------------------------------
615 quality' :: Double
616 quality' = toPhyloQuality fdt lambda frequency
617 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
618
619
620 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
621 -> Double -> Int -> Map Int Double
622 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
623 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
624 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
625 -- if there is no branch to break or if seaLvl level >= depth then end
626 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
627 then branches
628 else
629 -- break all the possible branches at the current seaLvl level
630 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
631 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
632 frequency' = reduceFrequency frequency (map fst branches')
633 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
634 -- thr = toThreshold depth groupsProxi
635 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
636 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
637 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
638 <> " thr = ")
639 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
640
641
642 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
643 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
644 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
645 (toPhyloHorizon phylo)
646 where
647 -- 2) process the temporal matching by elevating seaLvl level
648 branches :: [[PhyloGroup]]
649 branches = map fst
650 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
651 (phyloProximity $ getConfig phylo)
652 (elevation - 1)
653 elevation
654 (phylo ^. phylo_groupsProxi)
655 (_qua_granularity $ phyloQuality $ getConfig phylo)
656 (_qua_minBranch $ phyloQuality $ getConfig phylo)
657 (phylo ^. phylo_termFreq)
658 (getTimeFrame $ timeUnit $ getConfig phylo)
659 (getPeriodIds phylo)
660 (phylo ^. phylo_timeDocs)
661 (phylo ^. phylo_timeCooc)
662 groups
663 -- 1) for each group process an initial temporal Matching
664 -- here we suppose that all the groups of level 1 are part of the same big branch
665 groups :: [([PhyloGroup],(Bool,[Double]))]
666 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
667 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
668 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
669 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
670 thr
671 (phylo ^. phylo_timeDocs)
672 (phylo ^. phylo_timeCooc)
673 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
674 --------------------------------------
675 thr :: Double
676 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)