]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/TemporalMatching.hs
[MERGE]
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
13
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.Ord
17 import Data.List (concat, splitAt, tail, sortOn, sortBy, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
18 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
19 import Debug.Trace (trace)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloTools
22 import Gargantext.Prelude
23 import Prelude (floor,tan,pi)
24 import Text.Printf
25 import qualified Data.Map as Map
26 import qualified Data.Set as Set
27 import qualified Data.Vector as Vector
28
29
30 -------------------
31 -- | Proximity | --
32 -------------------
33
34
35 -- | To compute a jaccard similarity between two lists
36 jaccard :: [Int] -> [Int] -> Double
37 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
38
39
40 -- | Process the inverse sumLog
41 sumInvLog' :: Double -> Double -> [Double] -> Double
42 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
43
44
45 -- | Process the sumLog
46 sumLog' :: Double -> Double -> [Double] -> Double
47 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
48
49
50 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
51 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
52 | null ngramsInter = 0
53 | ngramsInter == ngramsUnion = 1
54 | sens == 0 = jaccard ngramsInter ngramsUnion
55 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
56 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
57 where
58 --------------------------------------
59 ngramsInter :: [Int]
60 ngramsInter = intersect ngrams ngrams'
61 --------------------------------------
62 ngramsUnion :: [Int]
63 ngramsUnion = union ngrams ngrams'
64 --------------------------------------
65 diagoInter :: [Double]
66 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
67 --------------------------------------
68 diagoUnion :: [Double]
69 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
70 --------------------------------------
71
72 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
73 -- tests not conclusive
74 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
75 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
76 | null ngramsInter = 0
77 | ngramsInter == ngramsUnion = 1
78 | sens == 0 = jaccard ngramsInter ngramsUnion
79 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
80 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
81 where
82 --------------------------------------
83 ngramsInter :: [Int]
84 ngramsInter = intersect ego_ngrams target_ngrams
85 --------------------------------------
86 ngramsUnion :: [Int]
87 ngramsUnion = union ego_ngrams target_ngrams
88 --------------------------------------
89 diagoInter :: [Double]
90 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
91 --------------------------------------
92 diagoEgo :: [Double]
93 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
94 --------------------------------------
95 diagoTarget :: [Double]
96 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
97 --------------------------------------
98
99 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
100 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
101 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
102 case proximity of
103 WeightedLogJaccard sens _ ->
104 let pairNgrams = if targetNgrams == targetNgrams'
105 then targetNgrams
106 else union targetNgrams targetNgrams'
107 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
108 WeightedLogSim sens _ ->
109 let pairNgrams = if targetNgrams == targetNgrams'
110 then targetNgrams
111 else union targetNgrams targetNgrams'
112 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
113 Hamming _ _ -> undefined
114
115 ------------------------
116 -- | Local Matching | --
117 ------------------------
118
119 findLastPeriod :: Filiation -> [Period] -> Period
120 findLastPeriod fil periods = case fil of
121 ToParents -> head' "findLastPeriod" (sortOn fst periods)
122 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
123 ToChildsMemory -> undefined
124 ToParentsMemory -> undefined
125
126
127 -- | To filter pairs of candidates related to old pointers periods
128 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> Period
129 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
130 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
131 removeOldPointers oldPointers fil thr prox prd pairs
132 | null oldPointers = pairs
133 | null (filterPointers prox thr oldPointers) =
134 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
135 in if lastMatchedPrd == prd
136 then []
137 else filter (\((id,_),(id',_)) ->
138 case fil of
139 ToChildsMemory -> undefined
140 ToParentsMemory -> undefined
141 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
142 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
143 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
144 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
145 | otherwise = []
146
147
148
149 makePairs :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [Period] -> [Pointer] -> Filiation -> Double -> Proximity
150 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
151 makePairs (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
152 if (null periods)
153 then []
154 else removeOldPointers oldPointers fil thr prox lastPrd
155 {- at least on of the pair candidates should be from the last added period -}
156 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
157 $ filter (\((id,_),(id',_)) -> (elem id inPairs) || (elem id' inPairs))
158 $ listToCombi' candidates
159 where
160 --------------------------------------
161 inPairs :: [PhyloGroupId]
162 inPairs = map fst
163 $ filter (\(id,ngrams) ->
164 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
165 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
166 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
167 ) candidates
168 --------------------------------------
169 lastPrd :: Period
170 lastPrd = findLastPeriod fil periods
171 --------------------------------------
172
173
174 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
175 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
176
177 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
178 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
179
180
181 reduceDiagos :: Map Date Cooc -> Map Int Double
182 reduceDiagos diagos = mapKeys (\(k,_) -> k)
183 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
184
185 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
186 filterPointersByPeriod fil pts =
187 let pts' = sortOn (fst . fst . fst . fst) pts
188 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
189 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
190 in map fst
191 $ nubBy (\pt pt' -> snd pt == snd pt')
192 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
193 $ case fil of
194 ToParents -> reverse pts'
195 ToChilds -> pts'
196 ToChildsMemory -> undefined
197 ToParentsMemory -> undefined
198
199
200 phyloGroupMatching' :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
201 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
202 phyloGroupMatching' candidates filiation proxi docs diagos thr oldPointers (id,ngrams) =
203 if (null $ filterPointers proxi thr oldPointers)
204 -- if no previous pointers satisfy the current threshold then let's find new pointers
205 then if null nextPointers
206 then []
207 else filterPointersByPeriod filiation
208 -- 2) keep only the best set of pointers grouped by proximity
209 $ head' "phyloGroupMatching"
210 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
211 -- 1) find the first time frame where at leats one pointer satisfies the proximity threshold
212 $ sortBy (comparing (Down . snd . fst)) $ head' "pointers" nextPointers
213 else oldPointers
214 where
215 nextPointers :: [[(Pointer,[Int])]]
216 nextPointers = take 1
217 -- stop as soon as we find a time frame where at least one singleton / pair satisfies the threshold
218 $ dropWhile (null)
219 -- for each time frame, process the proximity on relevant pairs of targeted groups
220 $ scanl (\acc targets ->
221 let periods = nub $ map (fst . fst . fst) targets
222 lastPrd = findLastPeriod filiation periods
223 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
224 diago = reduceDiagos
225 $ filterDiago diagos ([(fst . fst) id] ++ periods)
226 singletons = processProximity nbdocs diago $ map (\g -> (g,g)) $ filter (\g -> (fst . fst . fst) g == lastPrd) targets
227 pairs = makePairs (id,ngrams) targets periods oldPointers filiation thr proxi docs diagos
228 in
229 if (null singletons)
230 then acc ++ ( processProximity nbdocs diago pairs )
231 else acc ++ singletons
232 ) [] $ map concat $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
233 -----------------------------
234 processProximity :: Double -> Map Int Double -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))] -> [(Pointer,[Int])]
235 processProximity nbdocs diago targets = filterPointers' proxi thr
236 $ concat
237 $ map (\(c,c') ->
238 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
239 in if ((c == c') || (snd c == snd c'))
240 then [((fst c,proximity),snd c)]
241 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) targets
242
243
244 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
245 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
246 phyloGroupMatching candidates filiation proxi docs diagos thr oldPointers (id,ngrams) =
247 if (null $ filterPointers proxi thr oldPointers)
248 {- let's find new pointers -}
249 then if null nextPointers
250 then []
251 else filterPointersByPeriod filiation
252 $ head' "phyloGroupMatching"
253 -- Keep only the best set of pointers grouped by proximity
254 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
255 -- verifier que l on garde bien les plus importants
256 $ sortBy (comparing (Down . snd . fst)) $ head' "pointers" nextPointers
257 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
258 else oldPointers
259 where
260 nextPointers :: [[(Pointer,[Int])]]
261 nextPointers = take 1
262 $ dropWhile (null)
263 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
264 $ scanl (\acc groups ->
265 let periods = nub $ map (fst . fst . fst) $ concat groups
266 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
267 diago = reduceDiagos
268 $ filterDiago diagos ([(fst . fst) id] ++ periods)
269 pairs = makePairs (id,ngrams) (concat groups) periods oldPointers filiation thr proxi docs diagos
270 in acc ++ ( filterPointers' proxi thr
271 $ concat
272 $ map (\(c,c') ->
273 {- process the proximity between the current group and a pair of candidates -}
274 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
275 in if ((c == c') || (snd c == snd c'))
276 then [((fst c,proximity),snd c)]
277 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
278 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
279
280
281 filterDocs :: Map Date Double -> [Period] -> Map Date Double
282 filterDocs d pds = restrictKeys d $ periodsToYears pds
283
284 filterDiago :: Map Date Cooc -> [Period] -> Map Date Cooc
285 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
286
287
288 -----------------------------
289 -- | Matching Processing | --
290 -----------------------------
291
292
293 getNextPeriods :: Filiation -> Int -> Period -> [Period] -> [Period]
294 getNextPeriods fil max' pId pIds =
295 case fil of
296 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
297 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
298 ToChildsMemory -> undefined
299 ToParentsMemory -> undefined
300
301
302 getCandidates :: Int -> PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
303 getCandidates minNgrams ego targets =
304 if (length (ego ^. phylo_groupNgrams)) > 1
305 then
306 map (\groups' -> filter (\g' -> (> minNgrams) $ length $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
307 else
308 map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
309
310
311 matchGroupsToGroups :: Int -> [Period] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
312 matchGroupsToGroups frame periods proximity thr docs coocs groups =
313 let groups' = groupByField _phylo_groupPeriod groups
314 in foldl' (\acc prd ->
315 let -- 1) find the parents/childs matching periods
316 periodsPar = getNextPeriods ToParents frame prd periods
317 periodsChi = getNextPeriods ToChilds frame prd periods
318 -- 2) find the parents/childs matching candidates
319 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
320 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
321 -- 3) find the parents/child number of docs by years
322 docsPar = filterDocs docs ([prd] ++ periodsPar)
323 docsChi = filterDocs docs ([prd] ++ periodsChi)
324 -- 4) find the parents/child diago by years
325 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
326 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
327 -- 5) match in parallel all the groups (egos) to their possible candidates
328 egos = map (\ego ->
329 let pointersPar = phyloGroupMatching' (getCandidates (getMinSharedNgrams proximity) ego candidatesPar) ToParents proximity docsPar diagoPar
330 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
331 pointersChi = phyloGroupMatching' (getCandidates (getMinSharedNgrams proximity) ego candidatesChi) ToChilds proximity docsChi diagoChi
332 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
333 in addPointers ToChilds TemporalPointer pointersChi
334 $ addPointers ToParents TemporalPointer pointersPar
335 $ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
336 $ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego)
337 $ findWithDefault [] prd groups'
338 egos' = egos `using` parList rdeepseq
339 in acc ++ egos'
340 ) [] periods
341
342
343 -----------------------
344 -- | Phylo Quality | --
345 -----------------------
346
347
348 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
349 relevantBranches term branches =
350 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
351
352 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
353 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
354 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
355 / (fromIntegral $ length bk'))
356 where
357 bk' :: [PhyloGroup]
358 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
359
360 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
361 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
362 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
363
364 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
365 fScore lambda x periods bk bx =
366 let rec = recall x bk bx
367 acc = accuracy x periods bk
368 in ((1 + lambda ** 2) * acc * rec)
369 / (((lambda ** 2) * acc + rec))
370
371
372 wk :: [PhyloGroup] -> Double
373 wk bk = fromIntegral $ length bk
374
375 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
376 toRecall freq branches =
377 if (null branches)
378 then 0
379 else sum
380 $ map (\x ->
381 let px = freq ! x
382 bx = relevantBranches x branches
383 wks = sum $ map wk bx
384 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
385 $ keys freq
386 where
387 pys :: Double
388 pys = sum (elems freq)
389
390
391 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
392 toAccuracy freq branches =
393 if (null branches)
394 then 0
395 else sum
396 $ map (\x ->
397 let px = freq ! x
398 bx = relevantBranches x branches
399 -- | periods containing x
400 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
401 wks = sum $ map wk bx
402 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
403 $ keys freq
404 where
405 pys :: Double
406 pys = sum (elems freq)
407
408
409 -- | here we do the average of all the local f_scores
410 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
411 toPhyloQuality fdt lambda freq branches =
412 if (null branches)
413 then 0
414 else sum
415 $ map (\x ->
416 -- let px = freq ! x
417 let bx = relevantBranches x branches
418 -- | periods containing x
419 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
420 wks = sum $ map wk bx
421 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
422 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
423 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
424 $ keys freq
425 -- where
426 -- pys :: Double
427 -- pys = sum (elems freq)
428
429 -- 1 / nb de foundation
430
431 ------------------------------------
432 -- | Constant Temporal Matching | --
433 ------------------------------------
434
435 -- add a branch id within each group
436 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
437 groupsToBranches groups =
438 {- run the related component algorithm -}
439 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
440 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
441 $ map (\group -> [getGroupId group]
442 ++ (map fst $ group ^. phylo_groupPeriodParents)
443 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
444 -- first find the related components by inside each ego's period
445 -- a supprimer
446 graph' = map relatedComponents egos
447 -- then run it for the all the periods
448 branches = zip [1..]
449 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
450 -- update each group's branch id
451 in map (\(bId,branch) ->
452 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
453 $ elems $ restrictKeys groups (Set.fromList branch)
454 in groups' `using` parList rdeepseq
455 ) branches `using` parList rdeepseq
456
457
458 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
459 reduceFrequency frequency branches =
460 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
461
462 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
463 updateThr thr branches = map (\b -> map (\g ->
464 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
465
466
467 -- Sequentially break each branch of a phylo where
468 -- done = all the allready broken branches
469 -- ego = the current branch we want to break
470 -- rest = the branches we still have to break
471 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
472 -> Int -> Map Date Double -> Map Date Cooc -> [Period] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
473 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
474 -- 1) keep or not the new division of ego
475 let done' = done ++ (if snd ego
476 then
477 (if ((null (fst ego')) || (quality > quality'))
478 then
479 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
480 -- <> " | " <> show(length $ fst ego) <> " groups : "
481 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
482 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
483 [(fst ego,False)]
484 else
485 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
486 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
487 -- <> " | " <> show(length $ fst ego) <> " groups : "
488 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
489 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
490 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
491 else [ego])
492 in
493 -- 2) if there is no more branches in rest then return else continue
494 if null rest
495 then done'
496 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
497 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
498 where
499 --------------------------------------
500 quality :: Double
501 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
502 --------------------------------------
503 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
504 ego' =
505 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
506 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
507 branches' = branches `using` parList rdeepseq
508 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
509 $ thrToMeta thr
510 $ depthToMeta (elevation - depth) branches'
511 --------------------------------------
512 quality' :: Double
513 quality' = toPhyloQuality fdt lambda frequency
514 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
515
516
517 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
518 -> Int -> [Period] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> ([([PhyloGroup],Bool)],Double)
519 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
520 -- if there is no branch to break or if seaLvl level > 1 then end
521 if (thr >= 1) || ((not . or) $ map snd branches)
522 then (branches, toPhyloQuality fdt lambda frequency (map fst branches))
523 else
524 -- break all the possible branches at the current seaLvl level
525 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
526 acc = toAccuracy frequency (map fst branches)
527 rec = toRecall frequency (map fst branches)
528 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
529 <> " ξ = " <> printf "%.5f" acc
530 <> " ρ = " <> printf "%.5f" rec
531 <> " branches = " <> show(length branches) <> " ↴")
532 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
533 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
534 frequency' = reduceFrequency frequency (map fst branches')
535 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
536
537
538 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
539 constanteTemporalMatching start step phylo = updatePhyloGroups 1
540 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat (map fst $ (fst branches)))
541 (toPhyloHorizon (updateQuality (snd branches) phylo))
542 where
543 -- 2) process the temporal matching by elevating seaLvl level
544 -- branches :: ([([groups in the same branch],should westill break the branch?)],final quality)
545 branches :: ([([PhyloGroup],Bool)],Double)
546 branches = seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
547 (phyloProximity $ getConfig phylo)
548 (_qua_granularity $ phyloQuality $ getConfig phylo)
549 (_qua_minBranch $ phyloQuality $ getConfig phylo)
550 (phylo ^. phylo_termFreq)
551 start step
552 ((((1 - start) / step) - 1))
553 (((1 - start) / step))
554 (getTimeFrame $ timeUnit $ getConfig phylo)
555 (getPeriodIds phylo)
556 (phylo ^. phylo_timeDocs)
557 (phylo ^. phylo_timeCooc)
558 (reverse $ sortOn (length . fst) initBranches)
559 -- 1) for each group process an initial temporal Matching
560 -- here we suppose that all the groups of level 1 are part of the same big branch
561 -- the Bool param determines weither you should apply the sealevel within the branch
562 -- creer un type [PhyloGroup] <=> Branch
563 initBranches :: [([PhyloGroup],Bool)]
564 initBranches = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
565 $ groupsToBranches $ Map.fromList $ map (\g -> (getGroupId g, g))
566 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
567 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
568 start
569 (phylo ^. phylo_timeDocs)
570 (phylo ^. phylo_timeCooc)
571 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
572
573 -----------------
574 -- | Horizon | --
575 -----------------
576
577 toPhyloHorizon :: Phylo -> Phylo
578 toPhyloHorizon phylo =
579 let t0 = take 1 (getPeriodIds phylo)
580 groups = getGroupsFromLevelPeriods 1 t0 phylo
581 sens = getSensibility (phyloProximity $ getConfig phylo)
582 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
583 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
584 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
585 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
586
587
588 --------------------------------------
589 -- | Adaptative Temporal Matching | --
590 --------------------------------------
591
592
593 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
594 thrToMeta thr branches =
595 map (\b ->
596 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
597
598 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
599 depthToMeta depth branches =
600 let break = length branches > 1
601 in map (\b ->
602 map (\g ->
603 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
604 else g) b) branches
605
606 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
607 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
608
609
610 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
611 getInTupleMap m k k'
612 | isJust (m !? ( k ,k')) = m ! ( k ,k')
613 | isJust (m !? ( k',k )) = m ! ( k',k )
614 | otherwise = 0
615
616
617 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
618 toThreshold nbSteps proxiGroups =
619 let idx = ((Map.size proxiGroups) `div` (floor nbSteps)) - 1
620 in if idx >= 0
621 then (sort $ elems proxiGroups) !! idx
622 else 1
623
624
625 -- done = all the allready broken branches
626 -- ego = the current branch we want to break
627 -- rest = the branches we still have to break
628 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
629 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
630 -> [Period] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
631 -> [([PhyloGroup],(Bool,[Double]))]
632 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
633 -- 1) keep or not the new division of ego
634 let done' = done ++ (if (fst . snd) ego
635 then (if ((null (fst ego')) || (quality > quality'))
636 then
637 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
638 else
639 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
640 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
641 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
642 in
643 -- uncomment let .. in for debugging
644 -- let part1 = partition (snd) done'
645 -- part2 = partition (snd) rest
646 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
647 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
648 -- ) $
649 -- 2) if there is no more branches in rest then return else continue
650 if null rest
651 then done'
652 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
653 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
654 where
655 --------------------------------------
656 thr :: Double
657 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
658 --------------------------------------
659 quality :: Double
660 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
661 --------------------------------------
662 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
663 ego' =
664 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
665 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
666 branches' = branches `using` parList rdeepseq
667 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
668 $ thrToMeta thr
669 $ depthToMeta (elevation - depth) branches'
670 --------------------------------------
671 quality' :: Double
672 quality' = toPhyloQuality fdt lambda frequency
673 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
674
675
676 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
677 -> Double -> Int -> Map Int Double
678 -> Int -> [Period] -> Map Date Double -> Map Date Cooc
679 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
680 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
681 -- if there is no branch to break or if seaLvl level >= depth then end
682 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
683 then branches
684 else
685 -- break all the possible branches at the current seaLvl level
686 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
687 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
688 frequency' = reduceFrequency frequency (map fst branches')
689 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
690 thr = toThreshold depth groupsProxi
691 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
692 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
693 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
694 <> " thr = " <> show(thr))
695 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
696
697
698 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
699 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
700 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
701 (toPhyloHorizon phylo)
702 where
703 -- 2) process the temporal matching by elevating seaLvl level
704 branches :: [[PhyloGroup]]
705 branches = map fst
706 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
707 (phyloProximity $ getConfig phylo)
708 (elevation - 1)
709 elevation
710 (phylo ^. phylo_groupsProxi)
711 (_qua_granularity $ phyloQuality $ getConfig phylo)
712 (_qua_minBranch $ phyloQuality $ getConfig phylo)
713 (phylo ^. phylo_termFreq)
714 (getTimeFrame $ timeUnit $ getConfig phylo)
715 (getPeriodIds phylo)
716 (phylo ^. phylo_timeDocs)
717 (phylo ^. phylo_timeCooc)
718 groups
719 -- 1) for each group process an initial temporal Matching
720 -- here we suppose that all the groups of level 1 are part of the same big branch
721 groups :: [([PhyloGroup],(Bool,[Double]))]
722 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
723 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
724 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
725 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
726 thr
727 (phylo ^. phylo_timeDocs)
728 (phylo ^. phylo_timeCooc)
729 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
730 --------------------------------------
731 thr :: Double
732 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)