]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/TemporalMatching.hs
[CLEAN] readFile homonyms
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
13
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
17 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
18 import Debug.Trace (trace)
19 import Gargantext.Core.Viz.Phylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Prelude
22 import Prelude (floor,tan,pi)
23 import Text.Printf
24 import qualified Data.Map as Map
25 import qualified Data.Set as Set
26 import qualified Data.Vector as Vector
27
28
29 -------------------
30 -- | Proximity | --
31 -------------------
32
33
34 -- | To compute a jaccard similarity between two lists
35 jaccard :: [Int] -> [Int] -> Double
36 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
37
38
39 -- | Process the inverse sumLog
40 sumInvLog' :: Double -> Double -> [Double] -> Double
41 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
42
43
44 -- | Process the sumLog
45 sumLog' :: Double -> Double -> [Double] -> Double
46 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
47
48
49 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
50 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
51 | null ngramsInter = 0
52 | ngramsInter == ngramsUnion = 1
53 | sens == 0 = jaccard ngramsInter ngramsUnion
54 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
55 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
56 where
57 --------------------------------------
58 ngramsInter :: [Int]
59 ngramsInter = intersect ngrams ngrams'
60 --------------------------------------
61 ngramsUnion :: [Int]
62 ngramsUnion = union ngrams ngrams'
63 --------------------------------------
64 diagoInter :: [Double]
65 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
66 --------------------------------------
67 diagoUnion :: [Double]
68 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
69 --------------------------------------
70
71 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
72 -- tests not conclusive
73 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
74 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
75 | null ngramsInter = 0
76 | ngramsInter == ngramsUnion = 1
77 | sens == 0 = jaccard ngramsInter ngramsUnion
78 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
79 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
80 where
81 --------------------------------------
82 ngramsInter :: [Int]
83 ngramsInter = intersect ego_ngrams target_ngrams
84 --------------------------------------
85 ngramsUnion :: [Int]
86 ngramsUnion = union ego_ngrams target_ngrams
87 --------------------------------------
88 diagoInter :: [Double]
89 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
90 --------------------------------------
91 diagoEgo :: [Double]
92 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
93 --------------------------------------
94 diagoTarget :: [Double]
95 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
96 --------------------------------------
97
98 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
99 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
100 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
101 case proximity of
102 WeightedLogJaccard sens ->
103 let pairNgrams = if targetNgrams == targetNgrams'
104 then targetNgrams
105 else union targetNgrams targetNgrams'
106 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
107 WeightedLogSim sens ->
108 let pairNgrams = if targetNgrams == targetNgrams'
109 then targetNgrams
110 else union targetNgrams targetNgrams'
111 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
112 Hamming _ -> undefined
113
114 ------------------------
115 -- | Local Matching | --
116 ------------------------
117
118 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
119 findLastPeriod fil periods = case fil of
120 ToParents -> head' "findLastPeriod" (sortOn fst periods)
121 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
122 ToChildsMemory -> undefined
123 ToParentsMemory -> undefined
124
125
126 -- | To filter pairs of candidates related to old pointers periods
127 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
128 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
129 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
130 removeOldPointers oldPointers fil thr prox prd pairs
131 | null oldPointers = pairs
132 | null (filterPointers prox thr oldPointers) =
133 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
134 in if lastMatchedPrd == prd
135 then []
136 else filter (\((id,_),(id',_)) ->
137 case fil of
138 ToChildsMemory -> undefined
139 ToParentsMemory -> undefined
140 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
141 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
142 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
143 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
144 | otherwise = []
145
146
147 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
148 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
149 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
150 if (null periods)
151 then []
152 else removeOldPointers oldPointers fil thr prox lastPrd
153 {- at least on of the pair candidates should be from the last added period -}
154 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
155 $ listToKeys
156 $ filter (\(id,ngrams) ->
157 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
158 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
159 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
160 ) candidates
161 where
162 lastPrd :: PhyloPeriodId
163 lastPrd = findLastPeriod fil periods
164
165
166 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
167 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
168
169 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
170 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
171
172
173 reduceDiagos :: Map Date Cooc -> Map Int Double
174 reduceDiagos diagos = mapKeys (\(k,_) -> k)
175 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
176
177 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
178 filterPointersByPeriod fil pts =
179 let pts' = sortOn (fst . fst . fst . fst) pts
180 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
181 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
182 in map fst
183 $ nubBy (\pt pt' -> snd pt == snd pt')
184 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
185 $ case fil of
186 ToParents -> reverse pts'
187 ToChilds -> pts'
188 ToChildsMemory -> undefined
189 ToParentsMemory -> undefined
190
191 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
192 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
193 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
194 if (null $ filterPointers proxi thr oldPointers)
195 {- let's find new pointers -}
196 then if null nextPointers
197 then []
198 else filterPointersByPeriod fil
199 $ head' "phyloGroupMatching"
200 -- Keep only the best set of pointers grouped by proximity
201 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
202 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
203 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
204 else oldPointers
205 where
206 nextPointers :: [[(Pointer,[Int])]]
207 nextPointers = take 1
208 $ dropWhile (null)
209 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
210 $ scanl (\acc groups ->
211 let periods = nub $ map (fst . fst . fst) $ concat groups
212 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
213 diago = reduceDiagos
214 $ filterDiago diagos ([(fst . fst) id] ++ periods)
215 {- important resize nbdocs et diago dans le make pairs -}
216 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
217 in acc ++ ( filterPointers' proxi thr
218 $ concat
219 $ map (\(c,c') ->
220 {- process the proximity between the current group and a pair of candidates -}
221 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
222 in if ((c == c') || (snd c == snd c'))
223 then [((fst c,proximity),snd c)]
224 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
225 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
226
227
228 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
229 filterDocs d pds = restrictKeys d $ periodsToYears pds
230
231 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
232 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
233
234
235 -----------------------------
236 -- | Matching Processing | --
237 -----------------------------
238
239
240 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
241 getNextPeriods fil max' pId pIds =
242 case fil of
243 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
244 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
245 ToChildsMemory -> undefined
246 ToParentsMemory -> undefined
247
248
249 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
250 getCandidates ego targets =
251 map (\groups' ->
252 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')
253 ) groups') targets
254
255
256 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
257 matchGroupsToGroups frame periods proximity thr docs coocs groups =
258 let groups' = groupByField _phylo_groupPeriod groups
259 in foldl' (\acc prd ->
260 let -- 1) find the parents/childs matching periods
261 periodsPar = getNextPeriods ToParents frame prd periods
262 periodsChi = getNextPeriods ToChilds frame prd periods
263 -- 2) find the parents/childs matching candidates
264 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
265 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
266 -- 3) find the parents/child number of docs by years
267 docsPar = filterDocs docs ([prd] ++ periodsPar)
268 docsChi = filterDocs docs ([prd] ++ periodsChi)
269 -- 4) find the parents/child diago by years
270 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
271 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
272 -- 5) match in parallel all the groups (egos) to their possible candidates
273 egos = map (\ego ->
274 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
275 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
276 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
277 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
278 in addPointers ToChilds TemporalPointer pointersChi
279 $ addPointers ToParents TemporalPointer pointersPar
280 $ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
281 $ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego)
282 $ findWithDefault [] prd groups'
283 egos' = egos `using` parList rdeepseq
284 in acc ++ egos'
285 ) [] periods
286
287
288 -----------------------
289 -- | Phylo Quality | --
290 -----------------------
291
292
293 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
294 relevantBranches term branches =
295 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
296
297 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
298 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
299 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
300 / (fromIntegral $ length bk'))
301 where
302 bk' :: [PhyloGroup]
303 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
304
305 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
306 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
307 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
308
309 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
310 fScore lambda x periods bk bx =
311 let rec = recall x bk bx
312 acc = accuracy x periods bk
313 in ((1 + lambda ** 2) * acc * rec)
314 / (((lambda ** 2) * acc + rec))
315
316
317 wk :: [PhyloGroup] -> Double
318 wk bk = fromIntegral $ length bk
319
320
321 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
322 toPhyloQuality' lambda freq branches =
323 if (null branches)
324 then 0
325 else sum
326 $ map (\i ->
327 let bks = relevantBranches i branches
328 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
329 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore lambda i periods bk bks)) bks))
330 $ keys freq
331
332 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
333 toRecall freq branches =
334 if (null branches)
335 then 0
336 else sum
337 $ map (\x ->
338 let px = freq ! x
339 bx = relevantBranches x branches
340 wks = sum $ map wk bx
341 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
342 $ keys freq
343 where
344 pys :: Double
345 pys = sum (elems freq)
346
347
348 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
349 toAccuracy freq branches =
350 if (null branches)
351 then 0
352 else sum
353 $ map (\x ->
354 let px = freq ! x
355 bx = relevantBranches x branches
356 -- | periods containing x
357 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
358 wks = sum $ map wk bx
359 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
360 $ keys freq
361 where
362 pys :: Double
363 pys = sum (elems freq)
364
365
366 -- | here we do the average of all the local f_scores
367 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
368 toPhyloQuality fdt lambda freq branches =
369 if (null branches)
370 then 0
371 else sum
372 $ map (\x ->
373 -- let px = freq ! x
374 let bx = relevantBranches x branches
375 -- | periods containing x
376 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
377 wks = sum $ map wk bx
378 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
379 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
380 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
381 $ keys freq
382 -- where
383 -- pys :: Double
384 -- pys = sum (elems freq)
385
386 -- 1 / nb de foundation
387
388 ------------------------------------
389 -- | Constant Temporal Matching | --
390 ------------------------------------
391
392
393 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
394 groupsToBranches' groups =
395 {- run the related component algorithm -}
396 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
397 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
398 $ map (\group -> [getGroupId group]
399 ++ (map fst $ group ^. phylo_groupPeriodParents)
400 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
401 -- first find the related components by inside each ego's period
402 -- a supprimer
403 graph' = map relatedComponents egos
404 -- then run it for the all the periods
405 graph = zip [1..]
406 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
407 -- update each group's branch id
408 in map (\(bId,ids) ->
409 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
410 $ elems $ restrictKeys groups (Set.fromList ids)
411 in groups' `using` parList rdeepseq ) graph
412
413
414 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
415 reduceFrequency frequency branches =
416 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
417
418 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
419 updateThr thr branches = map (\b -> map (\g ->
420 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
421
422
423 -- Sequentially break each branch of a phylo where
424 -- done = all the allready broken branches
425 -- ego = the current branch we want to break
426 -- rest = the branches we still have to break
427 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
428 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
429 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
430 -- 1) keep or not the new division of ego
431 let done' = done ++ (if snd ego
432 then
433 (if ((null (fst ego')) || (quality > quality'))
434 then
435 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
436 -- <> " | " <> show(length $ fst ego) <> " groups : "
437 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
438 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
439 [(fst ego,False)]
440 else
441 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
442 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
443 -- <> " | " <> show(length $ fst ego) <> " groups : "
444 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
445 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
446 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
447 else [ego])
448 in
449 -- 2) if there is no more branches in rest then return else continue
450 if null rest
451 then done'
452 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
453 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
454 where
455 --------------------------------------
456 quality :: Double
457 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
458 --------------------------------------
459 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
460 ego' =
461 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
462 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
463 branches' = branches `using` parList rdeepseq
464 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
465 $ thrToMeta thr
466 $ depthToMeta (elevation - depth) branches'
467 --------------------------------------
468 quality' :: Double
469 quality' = toPhyloQuality fdt lambda frequency
470 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
471
472
473 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
474 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
475 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
476 -- if there is no branch to break or if seaLvl level > 1 then end
477 if (thr >= 1) || ((not . or) $ map snd branches)
478 then branches
479 else
480 -- break all the possible branches at the current seaLvl level
481 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
482 acc = toAccuracy frequency (map fst branches)
483 rec = toRecall frequency (map fst branches)
484 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
485 <> " ξ = " <> printf "%.5f" acc
486 <> " ρ = " <> printf "%.5f" rec
487 <> " branches = " <> show(length branches) <> " ↴")
488 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
489 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
490 frequency' = reduceFrequency frequency (map fst branches')
491 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
492
493
494 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
495 constanteTemporalMatching start step phylo = updatePhyloGroups 1
496 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
497 (toPhyloHorizon phylo)
498 where
499 -- 2) process the temporal matching by elevating seaLvl level
500 branches :: [[PhyloGroup]]
501 branches = map fst
502 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
503 (phyloProximity $ getConfig phylo)
504 (_qua_granularity $ phyloQuality $ getConfig phylo)
505 (_qua_minBranch $ phyloQuality $ getConfig phylo)
506 (phylo ^. phylo_termFreq)
507 start step
508 ((((1 - start) / step) - 1))
509 (((1 - start) / step))
510 (getTimeFrame $ timeUnit $ getConfig phylo)
511 (getPeriodIds phylo)
512 (phylo ^. phylo_timeDocs)
513 (phylo ^. phylo_timeCooc)
514 (reverse $ sortOn (length . fst) groups)
515 -- 1) for each group process an initial temporal Matching
516 -- here we suppose that all the groups of level 1 are part of the same big branch
517 groups :: [([PhyloGroup],Bool)]
518 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
519 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
520 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
521 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
522 start
523 (phylo ^. phylo_timeDocs)
524 (phylo ^. phylo_timeCooc)
525 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
526
527 -----------------
528 -- | Horizon | --
529 -----------------
530
531 toPhyloHorizon :: Phylo -> Phylo
532 toPhyloHorizon phylo =
533 let t0 = take 1 (getPeriodIds phylo)
534 groups = getGroupsFromLevelPeriods 1 t0 phylo
535 sens = getSensibility (phyloProximity $ getConfig phylo)
536 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
537 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
538 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
539 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
540
541
542 --------------------------------------
543 -- | Adaptative Temporal Matching | --
544 --------------------------------------
545
546
547 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
548 thrToMeta thr branches =
549 map (\b ->
550 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
551
552 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
553 depthToMeta depth branches =
554 let break = length branches > 1
555 in map (\b ->
556 map (\g ->
557 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
558 else g) b) branches
559
560 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
561 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
562
563
564 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
565 getInTupleMap m k k'
566 | isJust (m !? ( k ,k')) = m ! ( k ,k')
567 | isJust (m !? ( k',k )) = m ! ( k',k )
568 | otherwise = 0
569
570
571 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
572 toThreshold lvl proxiGroups =
573 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
574 in if idx >= 0
575 then (sort $ elems proxiGroups) !! idx
576 else 1
577
578
579 -- done = all the allready broken branches
580 -- ego = the current branch we want to break
581 -- rest = the branches we still have to break
582 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
583 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
584 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
585 -> [([PhyloGroup],(Bool,[Double]))]
586 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
587 -- 1) keep or not the new division of ego
588 let done' = done ++ (if (fst . snd) ego
589 then (if ((null (fst ego')) || (quality > quality'))
590 then
591 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
592 else
593 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
594 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
595 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
596 in
597 -- uncomment let .. in for debugging
598 -- let part1 = partition (snd) done'
599 -- part2 = partition (snd) rest
600 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
601 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
602 -- ) $
603 -- 2) if there is no more branches in rest then return else continue
604 if null rest
605 then done'
606 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
607 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
608 where
609 --------------------------------------
610 thr :: Double
611 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
612 --------------------------------------
613 quality :: Double
614 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
615 --------------------------------------
616 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
617 ego' =
618 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
619 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
620 branches' = branches `using` parList rdeepseq
621 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
622 $ thrToMeta thr
623 $ depthToMeta (elevation - depth) branches'
624 --------------------------------------
625 quality' :: Double
626 quality' = toPhyloQuality fdt lambda frequency
627 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
628
629
630 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
631 -> Double -> Int -> Map Int Double
632 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
633 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
634 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
635 -- if there is no branch to break or if seaLvl level >= depth then end
636 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
637 then branches
638 else
639 -- break all the possible branches at the current seaLvl level
640 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
641 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
642 frequency' = reduceFrequency frequency (map fst branches')
643 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
644 -- thr = toThreshold depth groupsProxi
645 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
646 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
647 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
648 <> " thr = ")
649 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
650
651
652 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
653 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
654 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
655 (toPhyloHorizon phylo)
656 where
657 -- 2) process the temporal matching by elevating seaLvl level
658 branches :: [[PhyloGroup]]
659 branches = map fst
660 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
661 (phyloProximity $ getConfig phylo)
662 (elevation - 1)
663 elevation
664 (phylo ^. phylo_groupsProxi)
665 (_qua_granularity $ phyloQuality $ getConfig phylo)
666 (_qua_minBranch $ phyloQuality $ getConfig phylo)
667 (phylo ^. phylo_termFreq)
668 (getTimeFrame $ timeUnit $ getConfig phylo)
669 (getPeriodIds phylo)
670 (phylo ^. phylo_timeDocs)
671 (phylo ^. phylo_timeCooc)
672 groups
673 -- 1) for each group process an initial temporal Matching
674 -- here we suppose that all the groups of level 1 are part of the same big branch
675 groups :: [([PhyloGroup],(Bool,[Double]))]
676 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
677 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
678 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
679 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
680 thr
681 (phylo ^. phylo_timeDocs)
682 (phylo ^. phylo_timeCooc)
683 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
684 --------------------------------------
685 thr :: Double
686 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)