]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/TemporalMatching.hs
[FEAT] Backend NLP French tested
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.TemporalMatching where
13
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, nubBy, union, dropWhile, partition, or, sort, (!!))
17 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, findWithDefault, keys, (!), (!?), filterWithKey, singleton, empty, mapKeys, adjust)
18 import Debug.Trace (trace)
19 import Gargantext.Core.Viz.Phylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Prelude
22 import Prelude (floor,tan,pi)
23 import Text.Printf
24 import qualified Data.Map as Map
25 import qualified Data.Set as Set
26 import qualified Data.Vector as Vector
27
28
29 -------------------
30 -- | Proximity | --
31 -------------------
32
33
34 -- | To compute a jaccard similarity between two lists
35 jaccard :: [Int] -> [Int] -> Double
36 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
37
38
39 -- | Process the inverse sumLog
40 sumInvLog' :: Double -> Double -> [Double] -> Double
41 sumInvLog' s nb diago = foldl (\mem occ -> mem + (1 / (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2))))) 0 diago
42
43
44 -- | Process the sumLog
45 sumLog' :: Double -> Double -> [Double] -> Double
46 sumLog' s nb diago = foldl (\mem occ -> mem + (log (occ + 1/ tan (s * pi / 2)) / log (nb + 1/ tan (s * pi / 2)))) 0 diago
47
48
49 weightedLogJaccard' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
50 weightedLogJaccard' sens nbDocs diago ngrams ngrams'
51 | null ngramsInter = 0
52 | ngramsInter == ngramsUnion = 1
53 | sens == 0 = jaccard ngramsInter ngramsUnion
54 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / (sumInvLog' sens nbDocs diagoUnion)
55 | otherwise = (sumLog' sens nbDocs diagoInter) / (sumLog' sens nbDocs diagoUnion)
56 where
57 --------------------------------------
58 ngramsInter :: [Int]
59 ngramsInter = intersect ngrams ngrams'
60 --------------------------------------
61 ngramsUnion :: [Int]
62 ngramsUnion = union ngrams ngrams'
63 --------------------------------------
64 diagoInter :: [Double]
65 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
66 --------------------------------------
67 diagoUnion :: [Double]
68 diagoUnion = elems $ restrictKeys diago (Set.fromList ngramsUnion)
69 --------------------------------------
70
71 -- | Process the weighted similarity between clusters. Adapted from Wang, X., Cheng, Q., Lu, W., 2014. Analyzing evolution of research topics with NEViewer: a new method based on dynamic co-word networks. Scientometrics 101, 1253–1271. https://doi.org/10.1007/s11192-014-1347-y (log added in the formula + pair comparison)
72 -- tests not conclusive
73 weightedLogSim' :: Double -> Double -> Map Int Double -> [Int] -> [Int] -> Double
74 weightedLogSim' sens nbDocs diago ego_ngrams target_ngrams
75 | null ngramsInter = 0
76 | ngramsInter == ngramsUnion = 1
77 | sens == 0 = jaccard ngramsInter ngramsUnion
78 | sens > 0 = (sumInvLog' sens nbDocs diagoInter) / minimum [(sumInvLog' sens nbDocs diagoEgo),(sumInvLog' sens nbDocs diagoTarget)]
79 | otherwise = (sumLog' sens nbDocs diagoInter) / minimum [(sumLog' sens nbDocs diagoEgo),(sumLog' sens nbDocs diagoTarget)]
80 where
81 --------------------------------------
82 ngramsInter :: [Int]
83 ngramsInter = intersect ego_ngrams target_ngrams
84 --------------------------------------
85 ngramsUnion :: [Int]
86 ngramsUnion = union ego_ngrams target_ngrams
87 --------------------------------------
88 diagoInter :: [Double]
89 diagoInter = elems $ restrictKeys diago (Set.fromList ngramsInter)
90 --------------------------------------
91 diagoEgo :: [Double]
92 diagoEgo = elems $ restrictKeys diago (Set.fromList ego_ngrams)
93 --------------------------------------
94 diagoTarget :: [Double]
95 diagoTarget = elems $ restrictKeys diago (Set.fromList target_ngrams)
96 --------------------------------------
97
98 toProximity :: Double -> Map Int Double -> Proximity -> [Int] -> [Int] -> [Int] -> Double
99 -- | To process the proximity between a current group and a pair of targets group using the adapted Wang et al. Similarity
100 toProximity nbDocs diago proximity egoNgrams targetNgrams targetNgrams' =
101 case proximity of
102 WeightedLogJaccard sens ->
103 let pairNgrams = if targetNgrams == targetNgrams'
104 then targetNgrams
105 else union targetNgrams targetNgrams'
106 in weightedLogJaccard' sens nbDocs diago egoNgrams pairNgrams
107 WeightedLogSim sens ->
108 let pairNgrams = if targetNgrams == targetNgrams'
109 then targetNgrams
110 else union targetNgrams targetNgrams'
111 in weightedLogSim' sens nbDocs diago egoNgrams pairNgrams
112 Hamming _ -> undefined
113
114 ------------------------
115 -- | Local Matching | --
116 ------------------------
117
118 findLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
119 findLastPeriod fil periods = case fil of
120 ToParents -> head' "findLastPeriod" (sortOn fst periods)
121 ToChilds -> last' "findLastPeriod" (sortOn fst periods)
122 ToChildsMemory -> undefined
123 ToParentsMemory -> undefined
124
125
126 -- | To filter pairs of candidates related to old pointers periods
127 removeOldPointers :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId
128 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
129 -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
130 removeOldPointers oldPointers fil thr prox prd pairs
131 | null oldPointers = pairs
132 | null (filterPointers prox thr oldPointers) =
133 let lastMatchedPrd = findLastPeriod fil (map (fst . fst . fst) oldPointers)
134 in if lastMatchedPrd == prd
135 then []
136 else filter (\((id,_),(id',_)) ->
137 case fil of
138 ToChildsMemory -> undefined
139 ToParentsMemory -> undefined
140 ToParents -> (((fst . fst . fst) id ) < (fst lastMatchedPrd))
141 || (((fst . fst . fst) id') < (fst lastMatchedPrd))
142 ToChilds -> (((fst . fst . fst) id ) > (fst lastMatchedPrd))
143 || (((fst . fst . fst) id') > (fst lastMatchedPrd))) pairs
144 | otherwise = []
145
146
147 makePairs' :: (PhyloGroupId,[Int]) -> [(PhyloGroupId,[Int])] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity
148 -> Map Date Double -> Map Date Cooc -> [((PhyloGroupId,[Int]),(PhyloGroupId,[Int]))]
149 makePairs' (egoId, egoNgrams) candidates periods oldPointers fil thr prox docs diagos =
150 if (null periods)
151 then []
152 else removeOldPointers oldPointers fil thr prox lastPrd
153 {- at least on of the pair candidates should be from the last added period -}
154 $ filter (\((id,_),(id',_)) -> ((fst . fst) id == lastPrd) || ((fst . fst) id' == lastPrd))
155 $ listToKeys
156 $ filter (\(id,ngrams) ->
157 let nbDocs = (sum . elems) $ filterDocs docs ([(fst . fst) egoId, (fst . fst) id])
158 diago = reduceDiagos $ filterDiago diagos ([(fst . fst) egoId, (fst . fst) id])
159 in (toProximity nbDocs diago prox egoNgrams egoNgrams ngrams) >= thr
160 ) candidates
161 where
162 lastPrd :: PhyloPeriodId
163 lastPrd = findLastPeriod fil periods
164
165
166 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
167 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
168
169 filterPointers' :: Proximity -> Double -> [(Pointer,[Int])] -> [(Pointer,[Int])]
170 filterPointers' proxi thr pts = filter (\((_,w),_) -> filterProximity proxi thr w) pts
171
172
173 reduceDiagos :: Map Date Cooc -> Map Int Double
174 reduceDiagos diagos = mapKeys (\(k,_) -> k)
175 $ foldl (\acc diago -> unionWith (+) acc diago) empty (elems diagos)
176
177 filterPointersByPeriod :: Filiation -> [(Pointer,[Int])] -> [Pointer]
178 filterPointersByPeriod fil pts =
179 let pts' = sortOn (fst . fst . fst . fst) pts
180 inf = (fst . fst . fst . fst) $ head' "filterPointersByPeriod" pts'
181 sup = (fst . fst . fst . fst) $ last' "filterPointersByPeriod" pts'
182 in map fst
183 $ nubBy (\pt pt' -> snd pt == snd pt')
184 $ filter (\pt -> ((fst . fst . fst . fst) pt == inf) || ((fst . fst . fst . fst) pt == sup))
185 $ case fil of
186 ToParents -> reverse pts'
187 ToChilds -> pts'
188 ToChildsMemory -> undefined
189 ToParentsMemory -> undefined
190
191 phyloGroupMatching :: [[(PhyloGroupId,[Int])]] -> Filiation -> Proximity -> Map Date Double -> Map Date Cooc
192 -> Double -> [Pointer] -> (PhyloGroupId,[Int]) -> [Pointer]
193 phyloGroupMatching candidates fil proxi docs diagos thr oldPointers (id,ngrams) =
194 if (null $ filterPointers proxi thr oldPointers)
195 {- let's find new pointers -}
196 then if null nextPointers
197 then []
198 else filterPointersByPeriod fil
199 $ head' "phyloGroupMatching"
200 -- Keep only the best set of pointers grouped by proximity
201 $ groupBy (\pt pt' -> (snd . fst) pt == (snd . fst) pt')
202 $ reverse $ sortOn (snd . fst) $ head' "pointers" nextPointers
203 -- Find the first time frame where at leats one pointer satisfies the proximity threshold
204 else oldPointers
205 where
206 nextPointers :: [[(Pointer,[Int])]]
207 nextPointers = take 1
208 $ dropWhile null
209 {- for each time frame, process the proximity on relevant pairs of targeted groups -}
210 $ scanl (\acc groups ->
211 let periods = nub $ map (fst . fst . fst) $ concat groups
212 nbdocs = sum $ elems $ (filterDocs docs ([(fst . fst) id] ++ periods))
213 diago = reduceDiagos
214 $ filterDiago diagos ([(fst . fst) id] ++ periods)
215 {- important resize nbdocs et diago dans le make pairs -}
216 pairs = makePairs' (id,ngrams) (concat groups) periods oldPointers fil thr proxi docs diagos
217 in acc ++ ( filterPointers' proxi thr
218 $ concat
219 $ map (\(c,c') ->
220 {- process the proximity between the current group and a pair of candidates -}
221 let proximity = toProximity nbdocs diago proxi ngrams (snd c) (snd c')
222 in if ((c == c') || (snd c == snd c'))
223 then [((fst c,proximity),snd c)]
224 else [((fst c,proximity),snd c),((fst c',proximity),snd c')] ) pairs )) []
225 $ inits candidates -- groups from [[1900],[1900,1901],[1900,1901,1902],...]
226
227
228 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
229 filterDocs d pds = restrictKeys d $ periodsToYears pds
230
231 filterDiago :: Map Date Cooc -> [PhyloPeriodId] -> Map Date Cooc
232 filterDiago diago pds = restrictKeys diago $ periodsToYears pds
233
234
235 -----------------------------
236 -- | Matching Processing | --
237 -----------------------------
238
239
240 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
241 getNextPeriods fil max' pId pIds =
242 case fil of
243 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
244 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
245 ToChildsMemory -> undefined
246 ToParentsMemory -> undefined
247
248
249 getCandidates :: PhyloGroup -> [[(PhyloGroupId,[Int])]] -> [[(PhyloGroupId,[Int])]]
250 getCandidates ego targets =
251 if (length (ego ^. phylo_groupNgrams)) > 1
252 then
253 map (\groups' -> filter (\g' -> (> 1) $ length $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
254 else
255 map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (snd g')) groups') targets
256
257
258 matchGroupsToGroups :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
259 matchGroupsToGroups frame periods proximity thr docs coocs groups =
260 let groups' = groupByField _phylo_groupPeriod groups
261 in foldl' (\acc prd ->
262 let -- 1) find the parents/childs matching periods
263 periodsPar = getNextPeriods ToParents frame prd periods
264 periodsChi = getNextPeriods ToChilds frame prd periods
265 -- 2) find the parents/childs matching candidates
266 candidatesPar = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsPar
267 candidatesChi = map (\prd' -> map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ findWithDefault [] prd' groups') periodsChi
268 -- 3) find the parents/child number of docs by years
269 docsPar = filterDocs docs ([prd] ++ periodsPar)
270 docsChi = filterDocs docs ([prd] ++ periodsChi)
271 -- 4) find the parents/child diago by years
272 diagoPar = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
273 diagoChi = filterDiago (map coocToDiago coocs) ([prd] ++ periodsPar)
274 -- 5) match in parallel all the groups (egos) to their possible candidates
275 egos = map (\ego ->
276 let pointersPar = phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar diagoPar
277 thr (getPeriodPointers ToParents ego) (getGroupId ego, ego ^. phylo_groupNgrams)
278 pointersChi = phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi diagoChi
279 thr (getPeriodPointers ToChilds ego) (getGroupId ego, ego ^. phylo_groupNgrams)
280 in addPointers ToChilds TemporalPointer pointersChi
281 $ addPointers ToParents TemporalPointer pointersPar
282 $ addMemoryPointers ToChildsMemory TemporalPointer thr pointersChi
283 $ addMemoryPointers ToParentsMemory TemporalPointer thr pointersPar ego)
284 $ findWithDefault [] prd groups'
285 egos' = egos `using` parList rdeepseq
286 in acc ++ egos'
287 ) [] periods
288
289
290 -----------------------
291 -- | Phylo Quality | --
292 -----------------------
293
294
295 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
296 relevantBranches term branches =
297 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
298
299 accuracy :: Int -> [(Date,Date)] -> [PhyloGroup] -> Double
300 -- The accuracy of a branch relatively to a term x is computed only over the periods there exist some cluster mentionning x in the phylomemy
301 accuracy x periods bk = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk')
302 / (fromIntegral $ length bk'))
303 where
304 bk' :: [PhyloGroup]
305 bk' = filter (\g -> elem (g ^. phylo_groupPeriod) periods) bk
306
307 recall :: Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
308 recall x bk bx = ((fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) bk)
309 / (fromIntegral $ length $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx))
310
311 fScore :: Double -> Int -> [(Date,Date)] -> [PhyloGroup] -> [[PhyloGroup]] -> Double
312 fScore lambda x periods bk bx =
313 let rec = recall x bk bx
314 acc = accuracy x periods bk
315 in ((1 + lambda ** 2) * acc * rec)
316 / (((lambda ** 2) * acc + rec))
317
318
319 wk :: [PhyloGroup] -> Double
320 wk bk = fromIntegral $ length bk
321
322
323 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
324 toPhyloQuality' lambda freq branches =
325 if (null branches)
326 then 0
327 else sum
328 $ map (\i ->
329 let bks = relevantBranches i branches
330 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks
331 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore lambda i periods bk bks)) bks))
332 $ keys freq
333
334 toRecall :: Map Int Double -> [[PhyloGroup]] -> Double
335 toRecall freq branches =
336 if (null branches)
337 then 0
338 else sum
339 $ map (\x ->
340 let px = freq ! x
341 bx = relevantBranches x branches
342 wks = sum $ map wk bx
343 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (recall x bk bx)) bx))
344 $ keys freq
345 where
346 pys :: Double
347 pys = sum (elems freq)
348
349
350 toAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
351 toAccuracy freq branches =
352 if (null branches)
353 then 0
354 else sum
355 $ map (\x ->
356 let px = freq ! x
357 bx = relevantBranches x branches
358 -- | periods containing x
359 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
360 wks = sum $ map wk bx
361 in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (accuracy x periods bk)) bx))
362 $ keys freq
363 where
364 pys :: Double
365 pys = sum (elems freq)
366
367
368 -- | here we do the average of all the local f_scores
369 toPhyloQuality :: Double -> Double -> Map Int Double -> [[PhyloGroup]] -> Double
370 toPhyloQuality fdt lambda freq branches =
371 if (null branches)
372 then 0
373 else sum
374 $ map (\x ->
375 -- let px = freq ! x
376 let bx = relevantBranches x branches
377 -- | periods containing x
378 periods = nub $ map _phylo_groupPeriod $ filter (\g -> elem x $ g ^. phylo_groupNgrams) $ concat bx
379 wks = sum $ map wk bx
380 -- in (px / pys) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x bk bx)) bx))
381 -- in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore beta x periods bk bx)) bx))
382 in (1 / fdt) * (sum $ map (\bk -> ((wk bk) / wks) * (fScore (tan (lambda * pi / 2)) x periods bk bx)) bx))
383 $ keys freq
384 -- where
385 -- pys :: Double
386 -- pys = sum (elems freq)
387
388 -- 1 / nb de foundation
389
390 ------------------------------------
391 -- | Constant Temporal Matching | --
392 ------------------------------------
393
394
395 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
396 groupsToBranches' groups =
397 {- run the related component algorithm -}
398 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
399 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
400 $ map (\group -> [getGroupId group]
401 ++ (map fst $ group ^. phylo_groupPeriodParents)
402 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
403 -- first find the related components by inside each ego's period
404 -- a supprimer
405 graph' = map relatedComponents egos
406 -- then run it for the all the periods
407 graph = zip [1..]
408 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
409 -- update each group's branch id
410 in map (\(bId,ids) ->
411 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
412 $ elems $ restrictKeys groups (Set.fromList ids)
413 in groups' `using` parList rdeepseq ) graph
414
415
416 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
417 reduceFrequency frequency branches =
418 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
419
420 updateThr :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
421 updateThr thr branches = map (\b -> map (\g ->
422 g & phylo_groupMeta .~ (singleton "seaLevels" (((g ^. phylo_groupMeta) ! "seaLevels") ++ [thr]))) b) branches
423
424
425 -- Sequentially break each branch of a phylo where
426 -- done = all the allready broken branches
427 -- ego = the current branch we want to break
428 -- rest = the branches we still have to break
429 breakBranches :: Double -> Proximity -> Double -> Map Int Double -> Int -> Double -> Double -> Double
430 -> Int -> Map Date Double -> Map Date Cooc -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
431 breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods done ego rest =
432 -- 1) keep or not the new division of ego
433 let done' = done ++ (if snd ego
434 then
435 (if ((null (fst ego')) || (quality > quality'))
436 then
437 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
438 -- <> " | " <> show(length $ fst ego) <> " groups : "
439 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
440 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
441 [(fst ego,False)]
442 else
443 -- trace (" ✓ level = " <> printf "%.1f" thr <> "")
444 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
445 -- <> " | " <> show(length $ fst ego) <> " groups : "
446 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
447 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
448 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
449 else [ego])
450 in
451 -- 2) if there is no more branches in rest then return else continue
452 if null rest
453 then done'
454 else breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
455 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
456 where
457 --------------------------------------
458 quality :: Double
459 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
460 --------------------------------------
461 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
462 ego' =
463 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
464 $ matchGroupsToGroups frame periods proximity thr docs coocs (fst ego)
465 branches' = branches `using` parList rdeepseq
466 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch)
467 $ thrToMeta thr
468 $ depthToMeta (elevation - depth) branches'
469 --------------------------------------
470 quality' :: Double
471 quality' = toPhyloQuality fdt lambda frequency
472 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
473
474
475 seaLevelMatching :: Double -> Proximity -> Double -> Int -> Map Int Double -> Double -> Double -> Double -> Double
476 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
477 seaLevelMatching fdt proximity lambda minBranch frequency thr step depth elevation frame periods docs coocs branches =
478 -- if there is no branch to break or if seaLvl level > 1 then end
479 if (thr >= 1) || ((not . or) $ map snd branches)
480 then branches
481 else
482 -- break all the possible branches at the current seaLvl level
483 let quality = toPhyloQuality fdt lambda frequency (map fst branches)
484 acc = toAccuracy frequency (map fst branches)
485 rec = toRecall frequency (map fst branches)
486 branches' = trace ("↑ level = " <> printf "%.3f" thr <> " F(λ) = " <> printf "%.5f" quality
487 <> " ξ = " <> printf "%.5f" acc
488 <> " ρ = " <> printf "%.5f" rec
489 <> " branches = " <> show(length branches) <> " ↴")
490 $ breakBranches fdt proximity lambda frequency minBranch thr depth elevation frame docs coocs periods
491 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
492 frequency' = reduceFrequency frequency (map fst branches')
493 in seaLevelMatching fdt proximity lambda minBranch frequency' (thr + step) step (depth - 1) elevation frame periods docs coocs branches'
494
495
496 constanteTemporalMatching :: Double -> Double -> Phylo -> Phylo
497 constanteTemporalMatching start step phylo = updatePhyloGroups 1
498 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
499 (toPhyloHorizon phylo)
500 where
501 -- 2) process the temporal matching by elevating seaLvl level
502 branches :: [[PhyloGroup]]
503 branches = map fst
504 $ seaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
505 (phyloProximity $ getConfig phylo)
506 (_qua_granularity $ phyloQuality $ getConfig phylo)
507 (_qua_minBranch $ phyloQuality $ getConfig phylo)
508 (phylo ^. phylo_termFreq)
509 start step
510 ((((1 - start) / step) - 1))
511 (((1 - start) / step))
512 (getTimeFrame $ timeUnit $ getConfig phylo)
513 (getPeriodIds phylo)
514 (phylo ^. phylo_timeDocs)
515 (phylo ^. phylo_timeCooc)
516 (reverse $ sortOn (length . fst) groups)
517 -- 1) for each group process an initial temporal Matching
518 -- here we suppose that all the groups of level 1 are part of the same big branch
519 groups :: [([PhyloGroup],Bool)]
520 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
521 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
522 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
523 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
524 start
525 (phylo ^. phylo_timeDocs)
526 (phylo ^. phylo_timeCooc)
527 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
528
529 -----------------
530 -- | Horizon | --
531 -----------------
532
533 toPhyloHorizon :: Phylo -> Phylo
534 toPhyloHorizon phylo =
535 let t0 = take 1 (getPeriodIds phylo)
536 groups = getGroupsFromLevelPeriods 1 t0 phylo
537 sens = getSensibility (phyloProximity $ getConfig phylo)
538 nbDocs = sum $ elems $ filterDocs (phylo ^. phylo_timeDocs) t0
539 diago = reduceDiagos $ filterDiago (phylo ^. phylo_timeCooc) t0
540 in phylo & phylo_horizon .~ (fromList $ map (\(g,g') ->
541 ((getGroupId g,getGroupId g'),weightedLogJaccard' sens nbDocs diago (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) $ listToCombi' groups)
542
543
544 --------------------------------------
545 -- | Adaptative Temporal Matching | --
546 --------------------------------------
547
548
549 thrToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
550 thrToMeta thr branches =
551 map (\b ->
552 map (\g -> g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [thr]) "seaLevels" (g ^. phylo_groupMeta))) b) branches
553
554 depthToMeta :: Double -> [[PhyloGroup]] -> [[PhyloGroup]]
555 depthToMeta depth branches =
556 let break = length branches > 1
557 in map (\b ->
558 map (\g ->
559 if break then g & phylo_groupMeta .~ (adjust (\lst -> lst ++ [depth]) "breaks"(g ^. phylo_groupMeta))
560 else g) b) branches
561
562 reduceTupleMapByKeys :: Eq a => [a] -> Map (a,a) Double -> Map (a,a) Double
563 reduceTupleMapByKeys ks m = filterWithKey (\(k,k') _ -> (elem k ks) && (elem k' ks)) m
564
565
566 getInTupleMap :: Ord a => Map (a,a) Double -> a -> a -> Double
567 getInTupleMap m k k'
568 | isJust (m !? ( k ,k')) = m ! ( k ,k')
569 | isJust (m !? ( k',k )) = m ! ( k',k )
570 | otherwise = 0
571
572
573 toThreshold :: Double -> Map (PhyloGroupId,PhyloGroupId) Double -> Double
574 toThreshold lvl proxiGroups =
575 let idx = ((Map.size proxiGroups) `div` (floor lvl)) - 1
576 in if idx >= 0
577 then (sort $ elems proxiGroups) !! idx
578 else 1
579
580
581 -- done = all the allready broken branches
582 -- ego = the current branch we want to break
583 -- rest = the branches we still have to break
584 adaptativeBreakBranches :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId,PhyloGroupId) Double
585 -> Double -> Map Int Double -> Int -> Int -> Map Date Double -> Map Date Cooc
586 -> [PhyloPeriodId] -> [([PhyloGroup],(Bool,[Double]))] -> ([PhyloGroup],(Bool,[Double])) -> [([PhyloGroup],(Bool,[Double]))]
587 -> [([PhyloGroup],(Bool,[Double]))]
588 adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods done ego rest =
589 -- 1) keep or not the new division of ego
590 let done' = done ++ (if (fst . snd) ego
591 then (if ((null (fst ego')) || (quality > quality'))
592 then
593 [(concat $ thrToMeta thr $ [fst ego],(False, ((snd . snd) ego)))]
594 else
595 ( (map (\e -> (e,(True, ((snd . snd) ego) ++ [thr]))) (fst ego'))
596 ++ (map (\e -> (e,(False, ((snd . snd) ego)))) (snd ego'))))
597 else [(concat $ thrToMeta thr $ [fst ego], snd ego)])
598 in
599 -- uncomment let .. in for debugging
600 -- let part1 = partition (snd) done'
601 -- part2 = partition (snd) rest
602 -- in trace ( "[✓ " <> show(length $ fst part1) <> "(" <> show(length $ concat $ map (fst) $ fst part1) <> ")|✗ " <> show(length $ snd part1) <> "(" <> show(length $ concat $ map (fst) $ snd part1) <> ")] "
603 -- <> "[✓ " <> show(length $ fst part2) <> "(" <> show(length $ concat $ map (fst) $ fst part2) <> ")|✗ " <> show(length $ snd part2) <> "(" <> show(length $ concat $ map (fst) $ snd part2) <> ")]"
604 -- ) $
605 -- 2) if there is no more branches in rest then return else continue
606 if null rest
607 then done'
608 else adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
609 done' (head' "breakBranches" rest) (tail' "breakBranches" rest)
610 where
611 --------------------------------------
612 thr :: Double
613 thr = toThreshold depth $ Map.filter (\v -> v > (last' "breakBranches" $ (snd . snd) ego)) $ reduceTupleMapByKeys (map getGroupId $ fst ego) groupsProxi
614 --------------------------------------
615 quality :: Double
616 quality = toPhyloQuality fdt lambda frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
617 --------------------------------------
618 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
619 ego' =
620 let branches = groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
621 $ matchGroupsToGroups frame periods proxiConf thr docs coocs (fst ego)
622 branches' = branches `using` parList rdeepseq
623 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) > minBranch)
624 $ thrToMeta thr
625 $ depthToMeta (elevation - depth) branches'
626 --------------------------------------
627 quality' :: Double
628 quality' = toPhyloQuality fdt lambda frequency
629 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
630
631
632 adaptativeSeaLevelMatching :: Double -> Proximity -> Double -> Double -> Map (PhyloGroupId, PhyloGroupId) Double
633 -> Double -> Int -> Map Int Double
634 -> Int -> [PhyloPeriodId] -> Map Date Double -> Map Date Cooc
635 -> [([PhyloGroup],(Bool,[Double]))] -> [([PhyloGroup],(Bool,[Double]))]
636 adaptativeSeaLevelMatching fdt proxiConf depth elevation groupsProxi lambda minBranch frequency frame periods docs coocs branches =
637 -- if there is no branch to break or if seaLvl level >= depth then end
638 if (Map.null groupsProxi) || (depth <= 0) || ((not . or) $ map (fst . snd) branches)
639 then branches
640 else
641 -- break all the possible branches at the current seaLvl level
642 let branches' = adaptativeBreakBranches fdt proxiConf depth elevation groupsProxi lambda frequency minBranch frame docs coocs periods
643 [] (head' "seaLevelMatching" branches) (tail' "seaLevelMatching" branches)
644 frequency' = reduceFrequency frequency (map fst branches')
645 groupsProxi' = reduceTupleMapByKeys (map (getGroupId) $ concat $ map (fst) $ filter (fst . snd) branches') groupsProxi
646 -- thr = toThreshold depth groupsProxi
647 in trace("\n " <> foldl (\acc _ -> acc <> "🌊 ") "" [0..(elevation - depth)]
648 <> " [✓ " <> show(length $ filter (fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (fst . snd) branches')
649 <> ")|✗ " <> show(length $ filter (not . fst . snd) branches') <> "(" <> show(length $ concat $ map (fst) $ filter (not . fst . snd) branches') <> ")]"
650 <> " thr = ")
651 $ adaptativeSeaLevelMatching fdt proxiConf (depth - 1) elevation groupsProxi' lambda minBranch frequency' frame periods docs coocs branches'
652
653
654 adaptativeTemporalMatching :: Double -> Phylo -> Phylo
655 adaptativeTemporalMatching elevation phylo = updatePhyloGroups 1
656 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
657 (toPhyloHorizon phylo)
658 where
659 -- 2) process the temporal matching by elevating seaLvl level
660 branches :: [[PhyloGroup]]
661 branches = map fst
662 $ adaptativeSeaLevelMatching (fromIntegral $ Vector.length $ getRoots phylo)
663 (phyloProximity $ getConfig phylo)
664 (elevation - 1)
665 elevation
666 (phylo ^. phylo_groupsProxi)
667 (_qua_granularity $ phyloQuality $ getConfig phylo)
668 (_qua_minBranch $ phyloQuality $ getConfig phylo)
669 (phylo ^. phylo_termFreq)
670 (getTimeFrame $ timeUnit $ getConfig phylo)
671 (getPeriodIds phylo)
672 (phylo ^. phylo_timeDocs)
673 (phylo ^. phylo_timeCooc)
674 groups
675 -- 1) for each group process an initial temporal Matching
676 -- here we suppose that all the groups of level 1 are part of the same big branch
677 groups :: [([PhyloGroup],(Bool,[Double]))]
678 groups = map (\b -> (b,((length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo),[thr])))
679 $ groupsToBranches' $ fromList $ map (\g -> (getGroupId g, g))
680 $ matchGroupsToGroups (getTimeFrame $ timeUnit $ getConfig phylo)
681 (getPeriodIds phylo) (phyloProximity $ getConfig phylo)
682 thr
683 (phylo ^. phylo_timeDocs)
684 (phylo ^. phylo_timeCooc)
685 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)
686 --------------------------------------
687 thr :: Double
688 thr = toThreshold elevation (phylo ^. phylo_groupsProxi)