2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.TemporalMatching where
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, elemIndex, (!!), dropWhile)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, filterWithKey, keys, (!))
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
25 -- import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 -- import Debug.Trace (trace)
30 import qualified Data.Set as Set
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
62 --------------------------------------
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 --------------------------------------
73 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
74 --------------------------------------
77 -- | To choose a proximity function
78 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
79 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
80 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
84 -- | To process the proximity between a current group and a pair of targets group
85 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
86 toProximity docs proximity ego target target' =
87 let docs' = sum $ elems docs
88 cooc = if target == target'
89 then (target ^. phylo_groupCooc)
90 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
91 ngrams = if target == target'
92 then (target ^. phylo_groupNgrams)
93 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
94 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
97 ------------------------
98 -- | Local Matching | --
99 ------------------------
101 toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
102 toLastPeriod fil periods = case fil of
103 ToParents -> head' "toLastPeriod" (sortOn fst periods)
104 ToChilds -> last' "toLastPeriod" (sortOn fst periods)
107 toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
108 toLazyPairs pointers fil thr prox prd pairs =
109 if null pointers then pairs
110 else let rest = filterPointers prox thr pointers
112 then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
115 else filter (\(g,g') ->
117 ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
118 || ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
119 ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
120 || ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
124 -- | Find pairs of valuable candidates to be matched
125 makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
126 makePairs' ego candidates periods pointers fil thr prox docs =
129 False -> toLazyPairs pointers fil thr prox lastPrd
130 -- | at least on of the pair candidates should be from the last added period
131 $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
132 || ((g' ^. phylo_groupPeriod) == lastPrd))
134 $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
135 || ((toProximity docs prox ego ego g) >= thr)) candidates
137 lastPrd :: PhyloPeriodId
138 lastPrd = toLastPeriod fil periods
141 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
142 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
145 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
146 phyloGroupMatching candidates fil proxi docs thr ego =
147 case null nextPointers of
148 -- | let's find new pointers
149 True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
150 then addPointers ego fil TemporalPointer []
151 -- | or keep the old ones
152 else addPointers ego fil TemporalPointer
153 $ filterPointers proxi thr $ getPeriodPointers fil ego
154 False -> addPointers ego fil TemporalPointer
155 $ head' "phyloGroupMatching"
156 -- | Keep only the best set of pointers grouped by proximity
157 $ groupBy (\pt pt' -> snd pt == snd pt')
158 $ reverse $ sortOn snd $ head' "pointers"
160 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
162 nextPointers :: [[Pointer]]
163 nextPointers = take 1
165 -- | for each time frame, process the proximity on relevant pairs of targeted groups
166 $ scanl (\acc groups ->
167 let periods = nub $ map _phylo_groupPeriod $ concat groups
168 docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
169 pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
170 in acc ++ ( filterPointers proxi thr
173 -- | process the proximity between the current group and a pair of candidates
174 let proximity = toProximity docs' proxi ego c c'
176 then [(getGroupId c,proximity)]
177 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
178 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
181 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
182 filterDocs d pds = restrictKeys d $ periodsToYears pds
185 -----------------------------
186 -- | Matching Processing | --
187 -----------------------------
190 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
191 getNextPeriods fil max' pId pIds =
193 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
194 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
197 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
198 getCandidates fil ego targets =
201 ToParents -> reverse targets'
203 targets' :: [[PhyloGroup]]
206 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
210 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
211 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
212 -- $ matchByPeriods ToParents
213 -- $ groupByField _phylo_groupPeriod
215 $ groupByField _phylo_groupPeriod branch
217 --------------------------------------
218 matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
219 matchByPeriods branch' = foldl' (\acc prd ->
220 let periodsPar = getNextPeriods ToParents frame prd periods
221 periodsChi = getNextPeriods ToChilds frame prd periods
222 candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
223 candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
224 docsPar = filterDocs docs ([prd] ++ periodsPar)
225 docsChi = filterDocs docs ([prd] ++ periodsChi)
226 egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
227 $ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
228 $ findWithDefault [] prd branch'
229 egos' = egos `using` parList rdeepseq
230 in acc ++ egos' ) [] periods
233 -----------------------
234 -- | Phylo Quality | --
235 -----------------------
238 count :: Eq a => a -> [a] -> Int
239 count x = length . filter (== x)
241 termFreq' :: Int -> [PhyloGroup] -> Double
242 termFreq' term groups =
243 let ngrams = concat $ map _phylo_groupNgrams groups
244 in (fromIntegral $ count term ngrams)
245 / (fromIntegral $ length ngrams)
247 relevantBranches :: Int -> Int -> [[PhyloGroup]] -> [[PhyloGroup]]
248 relevantBranches term thr branches =
249 filter (\groups -> (length groups >= thr)
250 && (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
252 branchCov' :: [PhyloGroup] -> [[PhyloGroup]] -> Double
253 branchCov' branch branches =
254 (fromIntegral $ length branch) / (fromIntegral $ length $ concat branches)
257 toRecall :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
258 toRecall freq term thr branches =
259 -- | given a random term in a phylo
261 -- | for each relevant branches
262 * (sum $ map (\branch ->
263 -- | given its local coverage
264 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
265 -- | compute the local recall
266 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
267 / (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) $ concat branches'))) branches')
269 branches' :: [[PhyloGroup]]
270 branches' = relevantBranches term thr branches
273 toAccuracy :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
274 toAccuracy freq term thr branches =
275 -- | given a random term in a phylo
277 -- | for each relevant branches
278 * (sum $ map (\branch ->
279 -- | given its local coverage
280 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
281 -- | compute the local accuracy
282 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
283 / (fromIntegral $ length branch))) branches')
285 branches' :: [[PhyloGroup]]
286 branches' = relevantBranches term thr branches
289 toPhyloQuality' :: Quality -> Map Int Double -> [[PhyloGroup]] -> Double
290 toPhyloQuality' quality frequency branches =
291 if (foldl' (\acc b -> acc && (length b < (quality ^. qua_minBranch))) True branches)
292 -- | the local phylo is composed of small branches
295 let relevance = quality ^. qua_relevance
296 -- | compute the F score for a given relevance
297 in ((1 + relevance ** 2) * accuracy * recall)
298 / (((relevance ** 2) * accuracy + recall))
301 terms = keys frequency
302 -- | for each term compute the global accuracy
304 accuracy = sum $ map (\term -> toAccuracy (frequency ! term) term (quality ^. qua_minBranch) branches) terms
305 -- | for each term compute the global recall
307 recall = sum $ map (\term -> toRecall (frequency ! term) term (quality ^. qua_minBranch) branches) terms
312 -----------------------------
313 -- | Adaptative Matching | --
314 -----------------------------
317 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
318 groupsToBranches groups =
319 -- | run the related component algorithm
320 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
321 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
322 $ map (\group -> [getGroupId group]
323 ++ (map fst $ group ^. phylo_groupPeriodParents)
324 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
325 -- | first find the related components by inside each ego's period
326 graph' = map relatedComponents egos
327 -- | then run it for the all the periods
329 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
330 -- | update each group's branch id
331 in map (\(bId,ids) ->
332 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
333 $ elems $ restrictKeys groups (Set.fromList ids)
334 in groups' `using` parList rdeepseq ) graph
337 recursiveMatching :: Proximity -> Quality -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
338 recursiveMatching proximity qua freq thr frame periods docs quality branches =
339 if (length branches == (length $ concat branches))
344 -- trace (show(quality) <> " (vs) sum of " <> show(nextQualities))
345 case quality <= (sum nextQualities) of
346 -- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities))
349 let idx = fromJust $ elemIndex branches' nextBranches
350 in recursiveMatching proximity qua freq (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
352 -- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
353 False -> concat branches
355 -- | 2) for each of the possible next branches process the phyloQuality score
356 nextQualities :: [Double]
357 nextQualities = map (\nextBranch -> toPhyloQuality' qua freq nextBranch) nextBranches
358 -- | 1) for each local branch process a temporal matching then find the resulting branches
359 nextBranches :: [[[PhyloGroup]]]
361 let branches' = map (\branch -> phyloBranchMatching frame periods proximity thr docs branch) branches
362 clusters = map (\branch -> groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) branch) branches'
363 clusters' = clusters `using` parList rdeepseq
368 temporalMatching :: Phylo -> Phylo
369 temporalMatching phylo = updatePhyloGroups 1 branches' phylo
371 -- | 4) run the recursive matching to find the best repartition among branches
372 branches' :: Map PhyloGroupId PhyloGroup
374 $ map (\g -> (getGroupId g, g))
376 $ recursiveMatching (phyloProximity $ getConfig phylo)
377 (phyloQuality $ getConfig phylo)
379 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
380 + (getThresholdStep $ phyloProximity $ getConfig phylo))
381 (getTimeFrame $ timeUnit $ getConfig phylo)
383 (phylo ^. phylo_timeDocs) quality branches
384 -- | 3) process the quality score
386 quality = toPhyloQuality' (phyloQuality $ getConfig phylo) frequency branches
387 -- | 3) process the constants of the quality score
388 frequency :: Map Int Double
390 let terms = ngramsInBranches branches
391 in fromList $ map (\t -> (t, ((termFreq' t $ concat branches) / (sum $ map (\t' -> termFreq' t' $ concat branches) terms)))) terms
392 -- | 2) group into branches
393 branches :: [[PhyloGroup]]
394 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
395 -- | 1) for each group process an initial temporal Matching
396 groups' :: [PhyloGroup]
397 groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
398 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
399 (phylo ^. phylo_timeDocs)
400 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)