2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.TemporalMatching where
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, elemIndex, (!!), dropWhile, partition)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, filterWithKey)
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
25 import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 -- import Debug.Trace (trace)
30 import qualified Data.Set as Set
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
62 --------------------------------------
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 --------------------------------------
73 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
74 --------------------------------------
77 -- | To choose a proximity function
78 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
79 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
80 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
84 -- | To process the proximity between a current group and a pair of targets group
85 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
86 toProximity docs proximity ego target target' =
87 let docs' = sum $ elems docs
88 cooc = if target == target'
89 then (target ^. phylo_groupCooc)
90 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
91 ngrams = if target == target'
92 then (target ^. phylo_groupNgrams)
93 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
94 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
97 ------------------------
98 -- | Local Matching | --
99 ------------------------
102 -- | Find pairs of valuable candidates to be matched
103 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
104 makePairs candidates periods periods' = case null periods of
106 -- | at least on of the pair candidates should be from the last added period
107 False -> filter (\(cdt,cdt') ->
108 ((inLastPeriod cdt periods) || (inLastPeriod cdt' periods))
109 && (not $ inOldPeriods cdt periods')
110 && (not $ inOldPeriods cdt' periods'))
111 $ listToKeys candidates
113 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
114 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
115 --------------------------------------
116 inOldPeriods :: PhyloGroup -> [PhyloPeriodId] -> Bool
117 inOldPeriods g prds = elem (g ^. phylo_groupPeriod) prds
120 keepOldOnes :: Filiation -> Proximity -> Double -> PhyloGroup -> Bool
121 keepOldOnes fil proxi thr ego = any (\(_,w) -> filterProximity proxi thr w)
122 $ getPeriodPointers fil ego
124 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
125 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
128 findLastPeriod :: Filiation -> [Pointer] -> PhyloPeriodId
129 findLastPeriod fil pts = case fil of
130 ToParents -> head' "findLastPeriod" $ sortOn fst $ map (fst . fst . fst) pts
131 ToChilds -> head' "findLastPeriod" $ reverse $ sortOn fst $ map (fst . fst . fst) pts
135 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
136 phyloGroupMatching candidates fil proxi docs thr ego =
137 if keepOldOnes fil proxi thr ego
138 -- | keep some of the old pointers
139 then addPointers ego fil TemporalPointer
140 $ filterPointers proxi thr
141 $ getPeriodPointers fil ego
142 else case null pointers of
143 -- | let's find new pointers
144 True -> addPointers ego fil TemporalPointer []
145 False -> addPointers ego fil TemporalPointer
146 $ head' "phyloGroupMatching"
147 -- | Keep only the best set of pointers grouped by proximity
148 $ groupBy (\pt pt' -> snd pt == snd pt')
149 $ reverse $ sortOn snd $ head' "pointers" pointers
150 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
152 --------------------------------------
153 oldPeriods :: [PhyloPeriodId] -> [PhyloPeriodId]
155 if (null $ getPeriodPointers fil ego)
158 let period = findLastPeriod fil $ getPeriodPointers fil ego
159 in fst $ partition (\prd -> case fil of
160 ToChilds -> prd <= period
161 ToParents -> prd >= period ) periods
162 --------------------------------------
163 pointers :: [[Pointer]]
166 -- | for each time frame, process the proximity on relevant pairs of targeted groups
167 $ scanl (\acc groups ->
169 $ concat $ map (\gs -> if null gs
171 else [_phylo_groupPeriod $ head' "pointers" gs]) groups
172 periods' = oldPeriods periods
173 pairs = makePairs (concat groups) periods periods'
174 in acc ++ ( filterPointers proxi thr
177 -- | process the proximity between the current group and a pair of candidates
178 let proximity = toProximity (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods)) proxi ego c c'
180 then [(getGroupId c,proximity)]
181 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
183 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
187 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
188 filterDocs d pds = restrictKeys d $ periodsToYears pds
191 -----------------------------
192 -- | Matching Processing | --
193 -----------------------------
196 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
197 getNextPeriods fil max' pId pIds =
199 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
200 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
203 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
204 getCandidates fil ego targets =
207 ToParents -> reverse targets'
209 targets' :: [[PhyloGroup]]
212 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
216 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
217 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
218 $ matchByPeriods ToParents
219 $ groupByField _phylo_groupPeriod
220 $ matchByPeriods ToChilds
221 $ groupByField _phylo_groupPeriod branch
223 --------------------------------------
224 matchByPeriods :: Filiation -> Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
225 matchByPeriods fil branch' = foldl' (\acc prd ->
226 let periods' = getNextPeriods fil frame prd periods
227 candidates = map (\prd' -> findWithDefault [] prd' branch') periods'
228 docs' = filterDocs docs ([prd] ++ periods')
229 egos = map (\g -> phyloGroupMatching (getCandidates fil g candidates) fil proximity docs' thr g)
230 $ findWithDefault [] prd branch'
231 egos' = egos `using` parList rdeepseq
232 in acc ++ egos' ) [] periods
235 -----------------------
236 -- | Phylo Quality | --
237 -----------------------
240 termFreq :: Int -> [[PhyloGroup]] -> Double
241 termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
242 / (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
245 entropy :: [[PhyloGroup]] -> Double
247 let terms = ngramsInBranches branches
248 in sum $ map (\term -> (1 / log (termFreq term branches))
249 / (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
250 * (sum $ map (\branch ->
251 let q = branchObs term (length $ concat branches) branch
254 else - q * logBase 2 q ) branches) ) terms
256 -- | Probability to observe a branch given a random term of the phylo
257 branchObs :: Int -> Int -> [PhyloGroup] -> Double
258 branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
259 / (fromIntegral total)
262 homogeneity :: [[PhyloGroup]] -> Double
263 homogeneity branches =
264 let nbGroups = length $ concat branches
266 $ map (\branch -> (if (length branch == nbGroups)
268 else (1 / log (branchCov branch nbGroups))
269 / (sum $ map (\branch' -> 1 / log (branchCov branch' nbGroups)) branches))
270 * (sum $ map (\term -> (termFreq term branches)
271 / (sum $ map (\term' -> termFreq term' branches) $ ngramsInBranches [branch])
272 * (fromIntegral $ sum $ ngramsInBranches [filter (\g -> elem term $ g ^. phylo_groupNgrams) branch])
273 / (fromIntegral $ sum $ ngramsInBranches [branch])
274 ) $ ngramsInBranches [branch]) ) branches
276 branchCov :: [PhyloGroup] -> Int -> Double
277 branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
280 toPhyloQuality :: [[PhyloGroup]] -> Double
281 toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
284 -----------------------------
285 -- | Adaptative Matching | --
286 -----------------------------
289 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
290 groupsToBranches groups =
291 -- | run the related component algorithm
292 let graph = zip [1..]
294 $ map (\group -> [getGroupId group]
295 ++ (map fst $ group ^. phylo_groupPeriodParents)
296 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
297 -- | update each group's branch id
298 in map (\(bId,ids) ->
299 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
300 $ elems $ restrictKeys groups (Set.fromList ids)) graph
303 recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
304 recursiveMatching proximity thr frame periods docs quality branches =
305 if (length branches == (length $ concat branches))
310 case quality <= (sum nextQualities) of
311 -- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities))
314 let idx = fromJust $ elemIndex branches' nextBranches
315 in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
317 -- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
318 False -> concat branches
320 -- | 2) for each of the possible next branches process the phyloQuality score
321 nextQualities :: [Double]
322 nextQualities = map toPhyloQuality nextBranches
323 -- | 1) for each local branch process a temporal matching then find the resulting branches
324 nextBranches :: [[[PhyloGroup]]]
326 let branches' = map (\branch -> phyloBranchMatching frame periods proximity thr docs branch) branches
327 clusters = map (\branch -> groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) branch) branches'
328 clusters' = clusters `using` parList rdeepseq
333 temporalMatching :: Phylo -> Phylo
334 temporalMatching phylo = updatePhyloGroups 1 branches' phylo
336 -- | 4) run the recursive matching to find the best repartition among branches
337 branches' :: Map PhyloGroupId PhyloGroup
339 $ map (\g -> (getGroupId g, g))
341 $ recursiveMatching (phyloProximity $ getConfig phylo)
342 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
343 + (getThresholdStep $ phyloProximity $ getConfig phylo))
344 (getTimeFrame $ timeUnit $ getConfig phylo)
346 (phylo ^. phylo_timeDocs) quality branches
347 -- | 3) process the quality score
349 quality = toPhyloQuality branches
350 -- | 2) group into branches
351 branches :: [[PhyloGroup]]
352 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
353 -- | 1) for each group process an initial temporal Matching
354 groups' :: [PhyloGroup]
355 groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
356 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
357 (phylo ^. phylo_timeDocs)
358 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)