2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.TemporalMatching where
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union, elemIndex, (!!))
19 import Data.Map (Map, fromList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, findWithDefault)
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24 import Gargantext.Viz.Phylo.SynchronicClustering
26 import Debug.Trace (trace)
27 import Prelude (logBase)
28 import Control.Lens hiding (Level)
30 import qualified Data.Set as Set
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
62 --------------------------------------
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
70 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
71 --------------------------------------
73 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
74 --------------------------------------
77 -- | To choose a proximity function
78 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
79 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
80 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
84 filterProximity :: Proximity -> Double -> Double -> Bool
85 filterProximity proximity thr local =
87 WeightedLogJaccard _ _ _ -> local >= thr
91 -- | To process the proximity between a current group and a pair of targets group
92 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
93 toProximity docs proximity ego target target' =
94 let docs' = sum $ elems docs
95 cooc = if target == target'
96 then (target ^. phylo_groupCooc)
97 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
98 ngrams = if target == target'
99 then (target ^. phylo_groupNgrams)
100 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
101 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
104 ------------------------
105 -- | Local Matching | --
106 ------------------------
109 -- | Find pairs of valuable candidates to be matched
110 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
111 makePairs candidates periods = case null periods of
113 -- | at least on of the pair candidates should be from the last added period
114 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
115 || (inLastPeriod cdt' periods))
116 $ listToKeys candidates
118 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
119 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
122 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double-> PhyloGroup -> PhyloGroup
123 phyloGroupMatching candidates fil proxi docs thr ego = case pointers of
124 Nothing -> addPointers ego fil TemporalPointer []
125 Just pts -> addPointers ego fil TemporalPointer
126 $ head' "phyloGroupMatching"
127 -- | Keep only the best set of pointers grouped by proximity
128 $ groupBy (\pt pt' -> snd pt == snd pt')
129 $ reverse $ sortOn snd pts
130 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
132 pointers :: Maybe [Pointer]
133 pointers = find (not . null)
134 -- | for each time frame, process the proximity on relevant pairs of targeted groups
135 $ scanl (\acc groups ->
136 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
137 pairs = makePairs (concat groups) periods
138 in acc ++ ( filter (\(_,proximity) -> filterProximity proxi thr proximity)
141 -- | process the proximity between the current group and a pair of candidates
142 let proximity = toProximity (filterDocs docs periods) proxi ego c c'
144 then [(getGroupId c,proximity)]
145 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
147 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
149 --------------------------------------
150 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
151 filterDocs d pds = restrictKeys d $ periodsToYears pds
154 -----------------------------
155 -- | Matching Processing | --
156 -----------------------------
159 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
160 getNextPeriods fil max' pId pIds =
162 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
163 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
166 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
167 getCandidates fil ego pIds targets =
170 ToParents -> reverse targets'
172 targets' :: [[PhyloGroup]]
173 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
174 $ filterWithKey (\k _ -> elem k pIds)
177 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
180 processMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
181 processMatching max' periods proximity thr docs groups =
183 let childs = getCandidates ToChilds group
184 (getNextPeriods ToChilds max' (group ^. phylo_groupPeriod) periods) groups
185 parents = getCandidates ToParents group
186 (getNextPeriods ToParents max' (group ^. phylo_groupPeriod) periods) groups
187 in phyloGroupMatching parents ToParents proximity docs thr
188 $ phyloGroupMatching childs ToChilds proximity docs thr group
192 -----------------------
193 -- | Phylo Quality | --
194 -----------------------
197 termFreq :: Int -> [[PhyloGroup]] -> Double
198 termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
199 / (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
202 entropy :: [[PhyloGroup]] -> Double
204 let terms = ngramsInBranches branches
205 in sum $ map (\term -> (1 / log (termFreq term branches))
206 / (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
207 * (sum $ map (\branch ->
208 let q = branchObs term (length $ concat branches) branch
211 else - q * logBase 2 q ) branches) ) terms
213 -- | Probability to observe a branch given a random term of the phylo
214 branchObs :: Int -> Int -> [PhyloGroup] -> Double
215 branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
216 / (fromIntegral total)
219 homogeneity :: [[PhyloGroup]] -> Double
220 homogeneity branches =
221 let nbGroups = length $ concat branches
223 $ map (\branch -> (if (length branch == nbGroups)
225 else (1 / log (branchCov branch nbGroups))
226 / (sum $ map (\branch' -> 1 / log (branchCov branch' nbGroups)) branches))
227 * (sum $ map (\term -> (termFreq term branches)
228 / (sum $ map (\term' -> termFreq term' branches) $ ngramsInBranches [branch])
229 * (fromIntegral $ sum $ ngramsInBranches [filter (\g -> elem term $ g ^. phylo_groupNgrams) branch])
230 / (fromIntegral $ sum $ ngramsInBranches [branch])
231 ) $ ngramsInBranches [branch]) ) branches
233 branchCov :: [PhyloGroup] -> Int -> Double
234 branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
237 toPhyloQuality :: [[PhyloGroup]] -> Double
238 toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
241 -----------------------------
242 -- | Adaptative Matching | --
243 -----------------------------
246 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
247 groupsToBranches groups =
248 -- | run the related component algorithm
249 let graph = zip [1..]
251 $ map (\group -> [getGroupId group]
252 ++ (map fst $ group ^. phylo_groupPeriodParents)
253 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
254 -- | update each group's branch id
255 in map (\(bId,ids) ->
256 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
257 $ elems $ restrictKeys groups (Set.fromList ids)
261 recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
262 recursiveMatching proximity thr frame periods docs quality branches =
263 if (length branches == (length $ concat branches))
264 then concat $ traceMatchNoSplit branches
266 then concat $ traceMatchLimit branches
268 case quality <= (sum nextQualities) of
269 -- | success : the new threshold improves the quality score, let's go deeper
272 let idx = fromJust $ elemIndex branches' nextBranches
273 in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
274 $ traceMatchSuccess thr quality (sum nextQualities) nextBranches
275 -- | failure : last step was a local maximum of quality, let's validate it
276 False -> concat $ traceMatchFailure thr quality (sum nextQualities) branches
278 -- | 2) for each of the possible next branches process the phyloQuality score
279 nextQualities :: [Double]
280 nextQualities = map toPhyloQuality nextBranches
281 -- | 1) for each local branch process a temporal matching then find the resulting branches
282 nextBranches :: [[[PhyloGroup]]]
283 nextBranches = map (\branch ->
284 let branch' = processMatching frame periods proximity thr docs branch
285 in groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) branch'
289 temporalMatching :: Phylo -> Phylo
290 temporalMatching phylo = updatePhyloGroups 1 branches' phylo
292 -- | 4) run the recursive matching to find the best repartition among branches
293 branches' :: Map PhyloGroupId PhyloGroup
295 $ map (\g -> (getGroupId g, g))
297 $ recursiveMatching (phyloProximity $ getConfig phylo)
298 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
299 + (getThresholdStep $ phyloProximity $ getConfig phylo))
300 (getTimeFrame $ timeUnit $ getConfig phylo)
302 (phylo ^. phylo_timeDocs) quality branches
303 -- | 3) process the quality score
305 quality = toPhyloQuality branches
306 -- | 2) group into branches
307 branches :: [[PhyloGroup]]
308 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group))
309 $ trace ("\n" <> "-- | Init temporal matching for " <> show (length $ groups') <> " groups" <> "\n") groups'
310 -- | 1) for each group process an initial temporal Matching
311 groups' :: [PhyloGroup]
312 groups' = processMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
313 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
314 (phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo)