2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.TemporalMatching where
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, any, nub, union)
19 import Data.Map (Map, fromList, toList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, member, (!))
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24 import Gargantext.Viz.Phylo.SynchronicClustering
26 import Control.Lens hiding (Level)
28 import qualified Data.Set as Set
36 -- | Process the inverse sumLog
37 sumInvLog :: Double -> [Double] -> Double
38 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41 -- | Process the sumLog
42 sumLog :: Double -> [Double] -> Double
43 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46 -- | To compute a jaccard similarity between two lists
47 jaccard :: [Int] -> [Int] -> Double
48 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51 -- | To process a WeighedLogJaccard distance between to coocurency matrix
52 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
53 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
54 | null ngramsInter = 0
55 | ngramsInter == ngramsUnion = 1
56 | sens == 0 = jaccard ngramsInter ngramsUnion
57 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
58 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
60 --------------------------------------
62 ngramsInter = intersect ngrams ngrams'
63 --------------------------------------
65 ngramsUnion = union ngrams ngrams'
66 --------------------------------------
68 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
69 --------------------------------------
71 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
72 --------------------------------------
75 -- | To choose a proximity function
76 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
77 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
78 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
82 -- | To process the proximity between a current group and a pair of targets group
83 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
84 toProximity docs proximity group target target' =
85 let docs' = sum $ elems docs
86 cooc = if target == target'
87 then (target ^. phylo_groupCooc)
88 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
89 ngrams = if target == target'
90 then (target ^. phylo_groupNgrams)
91 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
92 in pickProximity proximity docs' (group ^. phylo_groupCooc) cooc (group ^. phylo_groupNgrams) ngrams
95 ------------------------
96 -- | Local Matching | --
97 ------------------------
100 -- | Find pairs of valuable candidates to be matched
101 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> Map Date Double -> PhyloGroup -> [(PhyloGroup,PhyloGroup)]
102 makePairs candidates periods docs group = case null periods of
104 -- | at least on of the pair candidates should be from the last added period
105 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
106 || (inLastPeriod cdt' periods))
107 $ listToKeys candidates
109 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
110 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
113 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> PhyloGroup -> PhyloGroup
114 phyloGroupMatching candidates fil proxi docs group = case pointers of
115 Nothing -> addPointers group fil TemporalPointer []
116 Just pts -> addPointers group fil TemporalPointer
117 $ head' "phyloGroupMatching"
118 -- | Keep only the best set of pointers grouped by proximity
119 $ groupBy (\pt pt' -> snd pt == snd pt')
120 $ reverse $ sortOn snd pts
121 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
123 pointers :: Maybe [Pointer]
124 pointers = find (not . null)
125 -- | for each time frame, process the proximity on relevant pairs of targeted groups
126 $ scanl (\acc groups ->
127 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
128 pairs = makePairs (concat groups) periods docs group
131 -- | process the proximity between the current group and a pair of candidates
132 let proximity = toProximity (filterDocs docs periods) proxi group c c'
134 then [(getGroupId c,proximity)]
135 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
137 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
139 --------------------------------------
140 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
141 filterDocs d pds = restrictKeys d $ periodsToYears pds
150 -- ghostHunter :: [[PhyloGroup]] -> [[PhyloGroup]]
151 -- ghostHunter branches =
153 -- -- | il manque une référence au group source de chaque pointer
154 -- let pointers = elems $ fromList
155 -- $ map (\pt -> (groupIds ! (fst pt),pt))
156 -- $ filter (\pt -> member (fst pt) groupIds) $ concat $ map (\g -> g ^. phylo_groupGhostPointers) branch
161 -- groupIds :: Map PhyloGroupId Int
162 -- groupIds = fromList $ map (\g -> (getGroupId g, last' "ghostHunter" $ snd $ g ^. phylo_groupBranchId)) $ concat branches
163 -- --------------------------------------
164 -- selectBest :: [Pointers] -> [Pointers]
169 filterPointers :: Double -> [PhyloGroup] -> [PhyloGroup]
170 filterPointers thr groups =
172 let ghosts = filter (\(_,w) -> w < thr) $ group ^. phylo_groupPeriodParents
173 in group & phylo_groupPeriodParents %~ (filter (\(_,w) -> w >= thr))
174 & phylo_groupPeriodChilds %~ (filter (\(_,w) -> w >= thr))
175 & phylo_groupGhostPointers %~ (++ ghosts)
179 -----------------------------
180 -- | Adaptative Matching | --
181 -----------------------------
184 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
185 getNextPeriods fil max pId pIds =
187 ToChilds -> take max $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
188 ToParents -> take max $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
191 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
192 getCandidates fil g pIds targets =
195 ToParents -> reverse targets'
197 targets' :: [[PhyloGroup]]
198 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
199 $ filterWithKey (\k _ -> elem k pIds)
202 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
205 shouldBreak :: Double -> [(Double,[PhyloGroup])] -> Bool
206 shouldBreak thr branches = any (\(quality,_) -> quality < thr) branches
209 toBranchQuality :: [[PhyloGroup]] -> [(Double,[PhyloGroup])]
210 toBranchQuality branches = undefined
213 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
214 groupsToBranches groups =
215 -- | run the related component algorithm
216 let graph = zip [1..]
218 $ map (\group -> [getGroupId group]
219 ++ (map fst $ group ^. phylo_groupPeriodParents)
220 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
221 -- | update each group's branch id
222 in map (\(bId,ids) ->
223 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
224 $ elems $ restrictKeys groups (Set.fromList ids)
229 -- findGhostLinks :: [Link] -> [[Link]] -> Map PhyloGroupId
231 adaptativeMatching :: Proximity -> Double -> Double -> [PhyloGroup] -> [PhyloGroup]
232 adaptativeMatching proximity thr thrQua groups =
233 -- | check if we should break some of the new branches or not
234 case shouldBreak thrQua branches' of
235 True -> concat $ map (\(s,b) ->
237 -- | we keep the branch as it is
239 -- | we break the branch using an increased temporal matching threshold
240 else let nextGroups = undefined
241 in adaptativeMatching proximity (thr + (getThresholdStep proximity)) thrQua nextGroups
243 -- | the quality of all the new branches is sufficient
244 False -> concat branches
246 -- | 3) process a quality score for each new branch
247 branches' :: [(Double,[PhyloGroup])]
248 branches' = toBranchQuality branches
249 -- | 2) group the new groups into branches
250 branches :: [[PhyloGroup]]
251 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
252 -- | 1) filter the pointers of each groups regarding the current state of the quality threshold
253 groups' :: [PhyloGroup]
254 groups' = filterPointers thr groups
257 temporalMatching :: Phylo -> Phylo
258 temporalMatching phylo = updatePhyloGroups 1 branches phylo
260 -- | 4) find the ghost links and postprocess the branches
261 branches' :: Map PhyloGroupId PhyloGroup
262 branches' = undefined
263 -- | 3) run the adaptative matching to find the best repartition among branches
264 branches :: Map PhyloGroupId PhyloGroup
266 $ map (\g -> (getGroupId g, g))
267 $ adaptativeMatching proximity (getThresholdInit proximity) (phyloQuality $ getConfig phylo) groups'
268 -- | 2) for each group process an initial temporal Matching
269 groups' :: [PhyloGroup]
271 let maxTime = getTimeFrame $ timeUnit $ getConfig phylo
272 periods = getPeriodIds phylo
273 docs = phylo ^. phylo_timeDocs
274 --------------------------------------
276 let childs = getCandidates ToChilds group
277 (getNextPeriods ToChilds maxTime (group ^. phylo_groupPeriod) periods) groups
278 parents = getCandidates ToParents group
279 (getNextPeriods ToParents maxTime (group ^. phylo_groupPeriod) periods) groups
280 in phyloGroupMatching parents ToParents proximity docs
281 $ phyloGroupMatching childs ToChilds proximity docs group
283 -- | 1) start with all the groups from a given level
284 groups :: [PhyloGroup]
285 groups = getGroupsFromLevel 1 phylo
286 --------------------------------------
287 proximity :: Proximity
288 proximity = phyloProximity $ getConfig phylo