2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.TemporalMatching where
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union)
19 import Data.Map (Map, fromList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, findWithDefault)
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24 import Gargantext.Viz.Phylo.SynchronicClustering
26 import Prelude (logBase)
27 import Control.Lens hiding (Level)
29 import qualified Data.Set as Set
36 -- | Process the inverse sumLog
37 sumInvLog :: Double -> [Double] -> Double
38 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41 -- | Process the sumLog
42 sumLog :: Double -> [Double] -> Double
43 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46 -- | To compute a jaccard similarity between two lists
47 jaccard :: [Int] -> [Int] -> Double
48 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51 -- | To process a WeighedLogJaccard distance between to coocurency matrix
52 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
53 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
54 | null ngramsInter = 0
55 | ngramsInter == ngramsUnion = 1
56 | sens == 0 = jaccard ngramsInter ngramsUnion
57 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
58 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
60 --------------------------------------
62 ngramsInter = intersect ngrams ngrams'
63 --------------------------------------
65 ngramsUnion = union ngrams ngrams'
66 --------------------------------------
68 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
69 --------------------------------------
71 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
72 --------------------------------------
75 -- | To choose a proximity function
76 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
77 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
78 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
82 filterProximity :: Proximity -> Double -> Double -> Bool
83 filterProximity proximity thr local =
85 WeightedLogJaccard _ _ _ -> local >= thr
89 -- | To process the proximity between a current group and a pair of targets group
90 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
91 toProximity docs proximity ego target target' =
92 let docs' = sum $ elems docs
93 cooc = if target == target'
94 then (target ^. phylo_groupCooc)
95 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
96 ngrams = if target == target'
97 then (target ^. phylo_groupNgrams)
98 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
99 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
102 ------------------------
103 -- | Local Matching | --
104 ------------------------
107 -- | Find pairs of valuable candidates to be matched
108 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
109 makePairs candidates periods = case null periods of
111 -- | at least on of the pair candidates should be from the last added period
112 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
113 || (inLastPeriod cdt' periods))
114 $ listToKeys candidates
116 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
117 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
120 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double-> PhyloGroup -> PhyloGroup
121 phyloGroupMatching candidates fil proxi docs thr ego = case pointers of
122 Nothing -> addPointers ego fil TemporalPointer []
123 Just pts -> addPointers ego fil TemporalPointer
124 $ head' "phyloGroupMatching"
125 -- | Keep only the best set of pointers grouped by proximity
126 $ groupBy (\pt pt' -> snd pt == snd pt')
127 $ reverse $ sortOn snd pts
128 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
130 pointers :: Maybe [Pointer]
131 pointers = find (not . null)
132 -- | for each time frame, process the proximity on relevant pairs of targeted groups
133 $ scanl (\acc groups ->
134 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
135 pairs = makePairs (concat groups) periods
136 in acc ++ ( filter (\(_,proximity) -> filterProximity proxi thr proximity)
139 -- | process the proximity between the current group and a pair of candidates
140 let proximity = toProximity (filterDocs docs periods) proxi ego c c'
142 then [(getGroupId c,proximity)]
143 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
145 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
147 --------------------------------------
148 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
149 filterDocs d pds = restrictKeys d $ periodsToYears pds
152 -----------------------------
153 -- | Matching Processing | --
154 -----------------------------
157 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
158 getNextPeriods fil max' pId pIds =
160 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
161 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
164 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
165 getCandidates fil ego pIds targets =
168 ToParents -> reverse targets'
170 targets' :: [[PhyloGroup]]
171 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
172 $ filterWithKey (\k _ -> elem k pIds)
175 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
178 processMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
179 processMatching max' periods proximity thr docs groups =
181 let childs = getCandidates ToChilds group
182 (getNextPeriods ToChilds max' (group ^. phylo_groupPeriod) periods) groups
183 parents = getCandidates ToParents group
184 (getNextPeriods ToParents max' (group ^. phylo_groupPeriod) periods) groups
185 in phyloGroupMatching parents ToParents proximity docs thr
186 $ phyloGroupMatching childs ToChilds proximity docs thr group
190 -----------------------
191 -- | Phylo Quality | --
192 -----------------------
195 termFreq :: Int -> [[PhyloGroup]] -> Double
196 termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
197 / (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
200 entropy :: [[PhyloGroup]] -> Double
202 let terms = ngramsInBranches branches
203 in sum $ map (\term -> (1 / log (termFreq term branches))
204 / (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
205 * (sum $ map (\branch ->
206 let q = branchObs term (length $ concat branches) branch
207 in q * logBase 2 q ) branches) ) terms
209 -- | Probability to observe a branch given a random term of the phylo
210 branchObs :: Int -> Int -> [PhyloGroup] -> Double
211 branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
212 / (fromIntegral total)
215 homogeneity :: [[PhyloGroup]] -> Double
216 homogeneity _ = undefined
218 -- branchCov :: [PhyloGroup] -> Int -> Double
219 -- branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
222 toPhyloQuality :: [[PhyloGroup]] -> Double
223 toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
226 -----------------------------
227 -- | Adaptative Matching | --
228 -----------------------------
231 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
232 groupsToBranches groups =
233 -- | run the related component algorithm
234 let graph = zip [1..]
236 $ map (\group -> [getGroupId group]
237 ++ (map fst $ group ^. phylo_groupPeriodParents)
238 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
239 -- | update each group's branch id
240 in map (\(bId,ids) ->
241 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
242 $ elems $ restrictKeys groups (Set.fromList ids)
246 recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [PhyloGroup] -> [PhyloGroup]
247 recursiveMatching proximity thr max' periods docs quality groups =
248 case quality < quality' of
249 -- | success : we localy improve the quality of the branch, let's go deeper
252 recursiveMatching proximity (thr + (getThresholdStep proximity)) max' periods docs quality' branch
254 -- | failure : last step was the local maximum, let's validate it
257 -- | 3) process a quality score on the local set of branches
259 quality' = toPhyloQuality branches
260 -- | 2) group the new groups into branches
261 branches :: [[PhyloGroup]]
262 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
263 -- | 1) process a temporal matching for each group
264 groups' :: [PhyloGroup]
265 groups' = processMatching max' periods proximity thr docs groups
268 temporalMatching :: Phylo -> Phylo
269 temporalMatching phylo = updatePhyloGroups 1 branches phylo
271 -- | 2) run the recursive matching to find the best repartition among branches
272 branches :: Map PhyloGroupId PhyloGroup
274 $ map (\g -> (getGroupId g, g))
275 $ recursiveMatching (phyloProximity $ getConfig phylo)
276 (getThresholdInit $ phyloProximity $ getConfig phylo)
277 (getTimeFrame $ timeUnit $ getConfig phylo)
279 (phylo ^. phylo_timeDocs) (toPhyloQuality [groups']) groups'
280 -- | 1) for each group process an initial temporal Matching
281 groups' :: [PhyloGroup]
282 groups' = processMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
283 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
284 (phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo)