2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.TemporalMatching where
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union)
19 import Data.Map (Map, fromList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, findWithDefault)
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24 import Gargantext.Viz.Phylo.SynchronicClustering
26 import Prelude (logBase)
27 import Control.Lens hiding (Level)
29 import qualified Data.Set as Set
36 -- | Process the inverse sumLog
37 sumInvLog :: Double -> [Double] -> Double
38 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41 -- | Process the sumLog
42 sumLog :: Double -> [Double] -> Double
43 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46 -- | To compute a jaccard similarity between two lists
47 jaccard :: [Int] -> [Int] -> Double
48 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51 -- | To process a WeighedLogJaccard distance between to coocurency matrix
52 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
53 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
54 | null ngramsInter = 0
55 | ngramsInter == ngramsUnion = 1
56 | sens == 0 = jaccard ngramsInter ngramsUnion
57 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
58 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
60 --------------------------------------
62 ngramsInter = intersect ngrams ngrams'
63 --------------------------------------
65 ngramsUnion = union ngrams ngrams'
66 --------------------------------------
68 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
69 --------------------------------------
71 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
72 --------------------------------------
75 -- | To choose a proximity function
76 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
77 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
78 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
82 filterProximity :: Proximity -> Double -> Double -> Bool
83 filterProximity proximity thr local =
85 WeightedLogJaccard _ _ _ -> local >= thr
89 -- | To process the proximity between a current group and a pair of targets group
90 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
91 toProximity docs proximity ego target target' =
92 let docs' = sum $ elems docs
93 cooc = if target == target'
94 then (target ^. phylo_groupCooc)
95 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
96 ngrams = if target == target'
97 then (target ^. phylo_groupNgrams)
98 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
99 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
102 ------------------------
103 -- | Local Matching | --
104 ------------------------
107 -- | Find pairs of valuable candidates to be matched
108 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
109 makePairs candidates periods = case null periods of
111 -- | at least on of the pair candidates should be from the last added period
112 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
113 || (inLastPeriod cdt' periods))
114 $ listToKeys candidates
116 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
117 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
120 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double-> PhyloGroup -> PhyloGroup
121 phyloGroupMatching candidates fil proxi docs thr ego = case pointers of
122 Nothing -> addPointers ego fil TemporalPointer []
123 Just pts -> addPointers ego fil TemporalPointer
124 $ head' "phyloGroupMatching"
125 -- | Keep only the best set of pointers grouped by proximity
126 $ groupBy (\pt pt' -> snd pt == snd pt')
127 $ reverse $ sortOn snd pts
128 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
130 pointers :: Maybe [Pointer]
131 pointers = find (not . null)
132 -- | for each time frame, process the proximity on relevant pairs of targeted groups
133 $ scanl (\acc groups ->
134 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
135 pairs = makePairs (concat groups) periods
136 in acc ++ ( filter (\(_,proximity) -> filterProximity proxi thr proximity)
139 -- | process the proximity between the current group and a pair of candidates
140 let proximity = toProximity (filterDocs docs periods) proxi ego c c'
142 then [(getGroupId c,proximity)]
143 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
145 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
147 --------------------------------------
148 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
149 filterDocs d pds = restrictKeys d $ periodsToYears pds
152 -----------------------------
153 -- | Matching Processing | --
154 -----------------------------
157 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
158 getNextPeriods fil max' pId pIds =
160 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
161 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
164 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
165 getCandidates fil ego pIds targets =
168 ToParents -> reverse targets'
170 targets' :: [[PhyloGroup]]
171 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
172 $ filterWithKey (\k _ -> elem k pIds)
175 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
178 processMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
179 processMatching max' periods proximity thr docs groups =
181 let childs = getCandidates ToChilds group
182 (getNextPeriods ToChilds max' (group ^. phylo_groupPeriod) periods) groups
183 parents = getCandidates ToParents group
184 (getNextPeriods ToParents max' (group ^. phylo_groupPeriod) periods) groups
185 in phyloGroupMatching parents ToParents proximity docs thr
186 $ phyloGroupMatching childs ToChilds proximity docs thr group
190 -----------------------
191 -- | Phylo Quality | --
192 -----------------------
195 termFreq :: Int -> [[PhyloGroup]] -> Double
196 termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
197 / (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
200 entropy :: [[PhyloGroup]] -> Double
202 let terms = ngramsInBranches branches
203 in sum $ map (\term -> (1 / log (termFreq term branches))
204 / (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
205 * (sum $ map (\branch ->
206 let q = branchObs term (length $ concat branches) branch
207 in q * logBase 2 q ) branches) ) terms
209 -- | Probability to observe a branch given a random term of the phylo
210 branchObs :: Int -> Int -> [PhyloGroup] -> Double
211 branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
212 / (fromIntegral total)
215 homogeneity :: [[PhyloGroup]] -> Double
216 homogeneity branches = undefined
221 toPhyloQuality :: [[PhyloGroup]] -> Double
222 toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
225 -----------------------------
226 -- | Adaptative Matching | --
227 -----------------------------
230 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
231 groupsToBranches groups =
232 -- | run the related component algorithm
233 let graph = zip [1..]
235 $ map (\group -> [getGroupId group]
236 ++ (map fst $ group ^. phylo_groupPeriodParents)
237 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
238 -- | update each group's branch id
239 in map (\(bId,ids) ->
240 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
241 $ elems $ restrictKeys groups (Set.fromList ids)
245 recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [PhyloGroup] -> [PhyloGroup]
246 recursiveMatching proximity thr max' periods docs quality groups =
247 case quality < quality' of
248 -- | success : we localy improve the quality of the branch, let's go deeper
251 recursiveMatching proximity (thr + (getThresholdStep proximity)) max' periods docs quality' branch
253 -- | failure : last step was the local maximum, let's validate it
256 -- | 3) process a quality score on the local set of branches
258 quality' = toPhyloQuality branches
259 -- | 2) group the new groups into branches
260 branches :: [[PhyloGroup]]
261 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
262 -- | 1) process a temporal matching for each group
263 groups' :: [PhyloGroup]
264 groups' = processMatching max' periods proximity thr docs groups
267 temporalMatching :: Phylo -> Phylo
268 temporalMatching phylo = updatePhyloGroups 1 branches phylo
270 -- | 2) run the recursive matching to find the best repartition among branches
271 branches :: Map PhyloGroupId PhyloGroup
273 $ map (\g -> (getGroupId g, g))
274 $ recursiveMatching (phyloProximity $ getConfig phylo)
275 (getThresholdInit $ phyloProximity $ getConfig phylo)
276 (getTimeFrame $ timeUnit $ getConfig phylo)
278 (phylo ^. phylo_timeDocs) (toPhyloQuality [groups']) groups'
279 -- | 1) for each group process an initial temporal Matching
280 groups' :: [PhyloGroup]
281 groups' = processMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
282 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
283 (phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo)