2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.TemporalMatching where
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, any, nub, union)
19 import Data.Map (Map, fromList, toList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith)
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24 import Gargantext.Viz.Phylo.SynchronicClustering
26 import Control.Lens hiding (Level)
34 -- | Process the inverse sumLog
35 sumInvLog :: Double -> [Double] -> Double
36 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
39 -- | Process the sumLog
40 sumLog :: Double -> [Double] -> Double
41 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
44 -- | To compute a jaccard similarity between two lists
45 jaccard :: [Int] -> [Int] -> Double
46 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
49 -- | To process a WeighedLogJaccard distance between to coocurency matrix
50 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
51 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
52 | null ngramsInter = 0
53 | ngramsInter == ngramsUnion = 1
54 | sens == 0 = jaccard ngramsInter ngramsUnion
55 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
56 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
58 --------------------------------------
60 ngramsInter = intersect ngrams ngrams'
61 --------------------------------------
63 ngramsUnion = union ngrams ngrams'
64 --------------------------------------
66 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
67 --------------------------------------
69 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
70 --------------------------------------
73 -- | To choose a proximity function
74 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
75 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
76 WeightedLogJaccard sens -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
80 -- | To process the proximity between a current group and a pair of targets group
81 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
82 toProximity docs proximity group target target' =
83 let docs' = sum $ elems docs
84 cooc = if target == target'
85 then (target ^. phylo_groupCooc)
86 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
87 ngrams = if target == target'
88 then (target ^. phylo_groupNgrams)
89 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
90 in pickProximity proximity docs' (group ^. phylo_groupCooc) cooc (group ^. phylo_groupNgrams) ngrams
93 ------------------------
94 -- | Local Matching | --
95 ------------------------
98 -- | Find pairs of valuable candidates to be matched
99 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> Double -> Map Date Double -> Proximity -> PhyloGroup -> [(PhyloGroup,PhyloGroup)]
100 makePairs candidates periods thr docs proximity group = case null periods of
102 -- | at least on of the pair candidates should be from the last added period
103 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
104 || (inLastPeriod cdt' periods))
106 -- | remove poor candidates from previous periods
107 $ filter (\cdt -> (inLastPeriod cdt periods)
108 || ((toProximity (reframeDocs docs periods) proximity group group cdt) >= thr)) candidates
110 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
111 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
114 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Double -> Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup
115 phyloGroupMatching candidates fil thr docs proxi group = case pointers of
116 Nothing -> addPointers group fil TemporalPointer []
117 Just pts -> addPointers group fil TemporalPointer
118 $ head' "phyloGroupMatching"
119 -- | Keep only the best set of pointers grouped by proximity
120 $ groupBy (\pt pt' -> snd pt == snd pt')
121 $ reverse $ sortOn snd pts
122 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
124 pointers :: Maybe [Pointer]
125 pointers = find (not . null)
126 -- | for each time frame, process the proximity on relevant pairs of targeted groups
127 $ scanl (\acc groups ->
128 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
129 pairs = makePairs (concat groups) periods thr docs proxi group
130 in acc ++ ( filter (\(_,proximity) -> proximity >= thr )
133 -- | process the proximity between the current group and a pair of candidates
134 let proximity = toProximity (reframeDocs docs periods) proxi group c c'
136 then [(getGroupId c,proximity)]
137 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
139 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
143 -----------------------------
144 -- | Adaptative Matching | --
145 -----------------------------
148 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
149 getNextPeriods fil max pId pIds =
151 ToChilds -> take max $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
152 ToParents -> take max $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
155 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
156 getCandidates fil g pIds targets =
159 ToParents -> reverse targets'
161 targets' :: [[PhyloGroup]]
162 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
163 $ filterWithKey (\k _ -> elem k pIds)
166 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
169 shouldBreak :: Double -> [(Double,[PhyloGroup])] -> Bool
170 shouldBreak thr branches = any (\(quality,_) -> quality < thr) branches
173 toBranchQuality :: [[PhyloGroup]] -> [(Double,[PhyloGroup])]
174 toBranchQuality branches = undefined
177 reframeDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
178 reframeDocs docs periods = restrictKeys docs $ periodsToYears periods
181 -- findGhostLinks :: [Link] -> [[Link]] -> Map PhyloGroupId
183 adaptativeMatching :: Int -> Double -> Double -> Double -> Map Date Double -> Proximity -> [PhyloGroup] -> [PhyloGroup] -> [PhyloPeriodId] -> [PhyloGroup]
184 adaptativeMatching maxTime thrStep thrMatch thrQua docs proximity groups candidates periods =
185 -- | check if we should break some of the new branches or not
186 case shouldBreak thrQua branches' of
187 True -> concat $ map (\(s,b) ->
189 -- | we keep the branch as it is
191 -- | we break the branch using an increased temporal matching threshold
192 else let nextGroups = undefined
193 nextCandidates = undefined
194 nextPeriods = undefined
195 in adaptativeMatching maxTime thrStep (thrMatch + thrStep) thrQua
196 (reframeDocs docs nextPeriods)
198 nextGroups nextCandidates nextPeriods
200 -- | the quality of all the new branches is sufficient
201 False -> concat branches
203 -- | 3) process a quality score for each new branch
204 branches' :: [(Double,[PhyloGroup])]
205 branches' = toBranchQuality branches
206 -- | 2) group the new groups into branches
207 branches :: [[PhyloGroup]]
208 branches = relatedComponents groups'
209 -- | 1) connect each group to its parents and childs
210 groups' :: [PhyloGroup]
211 groups' = map (\group ->
212 let childs = getCandidates ToChilds group
213 (getNextPeriods ToChilds maxTime (group ^. phylo_groupPeriod) periods) candidates
214 parents = getCandidates ToParents group
215 (getNextPeriods ToParents maxTime (group ^. phylo_groupPeriod) periods) candidates
216 -- | match the group to its possible childs then parents
217 in phyloGroupMatching parents ToParents thrMatch docs proximity
218 $ phyloGroupMatching childs ToChilds thrMatch docs proximity group
222 temporalMatching :: Phylo -> Phylo
223 temporalMatching phylo =
224 let branches = fromList $ map (\g -> (getGroupId g, g))
225 $ adaptativeMatching (maxTimeMatch $ getConfig phylo) 0 0 0
226 (phylo ^. phylo_timeDocs)
227 (phyloProximity $ getConfig phylo)
228 (getGroupsFromLevel 1 phylo) (getGroupsFromLevel 1 phylo) (getPeriodIds phylo)
229 in updatePhyloGroups 1 branches phylo