2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.TemporalMatching where
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, or)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
25 -- import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 -- import Debug.Trace (trace)
30 import qualified Data.Set as Set
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
62 --------------------------------------
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 -- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
72 --------------------------------------
74 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
75 --------------------------------------
78 -- | To choose a proximity function
79 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
80 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
81 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
85 -- | To process the proximity between a current group and a pair of targets group
86 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
87 toProximity docs proximity ego target target' =
88 let docs' = sum $ elems docs
89 cooc = if target == target'
90 then (target ^. phylo_groupCooc)
91 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
92 ngrams = if target == target'
93 then (target ^. phylo_groupNgrams)
94 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
95 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
98 ------------------------
99 -- | Local Matching | --
100 ------------------------
102 toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
103 toLastPeriod fil periods = case fil of
104 ToParents -> head' "toLastPeriod" (sortOn fst periods)
105 ToChilds -> last' "toLastPeriod" (sortOn fst periods)
108 toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
109 toLazyPairs pointers fil thr prox prd pairs =
110 if null pointers then pairs
111 else let rest = filterPointers prox thr pointers
113 then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
116 else filter (\(g,g') ->
118 ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
119 || ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
120 ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
121 || ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
125 -- | Find pairs of valuable candidates to be matched
126 makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
127 makePairs' ego candidates periods pointers fil thr prox docs =
130 False -> toLazyPairs pointers fil thr prox lastPrd
131 -- | at least on of the pair candidates should be from the last added period
132 $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
133 || ((g' ^. phylo_groupPeriod) == lastPrd))
135 $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
136 || ((toProximity docs prox ego ego g) >= thr)) candidates
138 lastPrd :: PhyloPeriodId
139 lastPrd = toLastPeriod fil periods
142 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
143 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
146 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
147 phyloGroupMatching candidates fil proxi docs thr ego =
148 if (null $ filterPointers proxi thr $ getPeriodPointers fil ego)
149 -- | let's find new pointers
150 then if null nextPointers
151 then addPointers ego fil TemporalPointer []
152 else addPointers ego fil TemporalPointer
153 $ head' "phyloGroupMatching"
154 -- | Keep only the best set of pointers grouped by proximity
155 $ groupBy (\pt pt' -> snd pt == snd pt')
156 $ reverse $ sortOn snd $ head' "pointers" nextPointers
157 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
158 else addPointers ego fil TemporalPointer
159 $ filterPointers proxi thr $ getPeriodPointers fil ego
161 nextPointers :: [[Pointer]]
162 nextPointers = take 1
164 -- | for each time frame, process the proximity on relevant pairs of targeted groups
165 $ scanl (\acc groups ->
166 let periods = nub $ map _phylo_groupPeriod $ concat groups
167 docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
168 pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
169 in acc ++ ( filterPointers proxi thr
172 -- | process the proximity between the current group and a pair of candidates
173 let proximity = toProximity docs' proxi ego c c'
175 then [(getGroupId c,proximity)]
176 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
177 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
180 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
181 filterDocs d pds = restrictKeys d $ periodsToYears pds
184 -----------------------------
185 -- | Matching Processing | --
186 -----------------------------
189 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
190 getNextPeriods fil max' pId pIds =
192 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
193 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
196 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
197 getCandidates fil ego targets =
200 ToParents -> reverse targets'
202 targets' :: [[PhyloGroup]]
205 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
209 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
210 phyloBranchMatching frame periods proximity thr docs branch =
211 -- traceBranchMatching proximity thr
213 $ groupByField _phylo_groupPeriod branch
215 --------------------------------------
216 matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
217 matchByPeriods branch' = foldl' (\acc prd ->
218 let periodsPar = getNextPeriods ToParents frame prd periods
219 periodsChi = getNextPeriods ToChilds frame prd periods
220 candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
221 candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
222 docsPar = filterDocs docs ([prd] ++ periodsPar)
223 docsChi = filterDocs docs ([prd] ++ periodsChi)
224 egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
225 $ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
226 $ findWithDefault [] prd branch'
227 egos' = egos `using` parList rdeepseq
228 in acc ++ egos' ) [] periods
231 -----------------------
232 -- | Phylo Quality | --
233 -----------------------
236 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
237 relevantBranches term branches =
238 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
240 fScore :: Double -> Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
241 fScore beta i bk bks =
242 let recall = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
243 / (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks))
244 accuracy = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
245 / (fromIntegral $ length bk))
246 in ((1 + beta ** 2) * accuracy * recall)
247 / (((beta ** 2) * accuracy + recall))
250 wk :: [PhyloGroup] -> Double
251 wk bk = fromIntegral $ length bk
254 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
255 toPhyloQuality' beta freq branches =
260 let bks = relevantBranches i branches
261 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i bk bks)) bks))
265 -----------------------------
266 -- | Adaptative Matching | --
267 -----------------------------
270 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
271 groupsToBranches groups =
272 -- | run the related component algorithm
273 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
274 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
275 $ map (\group -> [getGroupId group]
276 ++ (map fst $ group ^. phylo_groupPeriodParents)
277 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
278 -- | first find the related components by inside each ego's period
279 graph' = map relatedComponents egos
280 -- | then run it for the all the periods
282 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
283 -- | update each group's branch id
284 in map (\(bId,ids) ->
285 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
286 $ elems $ restrictKeys groups (Set.fromList ids)
287 in groups' `using` parList rdeepseq ) graph
290 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
291 reduceFrequency frequency branches =
292 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
295 seqMatching :: Proximity -> Double -> Map Int Double -> Int -> Double -> Int -> Map Date Double -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
296 seqMatching proximity beta frequency minBranch egoThr frame docs periods done ego rest =
297 -- | 1) keep or not the new division of ego
298 let done' = done ++ (if snd ego
299 then (if ((null (fst ego')) || (quality > quality'))
301 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
302 -- <> " | " <> show(length $ fst ego) <> " groups : "
303 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
304 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
307 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
308 -- <> " | " <> show(length $ fst ego) <> " groups : "
309 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
310 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
311 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
314 -- | 2) if there is no more branches in rest then return else continue
317 else seqMatching proximity beta frequency minBranch egoThr frame docs periods
318 done' (head' "seqMatching" rest) (tail' "seqMatching" rest)
320 --------------------------------------
322 quality = toPhyloQuality' beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
323 --------------------------------------
324 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
326 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
327 $ phyloBranchMatching frame periods proximity egoThr docs (fst ego)
328 branches' = branches `using` parList rdeepseq
329 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch) branches'
330 --------------------------------------
332 quality' = toPhyloQuality' beta frequency
333 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
336 recursiveMatching' :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
337 recursiveMatching' proximity beta minBranch frequency egoThr frame periods docs branches =
338 if (egoThr >= 1) || ((not . or) $ map snd branches)
341 let branches' = seqMatching proximity beta frequency minBranch egoThr frame docs periods
342 [] (head' "recursiveMatching" branches) (tail' "recursiveMatching" branches)
343 frequency' = reduceFrequency frequency (map fst branches')
344 in recursiveMatching' proximity beta minBranch frequency' (egoThr + (getThresholdStep proximity)) frame periods docs branches'
347 temporalMatching :: Phylo -> Phylo
348 temporalMatching phylo = updatePhyloGroups 1
349 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
352 -- | 2) init the recursiveMatching
353 branches :: [[PhyloGroup]]
355 $ recursiveMatching' (phyloProximity $ getConfig phylo)
356 (_qua_granularity $ phyloQuality $ getConfig phylo)
357 (_qua_minBranch $ phyloQuality $ getConfig phylo)
358 (phylo ^. phylo_termFreq)
359 (getThresholdInit $ phyloProximity $ getConfig phylo)
360 (getTimeFrame $ timeUnit $ getConfig phylo)
362 (phylo ^. phylo_timeDocs)
364 -- | 1) for each group process an initial temporal Matching
365 groups :: [PhyloGroup]
366 groups = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
367 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
368 (phylo ^. phylo_timeDocs)
369 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)