]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
Merge branch 'dev' into dev-getting-started-readme
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, delete)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 -- import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 import Debug.Trace (trace)
29
30 import qualified Data.Set as Set
31
32
33 -------------------
34 -- | Proximity | --
35 -------------------
36
37
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41
42
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46
47
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51
52
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 coocInter :: [Double]
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 -- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
72 --------------------------------------
73 coocUnion :: [Double]
74 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
75 --------------------------------------
76
77
78 -- | To choose a proximity function
79 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
80 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
81 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
82 Hamming -> undefined
83
84
85 -- | To process the proximity between a current group and a pair of targets group
86 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
87 toProximity docs proximity ego target target' =
88 let docs' = sum $ elems docs
89 cooc = if target == target'
90 then (target ^. phylo_groupCooc)
91 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
92 ngrams = if target == target'
93 then (target ^. phylo_groupNgrams)
94 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
95 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
96
97
98 ------------------------
99 -- | Local Matching | --
100 ------------------------
101
102 toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
103 toLastPeriod fil periods = case fil of
104 ToParents -> head' "toLastPeriod" (sortOn fst periods)
105 ToChilds -> last' "toLastPeriod" (sortOn fst periods)
106
107
108 toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
109 toLazyPairs pointers fil thr prox prd pairs =
110 if null pointers then pairs
111 else let rest = filterPointers prox thr pointers
112 in if null rest
113 then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
114 in if prd' == prd
115 then []
116 else filter (\(g,g') ->
117 case fil of
118 ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
119 || ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
120 ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
121 || ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
122 else []
123
124
125 -- | Find pairs of valuable candidates to be matched
126 makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
127 makePairs' ego candidates periods pointers fil thr prox docs =
128 case null periods of
129 True -> []
130 False -> toLazyPairs pointers fil thr prox lastPrd
131 -- | at least on of the pair candidates should be from the last added period
132 $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
133 || ((g' ^. phylo_groupPeriod) == lastPrd))
134 $ listToKeys
135 $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
136 || ((toProximity docs prox ego ego g) >= thr)) candidates
137 where
138 lastPrd :: PhyloPeriodId
139 lastPrd = toLastPeriod fil periods
140
141
142 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
143 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
144
145
146 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
147 phyloGroupMatching candidates fil proxi docs thr ego =
148 case null nextPointers of
149 -- | let's find new pointers
150 True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
151 then addPointers ego fil TemporalPointer []
152 -- | or keep the old ones
153 else addPointers ego fil TemporalPointer
154 $ filterPointers proxi thr $ getPeriodPointers fil ego
155 False -> addPointers ego fil TemporalPointer
156 $ head' "phyloGroupMatching"
157 -- | Keep only the best set of pointers grouped by proximity
158 $ groupBy (\pt pt' -> snd pt == snd pt')
159 $ reverse $ sortOn snd $ head' "pointers"
160 $ nextPointers
161 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
162 where
163 nextPointers :: [[Pointer]]
164 nextPointers = take 1
165 $ dropWhile (null)
166 -- | for each time frame, process the proximity on relevant pairs of targeted groups
167 $ scanl (\acc groups ->
168 let periods = nub $ map _phylo_groupPeriod $ concat groups
169 docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
170 pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
171 in acc ++ ( filterPointers proxi thr
172 $ concat
173 $ map (\(c,c') ->
174 -- | process the proximity between the current group and a pair of candidates
175 let proximity = toProximity docs' proxi ego c c'
176 in if (c == c')
177 then [(getGroupId c,proximity)]
178 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
179 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
180
181
182 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
183 filterDocs d pds = restrictKeys d $ periodsToYears pds
184
185
186 -----------------------------
187 -- | Matching Processing | --
188 -----------------------------
189
190
191 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
192 getNextPeriods fil max' pId pIds =
193 case fil of
194 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
195 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
196
197
198 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
199 getCandidates fil ego targets =
200 case fil of
201 ToChilds -> targets'
202 ToParents -> reverse targets'
203 where
204 targets' :: [[PhyloGroup]]
205 targets' =
206 map (\groups' ->
207 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
208 ) groups') targets
209
210
211 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
212 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
213 -- $ matchByPeriods ToParents
214 -- $ groupByField _phylo_groupPeriod
215 $ matchByPeriods
216 $ groupByField _phylo_groupPeriod branch
217 where
218 --------------------------------------
219 matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
220 matchByPeriods branch' = foldl' (\acc prd ->
221 let periodsPar = getNextPeriods ToParents frame prd periods
222 periodsChi = getNextPeriods ToChilds frame prd periods
223 candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
224 candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
225 docsPar = filterDocs docs ([prd] ++ periodsPar)
226 docsChi = filterDocs docs ([prd] ++ periodsChi)
227 egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
228 $ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
229 $ findWithDefault [] prd branch'
230 egos' = egos `using` parList rdeepseq
231 in acc ++ egos' ) [] periods
232
233
234 -----------------------
235 -- | Phylo Quality | --
236 -----------------------
237
238
239 count :: Eq a => a -> [a] -> Int
240 count x = length . filter (== x)
241
242 termFreq' :: Int -> [PhyloGroup] -> Double
243 termFreq' term groups =
244 let ngrams = concat $ map _phylo_groupNgrams groups
245 in log ((fromIntegral $ count term ngrams)
246 / (fromIntegral $ length ngrams))
247
248 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
249 relevantBranches term branches =
250 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
251
252 branchCov' :: [PhyloGroup] -> [[PhyloGroup]] -> Double
253 branchCov' branch branches =
254 (fromIntegral $ length branch) / (fromIntegral $ length $ concat branches)
255
256
257 toRecall :: Double -> Int -> Int -> [[PhyloGroup]] -> Double
258 toRecall freq term border branches =
259 -- | given a random term in a phylo
260 freq
261 -- | for each relevant branches
262 * (sum $ map (\branch ->
263 -- | given its local coverage
264 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
265 -- | compute the local recall
266 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
267 / ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) $ concat branches')
268 -- | with a ponderation from border branches
269 + (fromIntegral border)) )) branches')
270 where
271 branches' :: [[PhyloGroup]]
272 branches' = relevantBranches term branches
273
274
275 toAccuracy :: Double -> Int -> [[PhyloGroup]] -> Double
276 toAccuracy freq term branches =
277 if (null branches)
278 then 0
279 else
280 -- | given a random term in a phylo
281 freq
282 -- | for each relevant branches
283 * (sum $ map (\branch ->
284 -- | given its local coverage
285 ((branchCov' branch branches') / (sum $ map (\b -> branchCov' b branches') branches'))
286 -- | compute the local accuracy
287 * ( (fromIntegral $ length $ filter (\group -> elem term $ group ^. phylo_groupNgrams) branch)
288 / (fromIntegral $ length branch))) branches')
289 where
290 branches' :: [[PhyloGroup]]
291 branches' = relevantBranches term branches
292
293
294 toPhyloQuality :: Double -> Map Int Double -> Int -> Double -> [[PhyloGroup]] -> Double
295 toPhyloQuality beta frequency border oldAcc branches =
296 -- trace (" rec : " <> show(recall)) $
297 -- trace (" acc : " <> show(accuracy)) $
298 if (null branches)
299 then 0
300 else ((1 + beta ** 2) * accuracy * recall)
301 / (((beta ** 2) * accuracy + recall))
302 where
303 -- | for each term compute the global accuracy
304 accuracy :: Double
305 accuracy = oldAcc + (sum $ map (\term -> toAccuracy (frequency ! term) term branches) $ keys frequency)
306 -- | for each term compute the global recall
307 recall :: Double
308 recall = sum $ map (\term -> toRecall (frequency ! term) term border branches) $ keys frequency
309
310
311 toBorderAccuracy :: Map Int Double -> [[PhyloGroup]] -> Double
312 toBorderAccuracy freq branches = sum $ map (\t -> toAccuracy (freq ! t) t branches) $ keys freq
313
314
315 -----------------------------
316 -- | Adaptative Matching | --
317 -----------------------------
318
319
320 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
321 groupsToBranches groups =
322 -- | run the related component algorithm
323 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
324 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
325 $ map (\group -> [getGroupId group]
326 ++ (map fst $ group ^. phylo_groupPeriodParents)
327 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
328 -- | first find the related components by inside each ego's period
329 graph' = map relatedComponents egos
330 -- | then run it for the all the periods
331 graph = zip [1..]
332 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
333 -- | update each group's branch id
334 in map (\(bId,ids) ->
335 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
336 $ elems $ restrictKeys groups (Set.fromList ids)
337 in groups' `using` parList rdeepseq ) graph
338
339
340 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
341 reduceFrequency frequency branches =
342 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
343
344
345 alterBorder :: Int -> [[PhyloGroup]] -> [PhyloGroup] -> Int
346 alterBorder border branches branch = border + (length $ concat branches) - (length branch)
347
348
349 recursiveMatching :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> Int -> Double -> [PhyloGroup] -> [PhyloGroup]
350 recursiveMatching proximity beta minBranch frequency egoThr frame periods docs quality border oldAcc groups =
351 if ((egoThr >= 1) || (quality > quality') || ((length $ concat $ snd branches') == (length groups)))
352 then
353 trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
354 <> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
355 <> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
356 groups
357 else
358 let next = map (\b -> recursiveMatching proximity beta minBranch
359 (reduceFrequency frequency (fst branches'))
360 (egoThr + (getThresholdStep proximity))
361 frame periods docs quality'
362 (alterBorder border (fst branches') b)
363 (oldAcc + (toBorderAccuracy frequency (delete b ((fst branches') ++ (snd branches')))))
364 b ) (fst branches')
365 in trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality') <> "\n"
366 <> " |✓ " <> show(length $ fst branches') <> show(map length $ fst branches')
367 <> " |✗ " <> show(length $ snd branches') <> "[" <> show(length $ concat $ snd branches') <> "]") $
368 concat (next ++ (snd branches'))
369 where
370 -- | 2) for each of the possible next branches process the phyloQuality score
371 quality' :: Double
372 quality' = toPhyloQuality beta frequency border oldAcc ((fst branches') ++ (snd branches'))
373 -- | 1) for each local branch process a temporal matching then find the resulting branches
374 branches' :: ([[PhyloGroup]],[[PhyloGroup]])
375 branches' =
376 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
377 $ phyloBranchMatching frame periods proximity egoThr docs groups
378 in partition (\b -> length b >= minBranch) (branches `using` parList rdeepseq)
379
380
381 temporalMatching :: Phylo -> Phylo
382 temporalMatching phylo = updatePhyloGroups 1 branches' phylo
383 where
384 -- | 5) apply the recursive matching
385 branches' :: Map PhyloGroupId PhyloGroup
386 branches' =
387 let next = trace (" ✓ F(β) = " <> show(quality)
388 <> " |✓ " <> show(length $ fst branches) <> show(map length $ fst branches)
389 <> " |✗ " <> show(length $ snd branches) <> "[" <> show(length $ concat $ snd branches) <> "]")
390 $ map (\branch -> recursiveMatching (phyloProximity $ getConfig phylo)
391 (_qua_relevance $ phyloQuality $ getConfig phylo)
392 (_qua_minBranch $ phyloQuality $ getConfig phylo)
393 (reduceFrequency frequency (fst branches))
394 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
395 + (getThresholdStep $ phyloProximity $ getConfig phylo))
396 (getTimeFrame $ timeUnit $ getConfig phylo)
397 (getPeriodIds phylo)
398 (phylo ^. phylo_timeDocs) quality (alterBorder 0 (fst branches) branch)
399 (toBorderAccuracy frequency (delete branch ((fst branches) ++ (snd branches))))
400 branch
401 ) (fst branches)
402 in fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd (concat (next ++ (snd branches)))
403 -- | 4) process the quality score
404 quality :: Double
405 quality = toPhyloQuality (_qua_relevance $ phyloQuality $ getConfig phylo) frequency 0 0 ((fst branches) ++ (snd branches))
406 -- | 3) process the constants of the quality score
407 frequency :: Map Int Double
408 frequency =
409 let terms = ngramsInBranches ((fst branches) ++ (snd branches))
410 freqs = map (\t -> termFreq' t $ concat ((fst branches) ++ (snd branches))) terms
411 in fromList $ map (\(t,freq) -> (t,freq/(sum freqs))) $ zip terms freqs
412 -- | 2) group into branches
413 branches :: ([[PhyloGroup]],[[PhyloGroup]])
414 branches = partition (\b -> length b >= (_qua_minBranch $ phyloQuality $ getConfig phylo))
415 $ groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
416 -- | 1) for each group process an initial temporal Matching
417 groups' :: [PhyloGroup]
418 groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
419 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
420 (phylo ^. phylo_timeDocs)
421 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)