]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
add the maxClique (in progress)
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, or)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 -- import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 import Debug.Trace (trace)
29
30 import qualified Data.Set as Set
31
32
33 -------------------
34 -- | Proximity | --
35 -------------------
36
37
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41
42
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46
47
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51
52
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 coocInter :: [Double]
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 -- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
72 --------------------------------------
73 coocUnion :: [Double]
74 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
75 --------------------------------------
76
77
78 -- | To choose a proximity function
79 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
80 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
81 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
82 Hamming -> undefined
83
84
85 -- | To process the proximity between a current group and a pair of targets group
86 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
87 toProximity docs proximity ego target target' =
88 let docs' = sum $ elems docs
89 cooc = if target == target'
90 then (target ^. phylo_groupCooc)
91 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
92 ngrams = if target == target'
93 then (target ^. phylo_groupNgrams)
94 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
95 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
96
97
98 ------------------------
99 -- | Local Matching | --
100 ------------------------
101
102 toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
103 toLastPeriod fil periods = case fil of
104 ToParents -> head' "toLastPeriod" (sortOn fst periods)
105 ToChilds -> last' "toLastPeriod" (sortOn fst periods)
106
107
108 toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
109 toLazyPairs pointers fil thr prox prd pairs =
110 if null pointers then pairs
111 else let rest = filterPointers prox thr pointers
112 in if null rest
113 then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
114 in if prd' == prd
115 then []
116 else filter (\(g,g') ->
117 case fil of
118 ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
119 || ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
120 ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
121 || ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
122 else []
123
124
125 -- | Find pairs of valuable candidates to be matched
126 makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
127 makePairs' ego candidates periods pointers fil thr prox docs =
128 case null periods of
129 True -> []
130 False -> toLazyPairs pointers fil thr prox lastPrd
131 -- | at least on of the pair candidates should be from the last added period
132 $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
133 || ((g' ^. phylo_groupPeriod) == lastPrd))
134 $ listToKeys
135 $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
136 || ((toProximity docs prox ego ego g) >= thr)) candidates
137 where
138 lastPrd :: PhyloPeriodId
139 lastPrd = toLastPeriod fil periods
140
141
142 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
143 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
144
145
146 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
147 phyloGroupMatching candidates fil proxi docs thr ego =
148 case null nextPointers of
149 -- | let's find new pointers
150 True -> if null $ filterPointers proxi thr $ getPeriodPointers fil ego
151 then addPointers ego fil TemporalPointer []
152 -- | or keep the old ones
153 else addPointers ego fil TemporalPointer
154 $ filterPointers proxi thr $ getPeriodPointers fil ego
155 False -> addPointers ego fil TemporalPointer
156 $ head' "phyloGroupMatching"
157 -- | Keep only the best set of pointers grouped by proximity
158 $ groupBy (\pt pt' -> snd pt == snd pt')
159 $ reverse $ sortOn snd $ head' "pointers"
160 $ nextPointers
161 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
162 where
163 nextPointers :: [[Pointer]]
164 nextPointers = take 1
165 $ dropWhile (null)
166 -- | for each time frame, process the proximity on relevant pairs of targeted groups
167 $ scanl (\acc groups ->
168 let periods = nub $ map _phylo_groupPeriod $ concat groups
169 docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
170 pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
171 in acc ++ ( filterPointers proxi thr
172 $ concat
173 $ map (\(c,c') ->
174 -- | process the proximity between the current group and a pair of candidates
175 let proximity = toProximity docs' proxi ego c c'
176 in if (c == c')
177 then [(getGroupId c,proximity)]
178 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
179 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
180
181
182 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
183 filterDocs d pds = restrictKeys d $ periodsToYears pds
184
185
186 -----------------------------
187 -- | Matching Processing | --
188 -----------------------------
189
190
191 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
192 getNextPeriods fil max' pId pIds =
193 case fil of
194 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
195 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
196
197
198 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
199 getCandidates fil ego targets =
200 case fil of
201 ToChilds -> targets'
202 ToParents -> reverse targets'
203 where
204 targets' :: [[PhyloGroup]]
205 targets' =
206 map (\groups' ->
207 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
208 ) groups') targets
209
210
211 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
212 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
213 $ matchByPeriods
214 $ groupByField _phylo_groupPeriod branch
215 where
216 --------------------------------------
217 matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
218 matchByPeriods branch' = foldl' (\acc prd ->
219 let periodsPar = getNextPeriods ToParents frame prd periods
220 periodsChi = getNextPeriods ToChilds frame prd periods
221 candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
222 candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
223 docsPar = filterDocs docs ([prd] ++ periodsPar)
224 docsChi = filterDocs docs ([prd] ++ periodsChi)
225 egos = map (\ego -> phyloGroupMatching (getCandidates ToParents ego candidatesPar) ToParents proximity docsPar thr
226 $ phyloGroupMatching (getCandidates ToChilds ego candidatesChi) ToChilds proximity docsChi thr ego)
227 $ findWithDefault [] prd branch'
228 egos' = egos `using` parList rdeepseq
229 in acc ++ egos' ) [] periods
230
231
232 -----------------------
233 -- | Phylo Quality | --
234 -----------------------
235
236
237 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
238 relevantBranches term branches =
239 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
240
241 fScore :: Double -> Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
242 fScore beta i bk bks =
243 let recall = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
244 / (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks))
245 accuracy = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
246 / (fromIntegral $ length bk))
247 in ((1 + beta ** 2) * accuracy * recall)
248 / (((beta ** 2) * accuracy + recall))
249
250
251 wk :: [PhyloGroup] -> Double
252 wk bk = fromIntegral $ length bk
253
254
255 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
256 toPhyloQuality' beta freq branches =
257 if (null branches)
258 then 0
259 else sum
260 $ map (\i ->
261 let bks = relevantBranches i branches
262 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i bk bks)) bks))
263 $ keys freq
264
265
266 -----------------------------
267 -- | Adaptative Matching | --
268 -----------------------------
269
270
271 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
272 groupsToBranches groups =
273 -- | run the related component algorithm
274 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
275 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
276 $ map (\group -> [getGroupId group]
277 ++ (map fst $ group ^. phylo_groupPeriodParents)
278 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
279 -- | first find the related components by inside each ego's period
280 graph' = map relatedComponents egos
281 -- | then run it for the all the periods
282 graph = zip [1..]
283 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
284 -- | update each group's branch id
285 in map (\(bId,ids) ->
286 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
287 $ elems $ restrictKeys groups (Set.fromList ids)
288 in groups' `using` parList rdeepseq ) graph
289
290
291 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
292 reduceFrequency frequency branches =
293 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
294
295
296 seqMatching :: Proximity -> Double -> Map Int Double -> Int -> Double -> Int -> Map Date Double -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
297 seqMatching proximity beta frequency minBranch egoThr frame docs periods done ego rest =
298 -- | 1) keep or not the new division of ego
299 let done' = done ++ (if snd ego
300 then (if ((null (fst ego')) || (quality > quality'))
301 then trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
302 <> " | " <> show(length $ fst ego) <> " groups : "
303 <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
304 <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
305 $ [(fst ego,False)]
306 else trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
307 <> " | " <> show(length $ fst ego) <> " groups : "
308 <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
309 <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
310 $ ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
311 else [ego])
312 in
313 -- | 2) if there is no more branches in rest then return else continue
314 if null rest
315 then done'
316 else seqMatching proximity beta frequency minBranch egoThr frame docs periods
317 done' (head' "seqMatching" rest) (tail' "seqMatching" rest)
318 where
319 --------------------------------------
320 quality :: Double
321 quality = toPhyloQuality' beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
322 --------------------------------------
323 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
324 ego' =
325 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
326 $ phyloBranchMatching frame periods proximity egoThr docs (fst ego)
327 branches' = branches `using` parList rdeepseq
328 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch) branches'
329 --------------------------------------
330 quality' :: Double
331 quality' = toPhyloQuality' beta frequency
332 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
333
334
335 recursiveMatching' :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
336 recursiveMatching' proximity beta minBranch frequency egoThr frame periods docs branches =
337 if (egoThr >= 1) || ((not . or) $ map snd branches)
338 then branches
339 else
340 let branches' = seqMatching proximity beta frequency minBranch egoThr frame docs periods
341 [] (head' "recursiveMatching" branches) (tail' "recursiveMatching" branches)
342 frequency' = reduceFrequency frequency (map fst branches')
343 in recursiveMatching' proximity beta minBranch frequency' (egoThr + (getThresholdStep proximity)) frame periods docs branches'
344
345
346 temporalMatching :: Phylo -> Phylo
347 temporalMatching phylo = updatePhyloGroups 1
348 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
349 phylo
350 where
351 -- | 2) init the recursiveMatching
352 branches :: [[PhyloGroup]]
353 branches = map fst
354 $ recursiveMatching' (phyloProximity $ getConfig phylo)
355 (_qua_granularity $ phyloQuality $ getConfig phylo)
356 (_qua_minBranch $ phyloQuality $ getConfig phylo)
357 (phylo ^. phylo_termFreq)
358 (getThresholdInit $ phyloProximity $ getConfig phylo)
359 (getTimeFrame $ timeUnit $ getConfig phylo)
360 (getPeriodIds phylo)
361 (phylo ^. phylo_timeDocs)
362 [(groups,True)]
363 -- | 1) for each group process an initial temporal Matching
364 groups :: [PhyloGroup]
365 groups = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
366 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
367 (phylo ^. phylo_timeDocs)
368 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)