]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
fix the artifacts
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, dropWhile, partition, or)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, keys, (!), filterWithKey)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 -- import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 -- import Debug.Trace (trace)
29
30 import qualified Data.Set as Set
31
32
33 -------------------
34 -- | Proximity | --
35 -------------------
36
37
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41
42
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46
47
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51
52
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 coocInter :: [Double]
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 -- coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
72 --------------------------------------
73 coocUnion :: [Double]
74 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
75 --------------------------------------
76
77
78 -- | To choose a proximity function
79 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
80 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
81 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
82 Hamming -> undefined
83
84
85 -- | To process the proximity between a current group and a pair of targets group
86 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
87 toProximity docs proximity ego target target' =
88 let docs' = sum $ elems docs
89 cooc = if target == target'
90 then (target ^. phylo_groupCooc)
91 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
92 ngrams = if target == target'
93 then (target ^. phylo_groupNgrams)
94 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
95 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
96
97
98 ------------------------
99 -- | Local Matching | --
100 ------------------------
101
102 toLastPeriod :: Filiation -> [PhyloPeriodId] -> PhyloPeriodId
103 toLastPeriod fil periods = case fil of
104 ToParents -> head' "toLastPeriod" (sortOn fst periods)
105 ToChilds -> last' "toLastPeriod" (sortOn fst periods)
106
107
108 toLazyPairs :: [Pointer] -> Filiation -> Double -> Proximity -> PhyloPeriodId -> [(PhyloGroup,PhyloGroup)] -> [(PhyloGroup,PhyloGroup)]
109 toLazyPairs pointers fil thr prox prd pairs =
110 if null pointers then pairs
111 else let rest = filterPointers prox thr pointers
112 in if null rest
113 then let prd' = toLastPeriod fil (map (fst . fst . fst) pointers)
114 in if prd' == prd
115 then []
116 else filter (\(g,g') ->
117 case fil of
118 ToParents -> ((fst $ g ^. phylo_groupPeriod) < (fst prd'))
119 || ((fst $ g' ^. phylo_groupPeriod) < (fst prd'))
120 ToChilds -> ((fst $ g ^. phylo_groupPeriod) > (fst prd'))
121 || ((fst $ g' ^. phylo_groupPeriod) > (fst prd'))) pairs
122 else []
123
124
125 -- | Find pairs of valuable candidates to be matched
126 makePairs' :: PhyloGroup -> [PhyloGroup] -> [PhyloPeriodId] -> [Pointer] -> Filiation -> Double -> Proximity -> Map Date Double -> [(PhyloGroup,PhyloGroup)]
127 makePairs' ego candidates periods pointers fil thr prox docs =
128 case null periods of
129 True -> []
130 False -> toLazyPairs pointers fil thr prox lastPrd
131 -- | at least on of the pair candidates should be from the last added period
132 $ filter (\(g,g') -> ((g ^. phylo_groupPeriod) == lastPrd)
133 || ((g' ^. phylo_groupPeriod) == lastPrd))
134 $ listToKeys
135 $ filter (\g -> (g ^. phylo_groupPeriod == lastPrd)
136 || ((toProximity docs prox ego ego g) >= thr)) candidates
137 where
138 lastPrd :: PhyloPeriodId
139 lastPrd = toLastPeriod fil periods
140
141
142 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
143 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
144
145
146 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
147 phyloGroupMatching candidates fil proxi docs thr ego =
148 if (null $ filterPointers proxi thr $ getPeriodPointers fil ego)
149 -- | let's find new pointers
150 then if null nextPointers
151 then addPointers ego fil TemporalPointer []
152 else addPointers ego fil TemporalPointer
153 $ head' "phyloGroupMatching"
154 -- | Keep only the best set of pointers grouped by proximity
155 $ groupBy (\pt pt' -> snd pt == snd pt')
156 $ reverse $ sortOn snd $ head' "pointers" nextPointers
157 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
158 else addPointers ego fil TemporalPointer
159 $ filterPointers proxi thr $ getPeriodPointers fil ego
160 where
161 nextPointers :: [[Pointer]]
162 nextPointers = take 1
163 $ dropWhile (null)
164 -- | for each time frame, process the proximity on relevant pairs of targeted groups
165 $ scanl (\acc groups ->
166 let periods = nub $ map _phylo_groupPeriod $ concat groups
167 docs' = (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods))
168 pairs = makePairs' ego (concat groups) periods (getPeriodPointers fil ego) fil thr proxi docs
169 in acc ++ ( filterPointers proxi thr
170 $ concat
171 $ map (\(c,c') ->
172 -- | process the proximity between the current group and a pair of candidates
173 let proximity = toProximity docs' proxi ego c c'
174 in if (c == c')
175 then [(getGroupId c,proximity)]
176 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs )) []
177 $ inits candidates -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
178
179
180 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
181 filterDocs d pds = restrictKeys d $ periodsToYears pds
182
183
184 -----------------------------
185 -- | Matching Processing | --
186 -----------------------------
187
188
189 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
190 getNextPeriods fil max' pId pIds =
191 case fil of
192 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
193 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
194
195
196 getCandidates :: PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
197 getCandidates ego targets =
198 map (\groups' ->
199 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
200 ) groups') targets
201
202
203 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
204 phyloBranchMatching frame periods proximity thr docs branch =
205 -- traceBranchMatching proximity thr
206 matchByPeriods
207 $ groupByField _phylo_groupPeriod branch
208 where
209 --------------------------------------
210 matchByPeriods :: Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
211 matchByPeriods branch' = foldl' (\acc prd ->
212 let periodsPar = getNextPeriods ToParents frame prd periods
213 periodsChi = getNextPeriods ToChilds frame prd periods
214 candidatesPar = map (\prd' -> findWithDefault [] prd' branch') periodsPar
215 candidatesChi = map (\prd' -> findWithDefault [] prd' branch') periodsChi
216 docsPar = filterDocs docs ([prd] ++ periodsPar)
217 docsChi = filterDocs docs ([prd] ++ periodsChi)
218 egos = map (\ego -> phyloGroupMatching (getCandidates ego candidatesPar) ToParents proximity docsPar thr
219 $ phyloGroupMatching (getCandidates ego candidatesChi) ToChilds proximity docsChi thr ego)
220 $ findWithDefault [] prd branch'
221 egos' = egos `using` parList rdeepseq
222 in acc ++ egos' ) [] periods
223
224
225 -----------------------
226 -- | Phylo Quality | --
227 -----------------------
228
229
230 relevantBranches :: Int -> [[PhyloGroup]] -> [[PhyloGroup]]
231 relevantBranches term branches =
232 filter (\groups -> (any (\group -> elem term $ group ^. phylo_groupNgrams) groups)) branches
233
234 fScore :: Double -> Int -> [PhyloGroup] -> [[PhyloGroup]] -> Double
235 fScore beta i bk bks =
236 let recall = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
237 / (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) $ concat bks))
238 accuracy = ( (fromIntegral $ length $ filter (\g -> elem i $ g ^. phylo_groupNgrams) bk)
239 / (fromIntegral $ length bk))
240 in ((1 + beta ** 2) * accuracy * recall)
241 / (((beta ** 2) * accuracy + recall))
242
243
244 wk :: [PhyloGroup] -> Double
245 wk bk = fromIntegral $ length bk
246
247
248 toPhyloQuality' :: Double -> Map Int Double -> [[PhyloGroup]] -> Double
249 toPhyloQuality' beta freq branches =
250 if (null branches)
251 then 0
252 else sum
253 $ map (\i ->
254 let bks = relevantBranches i branches
255 in (freq ! i) * (sum $ map (\bk -> ((wk bk) / (sum $ map wk bks)) * (fScore beta i bk bks)) bks))
256 $ keys freq
257
258
259 -----------------------------
260 -- | Adaptative Matching | --
261 -----------------------------
262
263
264 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
265 groupsToBranches groups =
266 -- | run the related component algorithm
267 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
268 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
269 $ map (\group -> [getGroupId group]
270 ++ (map fst $ group ^. phylo_groupPeriodParents)
271 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
272 -- | first find the related components by inside each ego's period
273 graph' = map relatedComponents egos
274 -- | then run it for the all the periods
275 graph = zip [1..]
276 $ relatedComponents $ concat (graph' `using` parList rdeepseq)
277 -- | update each group's branch id
278 in map (\(bId,ids) ->
279 let groups' = map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
280 $ elems $ restrictKeys groups (Set.fromList ids)
281 in groups' `using` parList rdeepseq ) graph
282
283
284 reduceFrequency :: Map Int Double -> [[PhyloGroup]] -> Map Int Double
285 reduceFrequency frequency branches =
286 restrictKeys frequency (Set.fromList $ (nub . concat) $ map _phylo_groupNgrams $ concat branches)
287
288
289 seqMatching :: Proximity -> Double -> Map Int Double -> Int -> Double -> Int -> Map Date Double -> [PhyloPeriodId] -> [([PhyloGroup],Bool)] -> ([PhyloGroup],Bool) -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
290 seqMatching proximity beta frequency minBranch egoThr frame docs periods done ego rest =
291 -- | 1) keep or not the new division of ego
292 let done' = done ++ (if snd ego
293 then (if ((null (fst ego')) || (quality > quality'))
294 then
295 -- trace (" ✗ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
296 -- <> " | " <> show(length $ fst ego) <> " groups : "
297 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
298 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
299 [(fst ego,False)]
300 else
301 -- trace (" ✓ F(β) = " <> show(quality) <> " (vs) " <> show(quality')
302 -- <> " | " <> show(length $ fst ego) <> " groups : "
303 -- <> " |✓ " <> show(length $ fst ego') <> show(map length $ fst ego')
304 -- <> " |✗ " <> show(length $ snd ego') <> "[" <> show(length $ concat $ snd ego') <> "]")
305 ((map (\e -> (e,True)) (fst ego')) ++ (map (\e -> (e,False)) (snd ego'))))
306 else [ego])
307 in
308 -- | 2) if there is no more branches in rest then return else continue
309 if null rest
310 then done'
311 else seqMatching proximity beta frequency minBranch egoThr frame docs periods
312 done' (head' "seqMatching" rest) (tail' "seqMatching" rest)
313 where
314 --------------------------------------
315 quality :: Double
316 quality = toPhyloQuality' beta frequency ((map fst done) ++ [fst ego] ++ (map fst rest))
317 --------------------------------------
318 ego' :: ([[PhyloGroup]],[[PhyloGroup]])
319 ego' =
320 let branches = groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
321 $ phyloBranchMatching frame periods proximity egoThr docs (fst ego)
322 branches' = branches `using` parList rdeepseq
323 in partition (\b -> (length $ nub $ map _phylo_groupPeriod b) >= minBranch) branches'
324 --------------------------------------
325 quality' :: Double
326 quality' = toPhyloQuality' beta frequency
327 ((map fst done) ++ (fst ego') ++ (snd ego') ++ (map fst rest))
328
329
330 recursiveMatching' :: Proximity -> Double -> Int -> Map Int Double -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> [([PhyloGroup],Bool)] -> [([PhyloGroup],Bool)]
331 recursiveMatching' proximity beta minBranch frequency egoThr frame periods docs branches =
332 if (egoThr >= 1) || ((not . or) $ map snd branches)
333 then branches
334 else
335 let branches' = seqMatching proximity beta frequency minBranch egoThr frame docs periods
336 [] (head' "recursiveMatching" branches) (tail' "recursiveMatching" branches)
337 frequency' = reduceFrequency frequency (map fst branches')
338 in recursiveMatching' proximity beta minBranch frequency' (egoThr + (getThresholdStep proximity)) frame periods docs branches'
339
340
341 temporalMatching :: Phylo -> Phylo
342 temporalMatching phylo = updatePhyloGroups 1
343 (fromList $ map (\g -> (getGroupId g,g)) $ traceMatchEnd $ concat branches)
344 phylo
345 where
346 -- | 2) init the recursiveMatching
347 branches :: [[PhyloGroup]]
348 branches = map fst
349 $ recursiveMatching' (phyloProximity $ getConfig phylo)
350 (_qua_granularity $ phyloQuality $ getConfig phylo)
351 (_qua_minBranch $ phyloQuality $ getConfig phylo)
352 (phylo ^. phylo_termFreq)
353 (getThresholdInit $ phyloProximity $ getConfig phylo)
354 (getTimeFrame $ timeUnit $ getConfig phylo)
355 (getPeriodIds phylo)
356 (phylo ^. phylo_timeDocs)
357 groups
358 -- | 1) for each group process an initial temporal Matching
359 groups :: [([PhyloGroup],Bool)]
360 groups = map (\b -> (b,(length $ nub $ map _phylo_groupPeriod b) >= (_qua_minBranch $ phyloQuality $ getConfig phylo)))
361 $ groupsToBranches $ fromList $ map (\g -> (getGroupId g, g))
362 $ phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
363 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
364 (phylo ^. phylo_timeDocs)
365 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)