]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
add new synchronic clustering
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, groupBy, scanl, nub, union, elemIndex, (!!), dropWhile, partition)
19 import Data.Map (Map, fromList, elems, restrictKeys, unionWith, intersectionWith, findWithDefault, filterWithKey)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 import Prelude (logBase)
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28 -- import Debug.Trace (trace)
29
30 import qualified Data.Set as Set
31
32
33 -------------------
34 -- | Proximity | --
35 -------------------
36
37
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41
42
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46
47
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51
52
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 coocInter :: [Double]
70 coocInter = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ intersectionWith (+) cooc cooc'
71 --------------------------------------
72 coocUnion :: [Double]
73 coocUnion = elems $ map (/docs) $ filterWithKey (\(k,k') _ -> k == k') $ unionWith (+) cooc cooc'
74 --------------------------------------
75
76
77 -- | To choose a proximity function
78 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
79 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
80 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
81 Hamming -> undefined
82
83
84 -- | To process the proximity between a current group and a pair of targets group
85 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
86 toProximity docs proximity ego target target' =
87 let docs' = sum $ elems docs
88 cooc = if target == target'
89 then (target ^. phylo_groupCooc)
90 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
91 ngrams = if target == target'
92 then (target ^. phylo_groupNgrams)
93 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
94 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
95
96
97 ------------------------
98 -- | Local Matching | --
99 ------------------------
100
101
102 -- | Find pairs of valuable candidates to be matched
103 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
104 makePairs candidates periods periods' = case null periods of
105 True -> []
106 -- | at least on of the pair candidates should be from the last added period
107 False -> filter (\(cdt,cdt') ->
108 ((inLastPeriod cdt periods) || (inLastPeriod cdt' periods))
109 && (not $ inOldPeriods cdt periods')
110 && (not $ inOldPeriods cdt' periods'))
111 $ listToKeys candidates
112 where
113 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
114 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
115 --------------------------------------
116 inOldPeriods :: PhyloGroup -> [PhyloPeriodId] -> Bool
117 inOldPeriods g prds = elem (g ^. phylo_groupPeriod) prds
118
119
120 keepOldOnes :: Filiation -> Proximity -> Double -> PhyloGroup -> Bool
121 keepOldOnes fil proxi thr ego = any (\(_,w) -> filterProximity proxi thr w)
122 $ getPeriodPointers fil ego
123
124 filterPointers :: Proximity -> Double -> [Pointer] -> [Pointer]
125 filterPointers proxi thr pts = filter (\(_,w) -> filterProximity proxi thr w) pts
126
127
128 findLastPeriod :: Filiation -> [Pointer] -> PhyloPeriodId
129 findLastPeriod fil pts = case fil of
130 ToParents -> head' "findLastPeriod" $ sortOn fst $ map (fst . fst . fst) pts
131 ToChilds -> head' "findLastPeriod" $ reverse $ sortOn fst $ map (fst . fst . fst) pts
132
133
134
135 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double -> PhyloGroup -> PhyloGroup
136 phyloGroupMatching candidates fil proxi docs thr ego =
137 if keepOldOnes fil proxi thr ego
138 -- | keep some of the old pointers
139 then addPointers ego fil TemporalPointer
140 $ filterPointers proxi thr
141 $ getPeriodPointers fil ego
142 else case null pointers of
143 -- | let's find new pointers
144 True -> addPointers ego fil TemporalPointer []
145 False -> addPointers ego fil TemporalPointer
146 $ head' "phyloGroupMatching"
147 -- | Keep only the best set of pointers grouped by proximity
148 $ groupBy (\pt pt' -> snd pt == snd pt')
149 $ reverse $ sortOn snd $ head' "pointers" pointers
150 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
151 where
152 --------------------------------------
153 oldPeriods :: [PhyloPeriodId] -> [PhyloPeriodId]
154 oldPeriods periods =
155 if (null $ getPeriodPointers fil ego)
156 then []
157 else
158 let period = findLastPeriod fil $ getPeriodPointers fil ego
159 in fst $ partition (\prd -> case fil of
160 ToChilds -> prd <= period
161 ToParents -> prd >= period ) periods
162 --------------------------------------
163 pointers :: [[Pointer]]
164 pointers = take 1
165 $ dropWhile (null)
166 -- | for each time frame, process the proximity on relevant pairs of targeted groups
167 $ scanl (\acc groups ->
168 let periods = nub
169 $ concat $ map (\gs -> if null gs
170 then []
171 else [_phylo_groupPeriod $ head' "pointers" gs]) groups
172 periods' = oldPeriods periods
173 pairs = makePairs (concat groups) periods periods'
174 in acc ++ ( filterPointers proxi thr
175 $ concat
176 $ map (\(c,c') ->
177 -- | process the proximity between the current group and a pair of candidates
178 let proximity = toProximity (filterDocs docs ([ego ^. phylo_groupPeriod] ++ periods)) proxi ego c c'
179 in if (c == c')
180 then [(getGroupId c,proximity)]
181 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
182 ) []
183 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
184 $ inits candidates
185
186
187 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
188 filterDocs d pds = restrictKeys d $ periodsToYears pds
189
190
191 -----------------------------
192 -- | Matching Processing | --
193 -----------------------------
194
195
196 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
197 getNextPeriods fil max' pId pIds =
198 case fil of
199 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
200 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
201
202
203 getCandidates :: Filiation -> PhyloGroup -> [[PhyloGroup]] -> [[PhyloGroup]]
204 getCandidates fil ego targets =
205 case fil of
206 ToChilds -> targets'
207 ToParents -> reverse targets'
208 where
209 targets' :: [[PhyloGroup]]
210 targets' =
211 map (\groups' ->
212 filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
213 ) groups') targets
214
215
216 phyloBranchMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
217 phyloBranchMatching frame periods proximity thr docs branch = traceBranchMatching proximity thr
218 $ matchByPeriods ToParents
219 $ groupByField _phylo_groupPeriod
220 $ matchByPeriods ToChilds
221 $ groupByField _phylo_groupPeriod branch
222 where
223 --------------------------------------
224 matchByPeriods :: Filiation -> Map PhyloPeriodId [PhyloGroup] -> [PhyloGroup]
225 matchByPeriods fil branch' = foldl' (\acc prd ->
226 let periods' = getNextPeriods fil frame prd periods
227 candidates = map (\prd' -> findWithDefault [] prd' branch') periods'
228 docs' = filterDocs docs ([prd] ++ periods')
229 egos = map (\g -> phyloGroupMatching (getCandidates fil g candidates) fil proximity docs' thr g)
230 $ findWithDefault [] prd branch'
231 egos' = egos `using` parList rdeepseq
232 in acc ++ egos' ) [] periods
233
234
235 -----------------------
236 -- | Phylo Quality | --
237 -----------------------
238
239
240 termFreq :: Int -> [[PhyloGroup]] -> Double
241 termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
242 / (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
243
244
245 entropy :: [[PhyloGroup]] -> Double
246 entropy branches =
247 let terms = ngramsInBranches branches
248 in sum $ map (\term -> (1 / log (termFreq term branches))
249 / (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
250 * (sum $ map (\branch ->
251 let q = branchObs term (length $ concat branches) branch
252 in if (q == 0)
253 then 0
254 else - q * logBase 2 q ) branches) ) terms
255 where
256 -- | Probability to observe a branch given a random term of the phylo
257 branchObs :: Int -> Int -> [PhyloGroup] -> Double
258 branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
259 / (fromIntegral total)
260
261
262 homogeneity :: [[PhyloGroup]] -> Double
263 homogeneity branches =
264 let nbGroups = length $ concat branches
265 in sum
266 $ map (\branch -> (if (length branch == nbGroups)
267 then 1
268 else (1 / log (branchCov branch nbGroups))
269 / (sum $ map (\branch' -> 1 / log (branchCov branch' nbGroups)) branches))
270 * (sum $ map (\term -> (termFreq term branches)
271 / (sum $ map (\term' -> termFreq term' branches) $ ngramsInBranches [branch])
272 * (fromIntegral $ sum $ ngramsInBranches [filter (\g -> elem term $ g ^. phylo_groupNgrams) branch])
273 / (fromIntegral $ sum $ ngramsInBranches [branch])
274 ) $ ngramsInBranches [branch]) ) branches
275 where
276 branchCov :: [PhyloGroup] -> Int -> Double
277 branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
278
279
280 toPhyloQuality :: [[PhyloGroup]] -> Double
281 toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
282
283
284 -----------------------------
285 -- | Adaptative Matching | --
286 -----------------------------
287
288
289 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
290 groupsToBranches groups =
291 -- | run the related component algorithm
292 let graph = zip [1..]
293 $ relatedComponents
294 $ map (\group -> [getGroupId group]
295 ++ (map fst $ group ^. phylo_groupPeriodParents)
296 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
297 -- | update each group's branch id
298 in map (\(bId,ids) ->
299 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
300 $ elems $ restrictKeys groups (Set.fromList ids)) graph
301
302
303 recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
304 recursiveMatching proximity thr frame periods docs quality branches =
305 if (length branches == (length $ concat branches))
306 then concat branches
307 else if thr > 1
308 then concat branches
309 else
310 case quality <= (sum nextQualities) of
311 -- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities))
312 True -> concat
313 $ map (\branches' ->
314 let idx = fromJust $ elemIndex branches' nextBranches
315 in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
316 $ nextBranches
317 -- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
318 False -> concat branches
319 where
320 -- | 2) for each of the possible next branches process the phyloQuality score
321 nextQualities :: [Double]
322 nextQualities = map toPhyloQuality nextBranches
323 -- | 1) for each local branch process a temporal matching then find the resulting branches
324 nextBranches :: [[[PhyloGroup]]]
325 nextBranches =
326 let branches' = map (\branch -> phyloBranchMatching frame periods proximity thr docs branch) branches
327 clusters = map (\branch -> groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) branch) branches'
328 clusters' = clusters `using` parList rdeepseq
329 in clusters'
330
331
332
333 temporalMatching :: Phylo -> Phylo
334 temporalMatching phylo = updatePhyloGroups 1 branches' phylo
335 where
336 -- | 4) run the recursive matching to find the best repartition among branches
337 branches' :: Map PhyloGroupId PhyloGroup
338 branches' = fromList
339 $ map (\g -> (getGroupId g, g))
340 $ traceMatchEnd
341 $ recursiveMatching (phyloProximity $ getConfig phylo)
342 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
343 + (getThresholdStep $ phyloProximity $ getConfig phylo))
344 (getTimeFrame $ timeUnit $ getConfig phylo)
345 (getPeriodIds phylo)
346 (phylo ^. phylo_timeDocs) quality branches
347 -- | 3) process the quality score
348 quality :: Double
349 quality = toPhyloQuality branches
350 -- | 2) group into branches
351 branches :: [[PhyloGroup]]
352 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
353 -- | 1) for each group process an initial temporal Matching
354 groups' :: [PhyloGroup]
355 groups' = phyloBranchMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
356 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
357 (phylo ^. phylo_timeDocs)
358 (traceTemporalMatching $ getGroupsFromLevel 1 phylo)