]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
working on perf
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, nub, union, elemIndex, (!!))
19 import Data.Map (Map, fromList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, findWithDefault)
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24
25 import Debug.Trace (trace)
26 import Prelude (logBase)
27 import Control.Lens hiding (Level)
28 import Control.Parallel.Strategies (parList, rdeepseq, using)
29
30 import qualified Data.Set as Set
31
32
33 -------------------
34 -- | Proximity | --
35 -------------------
36
37
38 -- | Process the inverse sumLog
39 sumInvLog :: Double -> [Double] -> Double
40 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
41
42
43 -- | Process the sumLog
44 sumLog :: Double -> [Double] -> Double
45 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
46
47
48 -- | To compute a jaccard similarity between two lists
49 jaccard :: [Int] -> [Int] -> Double
50 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
51
52
53 -- | To process a WeighedLogJaccard distance between to coocurency matrix
54 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
55 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
56 | null ngramsInter = 0
57 | ngramsInter == ngramsUnion = 1
58 | sens == 0 = jaccard ngramsInter ngramsUnion
59 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
60 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
61 where
62 --------------------------------------
63 ngramsInter :: [Int]
64 ngramsInter = intersect ngrams ngrams'
65 --------------------------------------
66 ngramsUnion :: [Int]
67 ngramsUnion = union ngrams ngrams'
68 --------------------------------------
69 coocInter :: [Double]
70 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
71 --------------------------------------
72 coocUnion :: [Double]
73 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
74 --------------------------------------
75
76
77 -- | To choose a proximity function
78 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
79 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
80 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
81 Hamming -> undefined
82
83
84 filterProximity :: Proximity -> Double -> Double -> Bool
85 filterProximity proximity thr local =
86 case proximity of
87 WeightedLogJaccard _ _ _ -> local >= thr
88 Hamming -> undefined
89
90
91 -- | To process the proximity between a current group and a pair of targets group
92 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
93 toProximity docs proximity ego target target' =
94 let docs' = sum $ elems docs
95 cooc = if target == target'
96 then (target ^. phylo_groupCooc)
97 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
98 ngrams = if target == target'
99 then (target ^. phylo_groupNgrams)
100 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
101 in pickProximity proximity docs' (ego ^. phylo_groupCooc) cooc (ego ^. phylo_groupNgrams) ngrams
102
103
104 ------------------------
105 -- | Local Matching | --
106 ------------------------
107
108
109 -- | Find pairs of valuable candidates to be matched
110 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> [(PhyloGroup,PhyloGroup)]
111 makePairs candidates periods = case null periods of
112 True -> []
113 -- | at least on of the pair candidates should be from the last added period
114 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
115 || (inLastPeriod cdt' periods))
116 $ listToKeys candidates
117 where
118 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
119 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
120
121
122 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> Double-> PhyloGroup -> PhyloGroup
123 phyloGroupMatching candidates fil proxi docs thr ego = case pointers of
124 Nothing -> addPointers ego fil TemporalPointer []
125 Just pts -> addPointers ego fil TemporalPointer
126 $ head' "phyloGroupMatching"
127 -- | Keep only the best set of pointers grouped by proximity
128 $ groupBy (\pt pt' -> snd pt == snd pt')
129 $ reverse $ sortOn snd pts
130 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
131 where
132 pointers :: Maybe [Pointer]
133 pointers = find (not . null)
134 -- | for each time frame, process the proximity on relevant pairs of targeted groups
135 $ scanl (\acc groups ->
136 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
137 pairs = makePairs (concat groups) periods
138 in acc ++ ( filter (\(_,proximity) -> filterProximity proxi thr proximity)
139 $ concat
140 $ map (\(c,c') ->
141 -- | process the proximity between the current group and a pair of candidates
142 let proximity = toProximity (filterDocs docs periods) proxi ego c c'
143 in if (c == c')
144 then [(getGroupId c,proximity)]
145 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
146 ) []
147 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
148 $ inits candidates
149 --------------------------------------
150 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
151 filterDocs d pds = restrictKeys d $ periodsToYears pds
152
153
154 -----------------------------
155 -- | Matching Processing | --
156 -----------------------------
157
158
159 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
160 getNextPeriods fil max' pId pIds =
161 case fil of
162 ToChilds -> take max' $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
163 ToParents -> take max' $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
164
165
166 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
167 getCandidates fil ego pIds targets =
168 case fil of
169 ToChilds -> targets'
170 ToParents -> reverse targets'
171 where
172 targets' :: [[PhyloGroup]]
173 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (ego ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
174 $ filterWithKey (\k _ -> elem k pIds)
175 $ fromListWith (++)
176 $ sortOn (fst . fst)
177 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
178
179
180 processMatching :: Int -> [PhyloPeriodId] -> Proximity -> Double -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
181 processMatching max' periods proximity thr docs groups =
182 let branche = map (\group ->
183 let childs = getCandidates ToChilds group
184 (getNextPeriods ToChilds max' (group ^. phylo_groupPeriod) periods) groups
185 parents = getCandidates ToParents group
186 (getNextPeriods ToParents max' (group ^. phylo_groupPeriod) periods) groups
187 in phyloGroupMatching parents ToParents proximity docs thr
188 $ phyloGroupMatching childs ToChilds proximity docs thr group
189 ) groups
190 branche' = branche `using` parList rdeepseq
191 in branche'
192
193
194 -----------------------
195 -- | Phylo Quality | --
196 -----------------------
197
198
199 termFreq :: Int -> [[PhyloGroup]] -> Double
200 termFreq term branches = (sum $ map (\g -> findWithDefault 0 (term,term) (g ^. phylo_groupCooc)) $ concat branches)
201 / (sum $ map (\g -> getTrace $ g ^. phylo_groupCooc) $ concat branches)
202
203
204 entropy :: [[PhyloGroup]] -> Double
205 entropy branches =
206 let terms = ngramsInBranches branches
207 in sum $ map (\term -> (1 / log (termFreq term branches))
208 / (sum $ map (\branch -> 1 / log (termFreq term [branch])) branches)
209 * (sum $ map (\branch ->
210 let q = branchObs term (length $ concat branches) branch
211 in if (q == 0)
212 then 0
213 else - q * logBase 2 q ) branches) ) terms
214 where
215 -- | Probability to observe a branch given a random term of the phylo
216 branchObs :: Int -> Int -> [PhyloGroup] -> Double
217 branchObs term total branch = (fromIntegral $ length $ filter (\g -> elem term $ g ^. phylo_groupNgrams) branch)
218 / (fromIntegral total)
219
220
221 homogeneity :: [[PhyloGroup]] -> Double
222 homogeneity branches =
223 let nbGroups = length $ concat branches
224 in sum
225 $ map (\branch -> (if (length branch == nbGroups)
226 then 1
227 else (1 / log (branchCov branch nbGroups))
228 / (sum $ map (\branch' -> 1 / log (branchCov branch' nbGroups)) branches))
229 * (sum $ map (\term -> (termFreq term branches)
230 / (sum $ map (\term' -> termFreq term' branches) $ ngramsInBranches [branch])
231 * (fromIntegral $ sum $ ngramsInBranches [filter (\g -> elem term $ g ^. phylo_groupNgrams) branch])
232 / (fromIntegral $ sum $ ngramsInBranches [branch])
233 ) $ ngramsInBranches [branch]) ) branches
234 where
235 branchCov :: [PhyloGroup] -> Int -> Double
236 branchCov branch total = (fromIntegral $ length branch) / (fromIntegral total)
237
238
239 toPhyloQuality :: [[PhyloGroup]] -> Double
240 toPhyloQuality branches = sqrt (homogeneity branches / entropy branches)
241
242
243 -----------------------------
244 -- | Adaptative Matching | --
245 -----------------------------
246
247
248 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
249 groupsToBranches groups =
250 -- | run the related component algorithm
251 let graph = zip [1..]
252 $ relatedComponents
253 $ map (\group -> [getGroupId group]
254 ++ (map fst $ group ^. phylo_groupPeriodParents)
255 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
256 -- | update each group's branch id
257 in map (\(bId,ids) ->
258 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
259 $ elems $ restrictKeys groups (Set.fromList ids)
260 ) graph
261
262
263 recursiveMatching :: Proximity -> Double -> Int -> [PhyloPeriodId] -> Map Date Double -> Double -> [[PhyloGroup]] -> [PhyloGroup]
264 recursiveMatching proximity thr frame periods docs quality branches =
265 if (length branches == (length $ concat branches))
266 then concat branches
267 else if thr > 1
268 then concat branches
269 else
270 case quality <= (sum nextQualities) of
271 -- | success : the new threshold improves the quality score, let's go deeper (traceMatchSuccess thr quality (sum nextQualities))
272 True -> concat
273 $ map (\branches' ->
274 let idx = fromJust $ elemIndex branches' nextBranches
275 in recursiveMatching proximity (thr + (getThresholdStep proximity)) frame periods docs (nextQualities !! idx) branches')
276 $ nextBranches
277 -- | failure : last step was a local maximum of quality, let's validate it (traceMatchFailure thr quality (sum nextQualities))
278 False -> concat branches
279 where
280 -- | 2) for each of the possible next branches process the phyloQuality score
281 nextQualities :: [Double]
282 nextQualities = map toPhyloQuality nextBranches
283 -- | 1) for each local branch process a temporal matching then find the resulting branches
284 nextBranches :: [[[PhyloGroup]]]
285 nextBranches =
286 -- let next =
287 map (\branch ->
288 let branch' = processMatching frame periods proximity thr docs branch
289 in groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) branch'
290 ) branches
291 -- next' = next `using` parList rdeepseq
292 -- in next
293
294
295
296 temporalMatching :: Phylo -> Phylo
297 temporalMatching phylo = updatePhyloGroups 1 branches' phylo
298 where
299 -- | 4) run the recursive matching to find the best repartition among branches
300 branches' :: Map PhyloGroupId PhyloGroup
301 branches' = fromList
302 $ map (\g -> (getGroupId g, g))
303 $ traceMatchEnd
304 $ recursiveMatching (phyloProximity $ getConfig phylo)
305 ( (getThresholdInit $ phyloProximity $ getConfig phylo)
306 + (getThresholdStep $ phyloProximity $ getConfig phylo))
307 (getTimeFrame $ timeUnit $ getConfig phylo)
308 (getPeriodIds phylo)
309 (phylo ^. phylo_timeDocs) quality branches
310 -- | 3) process the quality score
311 quality :: Double
312 quality = toPhyloQuality branches
313 -- | 2) group into branches
314 branches :: [[PhyloGroup]]
315 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group))
316 $ trace ("\n" <> "-- | Init temporal matching for " <> show (length $ groups') <> " groups" <> "\n") groups'
317 -- | 1) for each group process an initial temporal Matching
318 groups' :: [PhyloGroup]
319 groups' = processMatching (getTimeFrame $ timeUnit $ getConfig phylo) (getPeriodIds phylo)
320 (phyloProximity $ getConfig phylo) (getThresholdInit $ phyloProximity $ getConfig phylo)
321 (phylo ^. phylo_timeDocs) (getGroupsFromLevel 1 phylo)