]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/TemporalMatching.hs
add branching
[gargantext.git] / src / Gargantext / Viz / Phylo / TemporalMatching.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.TemporalMatching
3 Description : Module dedicated to the adaptative temporal matching of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.TemporalMatching where
17
18 import Data.List (concat, splitAt, tail, sortOn, (++), intersect, null, inits, find, groupBy, scanl, any, nub, union)
19 import Data.Map (Map, fromList, toList, fromListWith, filterWithKey, elems, restrictKeys, unionWith, intersectionWith, member, (!))
20
21 import Gargantext.Prelude
22 import Gargantext.Viz.AdaptativePhylo
23 import Gargantext.Viz.Phylo.PhyloTools
24 import Gargantext.Viz.Phylo.SynchronicClustering
25
26 import Control.Lens hiding (Level)
27
28 import qualified Data.Set as Set
29
30
31 -------------------
32 -- | Proximity | --
33 -------------------
34
35
36 -- | Process the inverse sumLog
37 sumInvLog :: Double -> [Double] -> Double
38 sumInvLog s l = foldl (\mem x -> mem + (1 / log (s + x))) 0 l
39
40
41 -- | Process the sumLog
42 sumLog :: Double -> [Double] -> Double
43 sumLog s l = foldl (\mem x -> mem + log (s + x)) 0 l
44
45
46 -- | To compute a jaccard similarity between two lists
47 jaccard :: [Int] -> [Int] -> Double
48 jaccard inter' union' = ((fromIntegral . length) $ inter') / ((fromIntegral . length) $ union')
49
50
51 -- | To process a WeighedLogJaccard distance between to coocurency matrix
52 weightedLogJaccard :: Double -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
53 weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
54 | null ngramsInter = 0
55 | ngramsInter == ngramsUnion = 1
56 | sens == 0 = jaccard ngramsInter ngramsUnion
57 | sens > 0 = (sumInvLog sens coocInter) / (sumInvLog sens coocUnion)
58 | otherwise = (sumLog sens coocInter) / (sumLog sens coocUnion)
59 where
60 --------------------------------------
61 ngramsInter :: [Int]
62 ngramsInter = intersect ngrams ngrams'
63 --------------------------------------
64 ngramsUnion :: [Int]
65 ngramsUnion = union ngrams ngrams'
66 --------------------------------------
67 coocInter :: [Double]
68 coocInter = elems $ map (/docs) $ intersectionWith (+) cooc cooc'
69 --------------------------------------
70 coocUnion :: [Double]
71 coocUnion = elems $ map (/docs) $ unionWith (+) cooc cooc'
72 --------------------------------------
73
74
75 -- | To choose a proximity function
76 pickProximity :: Proximity -> Double -> Cooc -> Cooc -> [Int] -> [Int] -> Double
77 pickProximity proximity docs cooc cooc' ngrams ngrams' = case proximity of
78 WeightedLogJaccard sens _ _ -> weightedLogJaccard sens docs cooc cooc' ngrams ngrams'
79 Hamming -> undefined
80
81
82 -- | To process the proximity between a current group and a pair of targets group
83 toProximity :: Map Date Double -> Proximity -> PhyloGroup -> PhyloGroup -> PhyloGroup -> Double
84 toProximity docs proximity group target target' =
85 let docs' = sum $ elems docs
86 cooc = if target == target'
87 then (target ^. phylo_groupCooc)
88 else sumCooc (target ^. phylo_groupCooc) (target' ^. phylo_groupCooc)
89 ngrams = if target == target'
90 then (target ^. phylo_groupNgrams)
91 else union (target ^. phylo_groupNgrams) (target' ^. phylo_groupNgrams)
92 in pickProximity proximity docs' (group ^. phylo_groupCooc) cooc (group ^. phylo_groupNgrams) ngrams
93
94
95 ------------------------
96 -- | Local Matching | --
97 ------------------------
98
99
100 -- | Find pairs of valuable candidates to be matched
101 makePairs :: [PhyloGroup] -> [PhyloPeriodId] -> Map Date Double -> PhyloGroup -> [(PhyloGroup,PhyloGroup)]
102 makePairs candidates periods docs group = case null periods of
103 True -> []
104 -- | at least on of the pair candidates should be from the last added period
105 False -> filter (\(cdt,cdt') -> (inLastPeriod cdt periods)
106 || (inLastPeriod cdt' periods))
107 $ listToKeys candidates
108 where
109 inLastPeriod :: PhyloGroup -> [PhyloPeriodId] -> Bool
110 inLastPeriod g prds = (g ^. phylo_groupPeriod) == (last' "makePairs" prds)
111
112
113 phyloGroupMatching :: [[PhyloGroup]] -> Filiation -> Proximity -> Map Date Double -> PhyloGroup -> PhyloGroup
114 phyloGroupMatching candidates fil proxi docs group = case pointers of
115 Nothing -> addPointers group fil TemporalPointer []
116 Just pts -> addPointers group fil TemporalPointer
117 $ head' "phyloGroupMatching"
118 -- | Keep only the best set of pointers grouped by proximity
119 $ groupBy (\pt pt' -> snd pt == snd pt')
120 $ reverse $ sortOn snd pts
121 -- | Find the first time frame where at leats one pointer satisfies the proximity threshold
122 where
123 pointers :: Maybe [Pointer]
124 pointers = find (not . null)
125 -- | for each time frame, process the proximity on relevant pairs of targeted groups
126 $ scanl (\acc groups ->
127 let periods = nub $ map (\g' -> g' ^. phylo_groupPeriod) $ concat groups
128 pairs = makePairs (concat groups) periods docs group
129 in acc ++ ( concat
130 $ map (\(c,c') ->
131 -- | process the proximity between the current group and a pair of candidates
132 let proximity = toProximity (filterDocs docs periods) proxi group c c'
133 in if (c == c')
134 then [(getGroupId c,proximity)]
135 else [(getGroupId c,proximity),(getGroupId c',proximity)] ) pairs)
136 ) []
137 -- | groups from [[1900],[1900,1901],[1900,1901,1902],...]
138 $ inits candidates
139 --------------------------------------
140 filterDocs :: Map Date Double -> [PhyloPeriodId] -> Map Date Double
141 filterDocs d pds = restrictKeys d $ periodsToYears pds
142
143
144
145 ------------------
146 -- | Pointers | --
147 ------------------
148
149
150 -- ghostHunter :: [[PhyloGroup]] -> [[PhyloGroup]]
151 -- ghostHunter branches =
152 -- map (\branch ->
153 -- -- | il manque une référence au group source de chaque pointer
154 -- let pointers = elems $ fromList
155 -- $ map (\pt -> (groupIds ! (fst pt),pt))
156 -- $ filter (\pt -> member (fst pt) groupIds) $ concat $ map (\g -> g ^. phylo_groupGhostPointers) branch
157
158 -- in undefined
159 -- ) branches
160 -- where
161 -- groupIds :: Map PhyloGroupId Int
162 -- groupIds = fromList $ map (\g -> (getGroupId g, last' "ghostHunter" $ snd $ g ^. phylo_groupBranchId)) $ concat branches
163 -- --------------------------------------
164 -- selectBest :: [Pointers] -> [Pointers]
165 -- se
166
167
168
169 filterPointers :: Double -> [PhyloGroup] -> [PhyloGroup]
170 filterPointers thr groups =
171 map (\group ->
172 let ghosts = filter (\(_,w) -> w < thr) $ group ^. phylo_groupPeriodParents
173 in group & phylo_groupPeriodParents %~ (filter (\(_,w) -> w >= thr))
174 & phylo_groupPeriodChilds %~ (filter (\(_,w) -> w >= thr))
175 & phylo_groupGhostPointers %~ (++ ghosts)
176 ) groups
177
178
179 -----------------------------
180 -- | Adaptative Matching | --
181 -----------------------------
182
183
184 getNextPeriods :: Filiation -> Int -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
185 getNextPeriods fil max pId pIds =
186 case fil of
187 ToChilds -> take max $ (tail . snd) $ splitAt (elemIndex' pId pIds) pIds
188 ToParents -> take max $ (reverse . fst) $ splitAt (elemIndex' pId pIds) pIds
189
190
191 getCandidates :: Filiation -> PhyloGroup -> [PhyloPeriodId] -> [PhyloGroup] -> [[PhyloGroup]]
192 getCandidates fil g pIds targets =
193 case fil of
194 ToChilds -> targets'
195 ToParents -> reverse targets'
196 where
197 targets' :: [[PhyloGroup]]
198 targets' = map (\groups' -> filter (\g' -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) groups') $ elems
199 $ filterWithKey (\k _ -> elem k pIds)
200 $ fromListWith (++)
201 $ sortOn (fst . fst)
202 $ map (\g' -> (g' ^. phylo_groupPeriod,[g'])) targets
203
204
205 shouldBreak :: Double -> [(Double,[PhyloGroup])] -> Bool
206 shouldBreak thr branches = any (\(quality,_) -> quality < thr) branches
207
208
209 toBranchQuality :: [[PhyloGroup]] -> [(Double,[PhyloGroup])]
210 toBranchQuality branches = undefined
211
212
213 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
214 groupsToBranches groups =
215 -- | run the related component algorithm
216 let graph = zip [1..]
217 $ relatedComponents
218 $ map (\group -> [getGroupId group]
219 ++ (map fst $ group ^. phylo_groupPeriodParents)
220 ++ (map fst $ group ^. phylo_groupPeriodChilds) ) $ elems groups
221 -- | update each group's branch id
222 in map (\(bId,ids) ->
223 map (\group -> group & phylo_groupBranchId %~ (\(lvl,lst) -> (lvl,lst ++ [bId])))
224 $ elems $ restrictKeys groups (Set.fromList ids)
225 ) graph
226
227
228
229 -- findGhostLinks :: [Link] -> [[Link]] -> Map PhyloGroupId
230
231 adaptativeMatching :: Proximity -> Double -> Double -> [PhyloGroup] -> [PhyloGroup]
232 adaptativeMatching proximity thr thrQua groups =
233 -- | check if we should break some of the new branches or not
234 case shouldBreak thrQua branches' of
235 True -> concat $ map (\(s,b) ->
236 if s >= thrQua
237 -- | we keep the branch as it is
238 then b
239 -- | we break the branch using an increased temporal matching threshold
240 else let nextGroups = undefined
241 in adaptativeMatching proximity (thr + (getThresholdStep proximity)) thrQua nextGroups
242 ) branches'
243 -- | the quality of all the new branches is sufficient
244 False -> concat branches
245 where
246 -- | 3) process a quality score for each new branch
247 branches' :: [(Double,[PhyloGroup])]
248 branches' = toBranchQuality branches
249 -- | 2) group the new groups into branches
250 branches :: [[PhyloGroup]]
251 branches = groupsToBranches $ fromList $ map (\group -> (getGroupId group, group)) groups'
252 -- | 1) filter the pointers of each groups regarding the current state of the quality threshold
253 groups' :: [PhyloGroup]
254 groups' = filterPointers thr groups
255
256
257 temporalMatching :: Phylo -> Phylo
258 temporalMatching phylo = updatePhyloGroups 1 branches phylo
259 where
260 -- | 4) find the ghost links and postprocess the branches
261 branches' :: Map PhyloGroupId PhyloGroup
262 branches' = undefined
263 -- | 3) run the adaptative matching to find the best repartition among branches
264 branches :: Map PhyloGroupId PhyloGroup
265 branches = fromList
266 $ map (\g -> (getGroupId g, g))
267 $ adaptativeMatching proximity (getThresholdInit proximity) (phyloQuality $ getConfig phylo) groups'
268 -- | 2) for each group process an initial temporal Matching
269 groups' :: [PhyloGroup]
270 groups' =
271 let maxTime = getTimeFrame $ timeUnit $ getConfig phylo
272 periods = getPeriodIds phylo
273 docs = phylo ^. phylo_timeDocs
274 --------------------------------------
275 in map (\group ->
276 let childs = getCandidates ToChilds group
277 (getNextPeriods ToChilds maxTime (group ^. phylo_groupPeriod) periods) groups
278 parents = getCandidates ToParents group
279 (getNextPeriods ToParents maxTime (group ^. phylo_groupPeriod) periods) groups
280 in phyloGroupMatching parents ToParents proximity docs
281 $ phyloGroupMatching childs ToChilds proximity docs group
282 ) groups
283 -- | 1) start with all the groups from a given level
284 groups :: [PhyloGroup]
285 groups = getGroupsFromLevel 1 phylo
286 --------------------------------------
287 proximity :: Proximity
288 proximity = phyloProximity $ getConfig phylo