]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/LinkMaker.hs
add rebranching to link distante branches
[gargantext.git] / src / Gargantext / Viz / Phylo / LinkMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Viz.Phylo.LinkMaker
18 where
19
20 import Control.Lens hiding (both, Level)
21 import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, sort, delete, intersect, nub, groupBy)
22 import Data.Tuple.Extra
23 import Data.Map (Map,(!),fromListWith,elems,restrictKeys,unionWith,member)
24 import Gargantext.Prelude
25 import Gargantext.Viz.Phylo
26 import Gargantext.Viz.Phylo.Tools
27 import Gargantext.Viz.Phylo.Metrics.Proximity
28 import qualified Data.List as List
29 import qualified Data.Maybe as Maybe
30 import qualified Data.Map as Map
31
32 import qualified Data.Vector.Storable as VS
33 import Debug.Trace (trace)
34 import Numeric.Statistics (percentile)
35
36 -----------------------------
37 -- | From Level to level | --
38 -----------------------------
39
40
41 -- | To choose a LevelLink strategy based an a given Level
42 shouldLink :: (Level,Level) -> PhyloGroup -> PhyloGroup -> Bool
43 shouldLink (lvl,lvl') g g'
44 | (lvl <= 1) && (lvl' <= 1) = doesContainsOrd (getGroupNgrams g) (getGroupNgrams g')
45 | otherwise = elem (getGroupId g) (getGroupLevelChildsId g')
46
47
48 -- | To set the LevelLinks between a given PhyloGroup and a list of childs/parents PhyloGroups
49 linkGroupToGroups :: (Level,Level) -> PhyloGroup -> [PhyloGroup] -> PhyloGroup
50 linkGroupToGroups (lvl,lvl') current targets
51 | lvl < lvl' = setLevelParents current
52 | lvl > lvl' = setLevelChilds current
53 | otherwise = current
54 where
55 --------------------------------------
56 setLevelChilds :: PhyloGroup -> PhyloGroup
57 setLevelChilds = over (phylo_groupLevelChilds) addPointers
58 --------------------------------------
59 setLevelParents :: PhyloGroup -> PhyloGroup
60 setLevelParents = over (phylo_groupLevelParents) addPointers
61 --------------------------------------
62 addPointers :: [Pointer] -> [Pointer]
63 addPointers lp = lp ++ Maybe.mapMaybe (\target ->
64 if shouldLink (lvl,lvl') current target
65 then Just ((getGroupId target),1)
66 else Nothing) targets
67 --------------------------------------
68
69
70 -- | To set the LevelLink of all the PhyloGroups of a Phylo
71 setLevelLinks :: (Level,Level) -> Phylo -> Phylo
72 setLevelLinks (lvl,lvl') p = alterPhyloGroups (\groups ->
73 map (\group -> if getGroupLevel group == lvl
74 then linkGroupToGroups (lvl,lvl') group
75 $ filterCandidates group
76 $ getGroupsWithFilters lvl' (getGroupPeriod group) p
77 else group) groups) p
78
79
80 -------------------------------
81 -- | From Period to Period | --
82 -------------------------------
83
84
85 -- | To get the next or previous PhyloPeriod based on a given PhyloPeriodId
86 getNextPeriods :: Filiation -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
87 getNextPeriods to' id l = case to' of
88 Descendant -> (tail . snd) next
89 Ascendant -> (reverse . fst) next
90 _ -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] Filiation type not defined")
91 where
92 --------------------------------------
93 next :: ([PhyloPeriodId], [PhyloPeriodId])
94 next = splitAt idx l
95 --------------------------------------
96 idx :: Int
97 idx = case (List.elemIndex id l) of
98 Nothing -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] PhyloPeriodId not defined")
99 Just i -> i
100 --------------------------------------
101
102
103 -- | To get the number of docs produced during a list of periods
104 periodsToNbDocs :: [PhyloPeriodId] -> Phylo -> Double
105 periodsToNbDocs prds phylo = sum $ elems
106 $ restrictKeys (phylo ^. phylo_docsByYears)
107 $ periodsToYears prds
108
109
110 -- | To process a given Proximity
111 processProximity :: Proximity -> Map (Int, Int) Double -> Map (Int, Int) Double -> Double -> Double
112 processProximity proximity cooc cooc' nbDocs = case proximity of
113 WeightedLogJaccard (WLJParams _ sens) -> weightedLogJaccard sens cooc cooc' nbDocs
114 Hamming (HammingParams _) -> hamming cooc cooc'
115 _ -> panic "[ERR][Viz.Phylo.LinkMaker.processProximity] Unknown proximity"
116
117
118
119 -- | Find the best candidates to be time-linked with a group g1 (recursively until the limit of periods is reached)
120 -- | 1) find the next periods and get the mini cooc matrix of g1
121 -- | 2) build the pairs of candidates (single groups or tuples)
122 -- | 3) process the proximity mesure and select the best ones to create the pointers (ie: all the max)
123 findBestCandidates :: Filiation -> Int -> Int -> Proximity -> [(Date,Date)] -> [PhyloGroup] -> PhyloGroup -> Phylo -> ([Pointer],[Double])
124 findBestCandidates filiation depth limit proximity periods candidates g1 phylo
125 | depth > limit || null nextPeriods = ([],[])
126 | (not . null) pointers = (head' "findBestCandidates" $ groupBy (\x y -> snd x == snd y) pointers
127 ,map snd similarities)
128 | otherwise = findBestCandidates filiation (depth + 1) limit proximity periods candidates g1 phylo
129 where
130 --------------------------------------
131 pointers :: [(PhyloGroupId, Double)]
132 pointers = reverse $ sortOn snd $ filter (\(_,score) -> case proximity of
133 WeightedLogJaccard (WLJParams thr _) -> score >= thr
134 Hamming (HammingParams thr) -> score <= thr
135 _ -> panic "[ERR][Viz.Phylo.LinkMaker.findBestCandidates] Unknown proximity"
136 ) similarities
137 --------------------------------------
138 similarities :: [(PhyloGroupId, Double)]
139 similarities = concat $ map (\(g2,g3) -> let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] phylo
140 cooc2 = getGroupCooc g2
141 cooc3 = getGroupCooc g3
142 score = processProximity proximity cooc1 (unionWith (+) cooc2 cooc3) nbDocs
143 in nub $ [(getGroupId g2,score),(getGroupId g3,score)]) pairsOfCandidates
144 --------------------------------------
145 pairsOfCandidates :: [(PhyloGroup,PhyloGroup)]
146 pairsOfCandidates = listToFullCombi $ filter (\g -> elem (getGroupPeriod g) nextPeriods) candidates
147 --------------------------------------
148 cooc1 :: Map (Int,Int) Double
149 cooc1 = getGroupCooc g1
150 --------------------------------------
151 nextPeriods :: [(Date,Date)]
152 nextPeriods = take depth periods
153 --------------------------------------
154
155
156 findBestCandidates' :: Proximity -> [PhyloGroup] -> PhyloGroup -> Phylo -> [Pointer]
157 findBestCandidates' proximity candidates g1 phylo = pointers
158 where
159 --------------------------------------
160 pointers :: [(PhyloGroupId, Double)]
161 pointers = reverse $ sortOn snd $ filter (\(_,score) -> case proximity of
162 WeightedLogJaccard (WLJParams thr _) -> score >= thr
163 Hamming (HammingParams thr) -> score <= thr
164 _ -> panic "[ERR][Viz.Phylo.LinkMaker.findBestCandidates'] Unknown proximity"
165 ) similarities
166 --------------------------------------
167 similarities :: [(PhyloGroupId, Double)]
168 similarities = concat $ map (\(g2,g3) -> let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] phylo
169 cooc2 = getGroupCooc g2
170 cooc3 = getGroupCooc g3
171 score = processProximity proximity cooc1 (unionWith (+) cooc2 cooc3) nbDocs
172 in nub $ [(getGroupId g2,score),(getGroupId g3,score)]) pairsOfCandidates
173 --------------------------------------
174 pairsOfCandidates :: [(PhyloGroup,PhyloGroup)]
175 pairsOfCandidates = listToFullCombi candidates
176 --------------------------------------
177 cooc1 :: Map (Int,Int) Double
178 cooc1 = getGroupCooc g1
179 --------------------------------------
180
181
182 -- | To add some Pointer to a PhyloGroup
183 addPointers' :: Filiation -> [Pointer] -> PhyloGroup -> PhyloGroup
184 addPointers' fil pts g = g & case fil of
185 Descendant -> phylo_groupPeriodChilds %~ (++ pts)
186 Ascendant -> phylo_groupPeriodParents %~ (++ pts)
187 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.addPointers] Wrong type of filiation")
188
189
190
191 -- | To update a list of phyloGroups with some Pointers
192 updateGroups :: Filiation -> Level -> Map PhyloGroupId [Pointer] -> Phylo -> Phylo
193 updateGroups fil lvl m p = alterPhyloGroups (\gs -> map (\g -> if ((getGroupLevel g) == lvl) && (member (getGroupId g) m)
194 then addPointers' fil (m ! (getGroupId g)) g
195 else g ) gs) p
196
197
198
199 -- | Optimisation : to keep only the groups that have at least one ngrams in commons with the target
200 filterCandidates :: PhyloGroup -> [PhyloGroup] -> [PhyloGroup]
201 filterCandidates g gs = filter (\g' -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
202 $ delete g gs
203
204
205 -- | a init avec la [[head groups]] et la tail groups
206 toBranches :: [[PhyloGroup]] -> [PhyloGroup] -> [[PhyloGroup]]
207 toBranches mem gs
208 | null gs = mem
209 | otherwise = toBranches mem' $ tail gs
210 where
211 --------------------------------------
212 mem' :: [[PhyloGroup]]
213 mem' = if (null withHead)
214 then mem ++ [[head' "toBranches" gs]]
215 else (filter (\gs' -> not $ elem gs' withHead) mem)
216 ++
217 [(concat withHead) ++ [head' "toBranches" gs]]
218 --------------------------------------
219 withHead :: [[PhyloGroup]]
220 withHead = filter (\gs' -> (not . null)
221 $ intersect (concat $ map getGroupNgrams gs')
222 (getGroupNgrams $ (head' "toBranches" gs))
223 ) mem
224 --------------------------------------
225
226
227 -- | To process an intertemporal matching task to a Phylo at a given level
228 -- | 1) split all groups (of the level) in branches (ie:related components sharing at least one ngram)
229 -- | 2) for each branch, for each group find the best candidates (by Filiation and Proximity) and create the corresponding pointers
230 -- | 3) update all the groups with the new pointers if they exist
231 interTempoMatching :: Filiation -> Level -> Proximity -> Phylo -> Phylo
232 interTempoMatching fil lvl prox p = traceMatching fil lvl (getThreshold prox) debug $ updateGroups fil lvl pointersMap p
233 where
234 --------------------------------------
235 debug :: [Double]
236 debug = sort $ concat $ map (snd . snd) pointers
237 --------------------------------------
238 pointersMap :: Map PhyloGroupId [Pointer]
239 pointersMap = Map.fromList $ map (\(id,x) -> (id,fst x)) pointers
240 --------------------------------------
241 pointers :: [(PhyloGroupId,([Pointer],[Double]))]
242 pointers = concat
243 $ map (\branche ->
244 map (\g -> ( getGroupId g
245 , findBestCandidates fil 1 (getPhyloMatchingFrame p) prox (getNextPeriods fil (getGroupPeriod g) (getPhyloPeriods p)) (filterCandidates g branche) g p )
246 ) branche ) branches
247 --------------------------------------
248 branches :: [[PhyloGroup]]
249 branches = tracePreBranches
250 $ toBranches [[head' "interTempoMatching" (getGroupsWithLevel lvl p)]]
251 $ tail (getGroupsWithLevel lvl p)
252 --------------------------------------
253
254
255 ------------------------------------------------------------------------
256 -- | Make links from Period to Period after level 1
257
258 toLevelUp :: [Pointer] -> Phylo -> [Pointer]
259 toLevelUp lst p = Map.toList
260 $ map (\ws -> maximum ws)
261 $ fromListWith (++) [(id, [w]) | (id, w) <- pointers]
262 where
263 --------------------------------------
264 pointers :: [Pointer]
265 pointers = map (\(id,v) -> (getGroupLevelParentId $ getGroupFromId id p, v)) lst
266 --------------------------------------
267
268
269 -- | Transpose the parent/child pointers from one level to another
270 transposePeriodLinks :: Level -> Phylo -> Phylo
271 transposePeriodLinks lvl p = alterGroupWithLevel
272 (\g ->
273 --------------------------------------
274 let childs = getGroupsFromIds (map fst $ getGroupLevelChilds g) p
275 ascLink = toLevelUp (concat $ map getGroupPeriodParents childs) p
276 desLink = toLevelUp (concat $ map getGroupPeriodChilds childs) p
277 --------------------------------------
278 in g & phylo_groupPeriodParents %~ (++ ascLink)
279 & phylo_groupPeriodChilds %~ (++ desLink)
280 --------------------------------------
281 ) lvl p
282
283
284 ----------------
285 -- | Tracer | --
286 ----------------
287
288
289 traceMatching :: Filiation -> Level -> Double -> [Double] -> Phylo -> Phylo
290 traceMatching fil lvl thr lst p = trace ( "----\n" <> show (fil) <> " unfiltered temporal Matching in Phylo" <> show (lvl) <> " :\n"
291 <> "count : " <> show (length lst) <> " potential pointers (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
292 <> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
293 <> show (percentile 50 (VS.fromList lst)) <> " (50%) "
294 <> show (percentile 75 (VS.fromList lst)) <> " (75%) "
295 <> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") p
296
297 tracePreBranches :: [[PhyloGroup]] -> [[PhyloGroup]]
298 tracePreBranches bs = trace (show (length bs) <> " pre-branches" <> "\n"
299 <> "with sizes : " <> show (map length bs) <> "\n") bs
300