]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/LinkMaker.hs
fix the diagonal issue
[gargantext.git] / src / Gargantext / Viz / Phylo / LinkMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10
11 -}
12
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
16
17 module Gargantext.Viz.Phylo.LinkMaker
18 where
19
20 import Control.Lens hiding (both, Level)
21 import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, sort, delete, intersect, nub, groupBy, union)
22 import Data.Tuple.Extra
23 import Data.Map (Map,(!),fromListWith,elems,restrictKeys,unionWith,member)
24 import Gargantext.Prelude
25 import Gargantext.Viz.Phylo
26 import Gargantext.Viz.Phylo.Tools
27 import Gargantext.Viz.Phylo.Metrics.Proximity
28 import qualified Data.List as List
29 import qualified Data.Maybe as Maybe
30 import qualified Data.Map as Map
31
32 import qualified Data.Vector.Storable as VS
33 import Debug.Trace (trace)
34 import Numeric.Statistics (percentile)
35
36 -----------------------------
37 -- | From Level to level | --
38 -----------------------------
39
40
41 -- | To choose a LevelLink strategy based an a given Level
42 shouldLink :: (Level,Level) -> PhyloGroup -> PhyloGroup -> Bool
43 shouldLink (lvl,lvl') g g'
44 | (lvl <= 1) && (lvl' <= 1) = doesContainsOrd (getGroupNgrams g) (getGroupNgrams g')
45 | otherwise = elem (getGroupId g) (getGroupLevelChildsId g')
46
47
48 -- | To set the LevelLinks between a given PhyloGroup and a list of childs/parents PhyloGroups
49 linkGroupToGroups :: (Level,Level) -> PhyloGroup -> [PhyloGroup] -> PhyloGroup
50 linkGroupToGroups (lvl,lvl') current targets
51 | lvl < lvl' = setLevelParents current
52 | lvl > lvl' = setLevelChilds current
53 | otherwise = current
54 where
55 --------------------------------------
56 setLevelChilds :: PhyloGroup -> PhyloGroup
57 setLevelChilds = over (phylo_groupLevelChilds) addPointers
58 --------------------------------------
59 setLevelParents :: PhyloGroup -> PhyloGroup
60 setLevelParents = over (phylo_groupLevelParents) addPointers
61 --------------------------------------
62 addPointers :: [Pointer] -> [Pointer]
63 addPointers lp = lp ++ Maybe.mapMaybe (\target ->
64 if shouldLink (lvl,lvl') current target
65 then Just ((getGroupId target),1)
66 else Nothing) targets
67 --------------------------------------
68
69
70 -- | To set the LevelLink of all the PhyloGroups of a Phylo
71 setLevelLinks :: (Level,Level) -> Phylo -> Phylo
72 setLevelLinks (lvl,lvl') p = alterPhyloGroups (\groups ->
73 map (\group -> if getGroupLevel group == lvl
74 then linkGroupToGroups (lvl,lvl') group
75 $ filterCandidates group
76 $ getGroupsWithFilters lvl' (getGroupPeriod group) p
77 else group) groups) p
78
79
80 -------------------------------
81 -- | From Period to Period | --
82 -------------------------------
83
84
85 -- | To get the next or previous PhyloPeriod based on a given PhyloPeriodId
86 getNextPeriods :: Filiation -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
87 getNextPeriods to' id l = case to' of
88 Descendant -> (tail . snd) next
89 Ascendant -> (reverse . fst) next
90 _ -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] Filiation type not defined")
91 where
92 --------------------------------------
93 next :: ([PhyloPeriodId], [PhyloPeriodId])
94 next = splitAt idx l
95 --------------------------------------
96 idx :: Int
97 idx = case (List.elemIndex id l) of
98 Nothing -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] PhyloPeriodId not defined")
99 Just i -> i
100 --------------------------------------
101
102
103 -- | To get the number of docs produced during a list of periods
104 periodsToNbDocs :: [PhyloPeriodId] -> Phylo -> Double
105 periodsToNbDocs prds phylo = sum $ elems
106 $ restrictKeys (phylo ^. phylo_docsByYears)
107 $ periodsToYears prds
108
109
110 -- | To process a given Proximity
111 processProximity :: Proximity -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
112 processProximity proximity nbDocs cooc cooc' ngrams ngrams' = case proximity of
113 WeightedLogJaccard (WLJParams _ sens) -> weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
114 Hamming (HammingParams _) -> hamming cooc cooc'
115 _ -> panic "[ERR][Viz.Phylo.LinkMaker.processProximity] Unknown proximity"
116
117
118
119 -- | Find the best candidates to be time-linked with a group g1 (recursively until the limit of periods is reached)
120 -- | 1) find the next periods and get the mini cooc matrix of g1
121 -- | 2) build the pairs of candidates (single groups or tuples)
122 -- | 3) process the proximity mesure and select the best ones to create the pointers (ie: all the max)
123 findBestCandidates :: Filiation -> Int -> Int -> Proximity -> [(Date,Date)] -> [PhyloGroup] -> PhyloGroup -> Phylo -> ([Pointer],[Double])
124 findBestCandidates filiation depth limit proximity periods candidates g1 phylo
125 | depth > limit || null nextPeriods = ([],[])
126 | (not . null) pointers = (head' "findBestCandidates" $ groupBy (\x y -> snd x == snd y) pointers
127 ,map snd similarities)
128 | otherwise = findBestCandidates filiation (depth + 1) limit proximity periods candidates g1 phylo
129 where
130 --------------------------------------
131 pointers :: [(PhyloGroupId, Double)]
132 pointers = reverse $ sortOn snd $ filter (\(_,score) -> case proximity of
133 WeightedLogJaccard (WLJParams thr _) -> score >= thr
134 Hamming (HammingParams thr) -> score <= thr
135 _ -> panic "[ERR][Viz.Phylo.LinkMaker.findBestCandidates] Unknown proximity"
136 ) similarities
137 --------------------------------------
138 similarities :: [(PhyloGroupId, Double)]
139 similarities = concat $ map (\(g2,g3) -> let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] phylo
140 cooc' = unionWith (+) (getGroupCooc g2) (getGroupCooc g3)
141 ngrams' = union (getGroupNgrams g2) (getGroupNgrams g3)
142 score = processProximity proximity nbDocs cooc cooc' ngrams ngrams'
143 in nub $ [(getGroupId g2,score),(getGroupId g3,score)]) pairsOfCandidates
144 --------------------------------------
145 pairsOfCandidates :: [(PhyloGroup,PhyloGroup)]
146 pairsOfCandidates = listToFullCombi $ filter (\g -> elem (getGroupPeriod g) nextPeriods) candidates
147 --------------------------------------
148 cooc :: Map (Int,Int) Double
149 cooc = getGroupCooc g1
150 --------------------------------------
151 ngrams :: [Int]
152 ngrams = getGroupNgrams g1
153 --------------------------------------
154 nextPeriods :: [(Date,Date)]
155 nextPeriods = take depth periods
156 --------------------------------------
157
158
159 findBestCandidates' :: Proximity -> [PhyloGroup] -> PhyloGroup -> Phylo -> [Pointer]
160 findBestCandidates' proximity candidates g1 phylo = pointers
161 where
162 --------------------------------------
163 pointers :: [(PhyloGroupId, Double)]
164 pointers = reverse $ sortOn snd $ filter (\(_,score) -> case proximity of
165 WeightedLogJaccard (WLJParams thr _) -> score >= (thr - 0.1)
166 Hamming (HammingParams thr) -> score <= thr
167 _ -> panic "[ERR][Viz.Phylo.LinkMaker.findBestCandidates'] Unknown proximity"
168 ) similarities
169 --------------------------------------
170 similarities :: [(PhyloGroupId, Double)]
171 similarities = concat $ map (\(g2,g3) -> let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] phylo
172 cooc' = unionWith (+) (getGroupCooc g2) (getGroupCooc g3)
173 ngrams' = union (getGroupNgrams g2) (getGroupNgrams g3)
174 score = processProximity proximity nbDocs cooc cooc' ngrams ngrams'
175 in nub $ [(getGroupId g2,score),(getGroupId g3,score)]) pairsOfCandidates
176 --------------------------------------
177 pairsOfCandidates :: [(PhyloGroup,PhyloGroup)]
178 pairsOfCandidates = listToFullCombi candidates
179 --------------------------------------
180 --------------------------------------
181 cooc :: Map (Int,Int) Double
182 cooc = getGroupCooc g1
183 --------------------------------------
184 ngrams :: [Int]
185 ngrams = getGroupNgrams g1
186 --------------------------------------
187
188
189 -- | To add some Pointer to a PhyloGroup
190 addPointers' :: Filiation -> [Pointer] -> PhyloGroup -> PhyloGroup
191 addPointers' fil pts g = g & case fil of
192 Descendant -> phylo_groupPeriodChilds %~ (++ pts)
193 Ascendant -> phylo_groupPeriodParents %~ (++ pts)
194 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.addPointers] Wrong type of filiation")
195
196
197
198 -- | To update a list of phyloGroups with some Pointers
199 updateGroups :: Filiation -> Level -> Map PhyloGroupId [Pointer] -> Phylo -> Phylo
200 updateGroups fil lvl m p = alterPhyloGroups (\gs -> map (\g -> if ((getGroupLevel g) == lvl) && (member (getGroupId g) m)
201 then addPointers' fil (m ! (getGroupId g)) g
202 else g ) gs) p
203
204
205
206 -- | Optimisation : to keep only the groups that have at least one ngrams in commons with the target
207 filterCandidates :: PhyloGroup -> [PhyloGroup] -> [PhyloGroup]
208 filterCandidates g gs = filter (\g' -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
209 $ delete g gs
210
211
212 -- | a init avec la [[head groups]] et la tail groups
213 toBranches :: [[PhyloGroup]] -> [PhyloGroup] -> [[PhyloGroup]]
214 toBranches mem gs
215 | null gs = mem
216 | otherwise = toBranches mem' $ tail gs
217 where
218 --------------------------------------
219 mem' :: [[PhyloGroup]]
220 mem' = if (null withHead)
221 then mem ++ [[head' "toBranches" gs]]
222 else (filter (\gs' -> not $ elem gs' withHead) mem)
223 ++
224 [(concat withHead) ++ [head' "toBranches" gs]]
225 --------------------------------------
226 withHead :: [[PhyloGroup]]
227 withHead = filter (\gs' -> (not . null)
228 $ intersect (concat $ map getGroupNgrams gs')
229 (getGroupNgrams $ (head' "toBranches" gs))
230 ) mem
231 --------------------------------------
232
233
234 -- | To process an intertemporal matching task to a Phylo at a given level
235 -- | 1) split all groups (of the level) in branches (ie:related components sharing at least one ngram)
236 -- | 2) for each branch, for each group find the best candidates (by Filiation and Proximity) and create the corresponding pointers
237 -- | 3) update all the groups with the new pointers if they exist
238 interTempoMatching :: Filiation -> Level -> Proximity -> Phylo -> Phylo
239 interTempoMatching fil lvl prox p = traceMatching fil lvl (getThreshold prox) debug $ updateGroups fil lvl pointersMap p
240 where
241 --------------------------------------
242 debug :: [Double]
243 debug = sort $ concat $ map (snd . snd) pointers
244 --------------------------------------
245 pointersMap :: Map PhyloGroupId [Pointer]
246 pointersMap = Map.fromList $ map (\(id,x) -> (id,fst x)) pointers
247 --------------------------------------
248 pointers :: [(PhyloGroupId,([Pointer],[Double]))]
249 pointers = concat
250 $ map (\branche ->
251 map (\g -> ( getGroupId g
252 , findBestCandidates fil 1 (getPhyloMatchingFrame p) prox (getNextPeriods fil (getGroupPeriod g) (getPhyloPeriods p)) (filterCandidates g branche) g p )
253 ) branche ) branches
254 --------------------------------------
255 branches :: [[PhyloGroup]]
256 branches = tracePreBranches
257 $ toBranches [[head' "interTempoMatching" (getGroupsWithLevel lvl p)]]
258 $ tail (getGroupsWithLevel lvl p)
259 --------------------------------------
260
261
262 ------------------------------------------------------------------------
263 -- | Make links from Period to Period after level 1
264
265 toLevelUp :: [Pointer] -> Phylo -> [Pointer]
266 toLevelUp lst p = Map.toList
267 $ map (\ws -> maximum ws)
268 $ fromListWith (++) [(id, [w]) | (id, w) <- pointers]
269 where
270 --------------------------------------
271 pointers :: [Pointer]
272 pointers = map (\(id,v) -> (getGroupLevelParentId $ getGroupFromId id p, v)) lst
273 --------------------------------------
274
275
276 -- | Transpose the parent/child pointers from one level to another
277 transposePeriodLinks :: Level -> Phylo -> Phylo
278 transposePeriodLinks lvl p = alterGroupWithLevel
279 (\g ->
280 --------------------------------------
281 let childs = getGroupsFromIds (map fst $ getGroupLevelChilds g) p
282 ascLink = toLevelUp (concat $ map getGroupPeriodParents childs) p
283 desLink = toLevelUp (concat $ map getGroupPeriodChilds childs) p
284 --------------------------------------
285 in g & phylo_groupPeriodParents %~ (++ ascLink)
286 & phylo_groupPeriodChilds %~ (++ desLink)
287 --------------------------------------
288 ) lvl p
289
290
291 ----------------
292 -- | Tracer | --
293 ----------------
294
295
296 traceMatching :: Filiation -> Level -> Double -> [Double] -> Phylo -> Phylo
297 traceMatching fil lvl thr lst p = trace ( "----\n" <> show (fil) <> " unfiltered temporal Matching in Phylo" <> show (lvl) <> " :\n"
298 <> "count : " <> show (length lst) <> " potential pointers (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
299 <> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
300 <> show (percentile 50 (VS.fromList lst)) <> " (50%) "
301 <> show (percentile 75 (VS.fromList lst)) <> " (75%) "
302 <> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") p
303
304 tracePreBranches :: [[PhyloGroup]] -> [[PhyloGroup]]
305 tracePreBranches bs = trace (show (length bs) <> " pre-branches" <> "\n"
306 <> "with sizes : " <> show (map length bs) <> "\n") bs
307