2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
17 module Gargantext.Viz.Phylo.LinkMaker
20 import Control.Lens hiding (both, Level)
21 import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, sort, delete, intersect, nub, groupBy)
22 import Data.Tuple.Extra
23 import Data.Map (Map,(!),fromListWith,elems,restrictKeys,unionWith,member)
24 import Gargantext.Prelude
25 import Gargantext.Viz.Phylo
26 import Gargantext.Viz.Phylo.Tools
27 import Gargantext.Viz.Phylo.Metrics.Proximity
28 import qualified Data.List as List
29 import qualified Data.Maybe as Maybe
30 import qualified Data.Map as Map
32 import qualified Data.Vector.Storable as VS
33 import Debug.Trace (trace)
34 import Numeric.Statistics (percentile)
36 -----------------------------
37 -- | From Level to level | --
38 -----------------------------
41 -- | To choose a LevelLink strategy based an a given Level
42 shouldLink :: (Level,Level) -> PhyloGroup -> PhyloGroup -> Bool
43 shouldLink (lvl,lvl') g g'
44 | (lvl <= 1) && (lvl' <= 1) = doesContainsOrd (getGroupNgrams g) (getGroupNgrams g')
45 | otherwise = elem (getGroupId g) (getGroupLevelChildsId g')
48 -- | To set the LevelLinks between a given PhyloGroup and a list of childs/parents PhyloGroups
49 linkGroupToGroups :: (Level,Level) -> PhyloGroup -> [PhyloGroup] -> PhyloGroup
50 linkGroupToGroups (lvl,lvl') current targets
51 | lvl < lvl' = setLevelParents current
52 | lvl > lvl' = setLevelChilds current
55 --------------------------------------
56 setLevelChilds :: PhyloGroup -> PhyloGroup
57 setLevelChilds = over (phylo_groupLevelChilds) addPointers
58 --------------------------------------
59 setLevelParents :: PhyloGroup -> PhyloGroup
60 setLevelParents = over (phylo_groupLevelParents) addPointers
61 --------------------------------------
62 addPointers :: [Pointer] -> [Pointer]
63 addPointers lp = lp ++ Maybe.mapMaybe (\target ->
64 if shouldLink (lvl,lvl') current target
65 then Just ((getGroupId target),1)
67 --------------------------------------
70 -- | To set the LevelLink of all the PhyloGroups of a Phylo
71 setLevelLinks :: (Level,Level) -> Phylo -> Phylo
72 setLevelLinks (lvl,lvl') p = alterPhyloGroups (\groups ->
73 map (\group -> if getGroupLevel group == lvl
74 then linkGroupToGroups (lvl,lvl') group
75 $ filterCandidates group
76 $ getGroupsWithFilters lvl' (getGroupPeriod group) p
80 -------------------------------
81 -- | From Period to Period | --
82 -------------------------------
85 -- | To get the next or previous PhyloPeriod based on a given PhyloPeriodId
86 getNextPeriods :: Filiation -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
87 getNextPeriods to' id l = case to' of
88 Descendant -> (tail . snd) next
89 Ascendant -> (reverse . fst) next
90 _ -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] Filiation type not defined")
92 --------------------------------------
93 next :: ([PhyloPeriodId], [PhyloPeriodId])
95 --------------------------------------
97 idx = case (List.elemIndex id l) of
98 Nothing -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] PhyloPeriodId not defined")
100 --------------------------------------
103 -- | To get the number of docs produced during a list of periods
104 periodsToNbDocs :: [PhyloPeriodId] -> Phylo -> Double
105 periodsToNbDocs prds phylo = sum $ elems
106 $ restrictKeys (phylo ^. phylo_docsByYears)
107 $ periodsToYears prds
110 -- | To process a given Proximity
111 processProximity :: Proximity -> Map (Int, Int) Double -> Map (Int, Int) Double -> Double -> Double
112 processProximity proximity cooc cooc' nbDocs = case proximity of
113 WeightedLogJaccard (WLJParams _ sens) -> weightedLogJaccard sens cooc cooc' nbDocs
114 Hamming (HammingParams _) -> hamming cooc cooc'
115 _ -> panic "[ERR][Viz.Phylo.LinkMaker.processProximity] Unknown proximity"
119 -- | Find the best candidates to be time-linked with a group g1 (recursively until the limit of periods is reached)
120 -- | 1) find the next periods and get the mini cooc matrix of g1
121 -- | 2) build the pairs of candidates (single groups or tuples)
122 -- | 3) process the proximity mesure and select the best ones to create the pointers (ie: all the max)
123 findBestCandidates :: Filiation -> Int -> Int -> Proximity -> [(Date,Date)] -> [PhyloGroup] -> PhyloGroup -> Phylo -> ([Pointer],[Double])
124 findBestCandidates filiation depth limit proximity periods candidates g1 phylo
125 | depth > limit || null nextPeriods = ([],[])
126 | (not . null) pointers = (head' "findBestCandidates" $ groupBy (\x y -> snd x == snd y) pointers
127 ,map snd similarities)
128 | otherwise = findBestCandidates filiation (depth + 1) limit proximity periods candidates g1 phylo
130 --------------------------------------
131 pointers :: [(PhyloGroupId, Double)]
132 pointers = reverse $ sortOn snd $ filter (\(_,score) -> case proximity of
133 WeightedLogJaccard (WLJParams thr _) -> score >= thr
134 Hamming (HammingParams thr) -> score <= thr
135 _ -> panic "[ERR][Viz.Phylo.LinkMaker.findBestCandidates] Unknown proximity"
137 --------------------------------------
138 similarities :: [(PhyloGroupId, Double)]
139 similarities = concat $ map (\(g2,g3) -> let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] phylo
140 cooc2 = getGroupCooc g2
141 cooc3 = getGroupCooc g3
142 score = processProximity proximity cooc1 (unionWith (+) cooc2 cooc3) nbDocs
143 in nub $ [(getGroupId g2,score),(getGroupId g3,score)]) pairsOfCandidates
144 --------------------------------------
145 pairsOfCandidates :: [(PhyloGroup,PhyloGroup)]
146 pairsOfCandidates = listToFullCombi $ filter (\g -> elem (getGroupPeriod g) nextPeriods) candidates
147 --------------------------------------
148 cooc1 :: Map (Int,Int) Double
149 cooc1 = getGroupCooc g1
150 --------------------------------------
151 nextPeriods :: [(Date,Date)]
152 nextPeriods = take depth periods
153 --------------------------------------
156 findBestCandidates' :: Proximity -> [PhyloGroup] -> PhyloGroup -> Phylo -> [Pointer]
157 findBestCandidates' proximity candidates g1 phylo = pointers
159 --------------------------------------
160 pointers :: [(PhyloGroupId, Double)]
161 pointers = reverse $ sortOn snd $ filter (\(_,score) -> case proximity of
162 WeightedLogJaccard (WLJParams thr _) -> score >= thr
163 Hamming (HammingParams thr) -> score <= thr
164 _ -> panic "[ERR][Viz.Phylo.LinkMaker.findBestCandidates'] Unknown proximity"
166 --------------------------------------
167 similarities :: [(PhyloGroupId, Double)]
168 similarities = concat $ map (\(g2,g3) -> let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] phylo
169 cooc2 = getGroupCooc g2
170 cooc3 = getGroupCooc g3
171 score = processProximity proximity cooc1 (unionWith (+) cooc2 cooc3) nbDocs
172 in nub $ [(getGroupId g2,score),(getGroupId g3,score)]) pairsOfCandidates
173 --------------------------------------
174 pairsOfCandidates :: [(PhyloGroup,PhyloGroup)]
175 pairsOfCandidates = listToFullCombi candidates
176 --------------------------------------
177 cooc1 :: Map (Int,Int) Double
178 cooc1 = getGroupCooc g1
179 --------------------------------------
182 -- | To add some Pointer to a PhyloGroup
183 addPointers' :: Filiation -> [Pointer] -> PhyloGroup -> PhyloGroup
184 addPointers' fil pts g = g & case fil of
185 Descendant -> phylo_groupPeriodChilds %~ (++ pts)
186 Ascendant -> phylo_groupPeriodParents %~ (++ pts)
187 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.addPointers] Wrong type of filiation")
191 -- | To update a list of phyloGroups with some Pointers
192 updateGroups :: Filiation -> Level -> Map PhyloGroupId [Pointer] -> Phylo -> Phylo
193 updateGroups fil lvl m p = alterPhyloGroups (\gs -> map (\g -> if ((getGroupLevel g) == lvl) && (member (getGroupId g) m)
194 then addPointers' fil (m ! (getGroupId g)) g
199 -- | Optimisation : to keep only the groups that have at least one ngrams in commons with the target
200 filterCandidates :: PhyloGroup -> [PhyloGroup] -> [PhyloGroup]
201 filterCandidates g gs = filter (\g' -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
205 -- | a init avec la [[head groups]] et la tail groups
206 toBranches :: [[PhyloGroup]] -> [PhyloGroup] -> [[PhyloGroup]]
209 | otherwise = toBranches mem' $ tail gs
211 --------------------------------------
212 mem' :: [[PhyloGroup]]
213 mem' = if (null withHead)
214 then mem ++ [[head' "toBranches" gs]]
215 else (filter (\gs' -> not $ elem gs' withHead) mem)
217 [(concat withHead) ++ [head' "toBranches" gs]]
218 --------------------------------------
219 withHead :: [[PhyloGroup]]
220 withHead = filter (\gs' -> (not . null)
221 $ intersect (concat $ map getGroupNgrams gs')
222 (getGroupNgrams $ (head' "toBranches" gs))
224 --------------------------------------
227 -- | To process an intertemporal matching task to a Phylo at a given level
228 -- | 1) split all groups (of the level) in branches (ie:related components sharing at least one ngram)
229 -- | 2) for each branch, for each group find the best candidates (by Filiation and Proximity) and create the corresponding pointers
230 -- | 3) update all the groups with the new pointers if they exist
231 interTempoMatching :: Filiation -> Level -> Proximity -> Phylo -> Phylo
232 interTempoMatching fil lvl prox p = traceMatching fil lvl (getThreshold prox) debug $ updateGroups fil lvl pointersMap p
234 --------------------------------------
236 debug = sort $ concat $ map (snd . snd) pointers
237 --------------------------------------
238 pointersMap :: Map PhyloGroupId [Pointer]
239 pointersMap = Map.fromList $ map (\(id,x) -> (id,fst x)) pointers
240 --------------------------------------
241 pointers :: [(PhyloGroupId,([Pointer],[Double]))]
244 map (\g -> ( getGroupId g
245 , findBestCandidates fil 1 (getPhyloMatchingFrame p) prox (getNextPeriods fil (getGroupPeriod g) (getPhyloPeriods p)) (filterCandidates g branche) g p )
247 --------------------------------------
248 branches :: [[PhyloGroup]]
249 branches = tracePreBranches
250 $ toBranches [[head' "interTempoMatching" (getGroupsWithLevel lvl p)]]
251 $ tail (getGroupsWithLevel lvl p)
252 --------------------------------------
255 ------------------------------------------------------------------------
256 -- | Make links from Period to Period after level 1
258 toLevelUp :: [Pointer] -> Phylo -> [Pointer]
259 toLevelUp lst p = Map.toList
260 $ map (\ws -> maximum ws)
261 $ fromListWith (++) [(id, [w]) | (id, w) <- pointers]
263 --------------------------------------
264 pointers :: [Pointer]
265 pointers = map (\(id,v) -> (getGroupLevelParentId $ getGroupFromId id p, v)) lst
266 --------------------------------------
269 -- | Transpose the parent/child pointers from one level to another
270 transposePeriodLinks :: Level -> Phylo -> Phylo
271 transposePeriodLinks lvl p = alterGroupWithLevel
273 --------------------------------------
274 let childs = getGroupsFromIds (map fst $ getGroupLevelChilds g) p
275 ascLink = toLevelUp (concat $ map getGroupPeriodParents childs) p
276 desLink = toLevelUp (concat $ map getGroupPeriodChilds childs) p
277 --------------------------------------
278 in g & phylo_groupPeriodParents %~ (++ ascLink)
279 & phylo_groupPeriodChilds %~ (++ desLink)
280 --------------------------------------
289 traceMatching :: Filiation -> Level -> Double -> [Double] -> Phylo -> Phylo
290 traceMatching fil lvl thr lst p = trace ( "----\n" <> show (fil) <> " unfiltered temporal Matching in Phylo" <> show (lvl) <> " :\n"
291 <> "count : " <> show (length lst) <> " potential pointers (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
292 <> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
293 <> show (percentile 50 (VS.fromList lst)) <> " (50%) "
294 <> show (percentile 75 (VS.fromList lst)) <> " (75%) "
295 <> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") p
297 tracePreBranches :: [[PhyloGroup]] -> [[PhyloGroup]]
298 tracePreBranches bs = trace (show (length bs) <> " pre-branches" <> "\n"
299 <> "with sizes : " <> show (map length bs) <> "\n") bs