2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
17 module Gargantext.Viz.Phylo.LinkMaker
20 import Control.Lens hiding (both, Level)
21 import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, sort, delete, intersect, nub, groupBy, union)
22 import Data.Tuple.Extra
23 import Data.Map (Map,(!),fromListWith,elems,restrictKeys,unionWith,member)
24 import Gargantext.Prelude
25 import Gargantext.Viz.Phylo
26 import Gargantext.Viz.Phylo.Tools
27 import Gargantext.Viz.Phylo.Metrics.Proximity
28 import qualified Data.List as List
29 import qualified Data.Maybe as Maybe
30 import qualified Data.Map as Map
32 import qualified Data.Vector.Storable as VS
33 import Debug.Trace (trace)
34 import Numeric.Statistics (percentile)
36 -----------------------------
37 -- | From Level to level | --
38 -----------------------------
41 -- | To choose a LevelLink strategy based an a given Level
42 shouldLink :: (Level,Level) -> PhyloGroup -> PhyloGroup -> Bool
43 shouldLink (lvl,lvl') g g'
44 | (lvl <= 1) && (lvl' <= 1) = doesContainsOrd (getGroupNgrams g) (getGroupNgrams g')
45 | otherwise = elem (getGroupId g) (getGroupLevelChildsId g')
48 -- | To set the LevelLinks between a given PhyloGroup and a list of childs/parents PhyloGroups
49 linkGroupToGroups :: (Level,Level) -> PhyloGroup -> [PhyloGroup] -> PhyloGroup
50 linkGroupToGroups (lvl,lvl') current targets
51 | lvl < lvl' = setLevelParents current
52 | lvl > lvl' = setLevelChilds current
55 --------------------------------------
56 setLevelChilds :: PhyloGroup -> PhyloGroup
57 setLevelChilds = over (phylo_groupLevelChilds) addPointers
58 --------------------------------------
59 setLevelParents :: PhyloGroup -> PhyloGroup
60 setLevelParents = over (phylo_groupLevelParents) addPointers
61 --------------------------------------
62 addPointers :: [Pointer] -> [Pointer]
63 addPointers lp = lp ++ Maybe.mapMaybe (\target ->
64 if shouldLink (lvl,lvl') current target
65 then Just ((getGroupId target),1)
67 --------------------------------------
70 -- | To set the LevelLink of all the PhyloGroups of a Phylo
71 setLevelLinks :: (Level,Level) -> Phylo -> Phylo
72 setLevelLinks (lvl,lvl') p = alterPhyloGroups (\groups ->
73 map (\group -> if getGroupLevel group == lvl
74 then linkGroupToGroups (lvl,lvl') group
75 $ filterCandidates group
76 $ getGroupsWithFilters lvl' (getGroupPeriod group) p
80 -------------------------------
81 -- | From Period to Period | --
82 -------------------------------
85 -- | To get the next or previous PhyloPeriod based on a given PhyloPeriodId
86 getNextPeriods :: Filiation -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
87 getNextPeriods to' id l = case to' of
88 Descendant -> (tail . snd) next
89 Ascendant -> (reverse . fst) next
90 _ -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] Filiation type not defined")
92 --------------------------------------
93 next :: ([PhyloPeriodId], [PhyloPeriodId])
95 --------------------------------------
97 idx = case (List.elemIndex id l) of
98 Nothing -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] PhyloPeriodId not defined")
100 --------------------------------------
103 -- | To get the number of docs produced during a list of periods
104 periodsToNbDocs :: [PhyloPeriodId] -> Phylo -> Double
105 periodsToNbDocs prds phylo = sum $ elems
106 $ restrictKeys (phylo ^. phylo_docsByYears)
107 $ periodsToYears prds
110 -- | To process a given Proximity
111 processProximity :: Proximity -> Double -> Map (Int, Int) Double -> Map (Int, Int) Double -> [Int] -> [Int] -> Double
112 processProximity proximity nbDocs cooc cooc' ngrams ngrams' = case proximity of
113 WeightedLogJaccard (WLJParams _ sens) -> weightedLogJaccard sens nbDocs cooc cooc' ngrams ngrams'
114 Hamming (HammingParams _) -> hamming cooc cooc'
115 _ -> panic "[ERR][Viz.Phylo.LinkMaker.processProximity] Unknown proximity"
119 -- | Find the best candidates to be time-linked with a group g1 (recursively until the limit of periods is reached)
120 -- | 1) find the next periods and get the mini cooc matrix of g1
121 -- | 2) build the pairs of candidates (single groups or tuples)
122 -- | 3) process the proximity mesure and select the best ones to create the pointers (ie: all the max)
123 findBestCandidates :: Filiation -> Int -> Int -> Proximity -> [(Date,Date)] -> [PhyloGroup] -> PhyloGroup -> Phylo -> ([Pointer],[Double])
124 findBestCandidates filiation depth limit proximity periods candidates g1 phylo
125 | depth > limit || null nextPeriods = ([],[])
126 | (not . null) pointers = (head' "findBestCandidates" $ groupBy (\x y -> snd x == snd y) pointers
127 ,map snd similarities)
128 | otherwise = findBestCandidates filiation (depth + 1) limit proximity periods candidates g1 phylo
130 --------------------------------------
131 pointers :: [(PhyloGroupId, Double)]
132 pointers = reverse $ sortOn snd $ filter (\(_,score) -> case proximity of
133 WeightedLogJaccard (WLJParams thr _) -> score >= thr
134 Hamming (HammingParams thr) -> score <= thr
135 _ -> panic "[ERR][Viz.Phylo.LinkMaker.findBestCandidates] Unknown proximity"
137 --------------------------------------
138 similarities :: [(PhyloGroupId, Double)]
139 similarities = concat $ map (\(g2,g3) -> let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] phylo
140 cooc' = unionWith (+) (getGroupCooc g2) (getGroupCooc g3)
141 ngrams' = union (getGroupNgrams g2) (getGroupNgrams g3)
142 score = processProximity proximity nbDocs cooc cooc' ngrams ngrams'
143 in nub $ [(getGroupId g2,score),(getGroupId g3,score)]) pairsOfCandidates
144 --------------------------------------
145 pairsOfCandidates :: [(PhyloGroup,PhyloGroup)]
146 pairsOfCandidates = listToFullCombi $ filter (\g -> elem (getGroupPeriod g) nextPeriods) candidates
147 --------------------------------------
148 cooc :: Map (Int,Int) Double
149 cooc = getGroupCooc g1
150 --------------------------------------
152 ngrams = getGroupNgrams g1
153 --------------------------------------
154 nextPeriods :: [(Date,Date)]
155 nextPeriods = take depth periods
156 --------------------------------------
159 findBestCandidates' :: Proximity -> [PhyloGroup] -> PhyloGroup -> Phylo -> [Pointer]
160 findBestCandidates' proximity candidates g1 phylo = pointers
162 --------------------------------------
163 pointers :: [(PhyloGroupId, Double)]
164 pointers = reverse $ sortOn snd $ filter (\(_,score) -> case proximity of
165 WeightedLogJaccard (WLJParams thr _) -> score >= (thr - 0.1)
166 Hamming (HammingParams thr) -> score <= thr
167 _ -> panic "[ERR][Viz.Phylo.LinkMaker.findBestCandidates'] Unknown proximity"
169 --------------------------------------
170 similarities :: [(PhyloGroupId, Double)]
171 similarities = concat $ map (\(g2,g3) -> let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] phylo
172 cooc' = unionWith (+) (getGroupCooc g2) (getGroupCooc g3)
173 ngrams' = union (getGroupNgrams g2) (getGroupNgrams g3)
174 score = processProximity proximity nbDocs cooc cooc' ngrams ngrams'
175 in nub $ [(getGroupId g2,score),(getGroupId g3,score)]) pairsOfCandidates
176 --------------------------------------
177 pairsOfCandidates :: [(PhyloGroup,PhyloGroup)]
178 pairsOfCandidates = listToFullCombi candidates
179 --------------------------------------
180 --------------------------------------
181 cooc :: Map (Int,Int) Double
182 cooc = getGroupCooc g1
183 --------------------------------------
185 ngrams = getGroupNgrams g1
186 --------------------------------------
189 -- | To add some Pointer to a PhyloGroup
190 addPointers' :: Filiation -> [Pointer] -> PhyloGroup -> PhyloGroup
191 addPointers' fil pts g = g & case fil of
192 Descendant -> phylo_groupPeriodChilds %~ (++ pts)
193 Ascendant -> phylo_groupPeriodParents %~ (++ pts)
194 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.addPointers] Wrong type of filiation")
198 -- | To update a list of phyloGroups with some Pointers
199 updateGroups :: Filiation -> Level -> Map PhyloGroupId [Pointer] -> Phylo -> Phylo
200 updateGroups fil lvl m p = alterPhyloGroups (\gs -> map (\g -> if ((getGroupLevel g) == lvl) && (member (getGroupId g) m)
201 then addPointers' fil (m ! (getGroupId g)) g
206 -- | Optimisation : to keep only the groups that have at least one ngrams in commons with the target
207 filterCandidates :: PhyloGroup -> [PhyloGroup] -> [PhyloGroup]
208 filterCandidates g gs = filter (\g' -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
212 -- | a init avec la [[head groups]] et la tail groups
213 toBranches :: [[PhyloGroup]] -> [PhyloGroup] -> [[PhyloGroup]]
216 | otherwise = toBranches mem' $ tail gs
218 --------------------------------------
219 mem' :: [[PhyloGroup]]
220 mem' = if (null withHead)
221 then mem ++ [[head' "toBranches" gs]]
222 else (filter (\gs' -> not $ elem gs' withHead) mem)
224 [(concat withHead) ++ [head' "toBranches" gs]]
225 --------------------------------------
226 withHead :: [[PhyloGroup]]
227 withHead = filter (\gs' -> (not . null)
228 $ intersect (concat $ map getGroupNgrams gs')
229 (getGroupNgrams $ (head' "toBranches" gs))
231 --------------------------------------
234 -- | To process an intertemporal matching task to a Phylo at a given level
235 -- | 1) split all groups (of the level) in branches (ie:related components sharing at least one ngram)
236 -- | 2) for each branch, for each group find the best candidates (by Filiation and Proximity) and create the corresponding pointers
237 -- | 3) update all the groups with the new pointers if they exist
238 interTempoMatching :: Filiation -> Level -> Proximity -> Phylo -> Phylo
239 interTempoMatching fil lvl prox p = traceMatching fil lvl (getThreshold prox) debug $ updateGroups fil lvl pointersMap p
241 --------------------------------------
243 debug = sort $ concat $ map (snd . snd) pointers
244 --------------------------------------
245 pointersMap :: Map PhyloGroupId [Pointer]
246 pointersMap = Map.fromList $ map (\(id,x) -> (id,fst x)) pointers
247 --------------------------------------
248 pointers :: [(PhyloGroupId,([Pointer],[Double]))]
251 map (\g -> ( getGroupId g
252 , findBestCandidates fil 1 (getPhyloMatchingFrame p) prox (getNextPeriods fil (getGroupPeriod g) (getPhyloPeriods p)) (filterCandidates g branche) g p )
254 --------------------------------------
255 branches :: [[PhyloGroup]]
256 branches = tracePreBranches
257 $ toBranches [[head' "interTempoMatching" (getGroupsWithLevel lvl p)]]
258 $ tail (getGroupsWithLevel lvl p)
259 --------------------------------------
262 ------------------------------------------------------------------------
263 -- | Make links from Period to Period after level 1
265 toLevelUp :: [Pointer] -> Phylo -> [Pointer]
266 toLevelUp lst p = Map.toList
267 $ map (\ws -> maximum ws)
268 $ fromListWith (++) [(id, [w]) | (id, w) <- pointers]
270 --------------------------------------
271 pointers :: [Pointer]
272 pointers = map (\(id,v) -> (getGroupLevelParentId $ getGroupFromId id p, v)) lst
273 --------------------------------------
276 -- | Transpose the parent/child pointers from one level to another
277 transposePeriodLinks :: Level -> Phylo -> Phylo
278 transposePeriodLinks lvl p = alterGroupWithLevel
280 --------------------------------------
281 let childs = getGroupsFromIds (map fst $ getGroupLevelChilds g) p
282 ascLink = toLevelUp (concat $ map getGroupPeriodParents childs) p
283 desLink = toLevelUp (concat $ map getGroupPeriodChilds childs) p
284 --------------------------------------
285 in g & phylo_groupPeriodParents %~ (++ ascLink)
286 & phylo_groupPeriodChilds %~ (++ desLink)
287 --------------------------------------
296 traceMatching :: Filiation -> Level -> Double -> [Double] -> Phylo -> Phylo
297 traceMatching fil lvl thr lst p = trace ( "----\n" <> show (fil) <> " unfiltered temporal Matching in Phylo" <> show (lvl) <> " :\n"
298 <> "count : " <> show (length lst) <> " potential pointers (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
299 <> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
300 <> show (percentile 50 (VS.fromList lst)) <> " (50%) "
301 <> show (percentile 75 (VS.fromList lst)) <> " (75%) "
302 <> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") p
304 tracePreBranches :: [[PhyloGroup]] -> [[PhyloGroup]]
305 tracePreBranches bs = trace (show (length bs) <> " pre-branches" <> "\n"
306 <> "with sizes : " <> show (map length bs) <> "\n") bs