2 Module : Gargantext.Viz.Phylo.Tools
3 Description : Phylomemy Tools to build/manage it
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
13 {-# LANGUAGE NoImplicitPrelude #-}
14 {-# LANGUAGE FlexibleContexts #-}
15 {-# LANGUAGE OverloadedStrings #-}
17 module Gargantext.Viz.Phylo.LinkMaker
20 import Control.Lens hiding (both, Level)
21 import Data.List ((++), sortOn, null, tail, splitAt, elem, concat, sort, delete, intersect, nub, groupBy)
22 import Data.Tuple.Extra
23 import Data.Map (Map,(!),fromListWith,elems,restrictKeys,unionWith)
24 import Gargantext.Prelude
25 import Gargantext.Viz.Phylo
26 import Gargantext.Viz.Phylo.Tools
27 import Gargantext.Viz.Phylo.Metrics.Proximity
28 import qualified Data.List as List
29 import qualified Data.Maybe as Maybe
30 import qualified Data.Map as Map
32 import qualified Data.Vector.Storable as VS
33 import Debug.Trace (trace)
34 import Numeric.Statistics (percentile)
36 -----------------------------
37 -- | From Level to level | --
38 -----------------------------
41 -- | To choose a LevelLink strategy based an a given Level
42 shouldLink :: (Level,Level) -> PhyloGroup -> PhyloGroup -> Bool
43 shouldLink (lvl,_lvl) g g'
44 | lvl <= 1 = doesContainsOrd (getGroupNgrams g) (getGroupNgrams g')
45 | lvl > 1 = elem (getGroupId g) (getGroupLevelChildsId g')
46 | otherwise = panic ("[ERR][Viz.Phylo.LinkMaker.shouldLink] Level not defined")
49 -- | To set the LevelLinks between a given PhyloGroup and a list of childs/parents PhyloGroups
50 linkGroupToGroups :: (Level,Level) -> PhyloGroup -> [PhyloGroup] -> PhyloGroup
51 linkGroupToGroups (lvl,lvl') current targets
52 | lvl < lvl' = setLevelParents current
53 | lvl > lvl' = setLevelChilds current
56 --------------------------------------
57 setLevelChilds :: PhyloGroup -> PhyloGroup
58 setLevelChilds = over (phylo_groupLevelChilds) addPointers
59 --------------------------------------
60 setLevelParents :: PhyloGroup -> PhyloGroup
61 setLevelParents = over (phylo_groupLevelParents) addPointers
62 --------------------------------------
63 addPointers :: [Pointer] -> [Pointer]
64 addPointers lp = lp ++ Maybe.mapMaybe (\target ->
65 if shouldLink (lvl,lvl') current target
66 then Just ((getGroupId target),1)
68 --------------------------------------
71 -- | To set the LevelLink of all the PhyloGroups of a Phylo
72 setLevelLinks :: (Level,Level) -> Phylo -> Phylo
73 setLevelLinks (lvl,lvl') p = alterPhyloGroups (\gs -> map (\g -> if getGroupLevel g == lvl
74 then linkGroupToGroups (lvl,lvl') g (filterCandidates g
75 $ filter (\g' -> getGroupPeriod g' == getGroupPeriod g) gs')
78 --------------------------------------
80 gs' = getGroupsWithLevel lvl' p
81 --------------------------------------
84 -------------------------------
85 -- | From Period to Period | --
86 -------------------------------
89 -- | To get the next or previous PhyloPeriod based on a given PhyloPeriodId
90 getNextPeriods :: Filiation -> PhyloPeriodId -> [PhyloPeriodId] -> [PhyloPeriodId]
91 getNextPeriods to' id l = case to' of
92 Descendant -> (tail . snd) next
93 Ascendant -> (reverse . fst) next
94 _ -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] Filiation type not defined")
96 --------------------------------------
97 next :: ([PhyloPeriodId], [PhyloPeriodId])
99 --------------------------------------
101 idx = case (List.elemIndex id l) of
102 Nothing -> panic ("[ERR][Viz.Phylo.Example.getNextPeriods] PhyloPeriodId not defined")
104 --------------------------------------
107 -- | To get the number of docs produced during a list of periods
108 periodsToNbDocs :: [PhyloPeriodId] -> Phylo -> Double
109 periodsToNbDocs prds phylo = sum $ elems
110 $ restrictKeys (phylo ^. phylo_docsByYears)
111 $ periodsToYears prds
114 -- | To process a given Proximity
115 processProximity :: Proximity -> Map (Int, Int) Double -> Map (Int, Int) Double -> Double -> Double
116 processProximity proximity cooc cooc' nbDocs = case proximity of
117 WeightedLogJaccard (WLJParams _ sens) -> weightedLogJaccard sens cooc cooc' nbDocs
118 Hamming (HammingParams _) -> hamming cooc cooc'
119 _ -> panic "[ERR][Viz.Phylo.LinkMaker.processProximity] Unknown proximity"
123 -- | Find the best candidates to be time-linked with a group g1 (recursively until the limit of periods is reached)
124 -- | 1) find the next periods and get the mini cooc matrix of g1
125 -- | 2) build the pairs of candidates (single groups or tuples)
126 -- | 3) process the proximity mesure and select the best ones to create the pointers (ie: all the max)
127 findBestCandidates :: Filiation -> Int -> Int -> Proximity -> [(Date,Date)] -> [PhyloGroup] -> PhyloGroup -> Phylo -> ([Pointer],[Double])
128 findBestCandidates filiation depth limit proximity periods candidates g1 phylo
129 | depth > limit || null nextPeriods = ([],[])
130 | (not . null) pointers = (head' "findBestCandidates" $ groupBy (\x y -> snd x == snd y) pointers
131 ,map snd similarities)
132 | otherwise = findBestCandidates filiation (depth + 1) limit proximity periods candidates g1 phylo
134 --------------------------------------
135 pointers :: [(PhyloGroupId, Double)]
136 pointers = reverse $ sortOn snd $ filter (\(_,score) -> case proximity of
137 WeightedLogJaccard (WLJParams thr _) -> score >= thr
138 Hamming (HammingParams thr) -> score <= thr
139 _ -> panic "[ERR][Viz.Phylo.LinkMaker.findBestCandidates] Unknown proximity"
141 --------------------------------------
142 similarities :: [(PhyloGroupId, Double)]
143 similarities = concat $ map (\(g2,g3) -> let nbDocs = periodsToNbDocs [(getGroupPeriod g1),(getGroupPeriod g2),(getGroupPeriod g3)] phylo
144 cooc2 = getGroupCooc g2
145 cooc3 = getGroupCooc g3
146 score = processProximity proximity cooc1 (unionWith (+) cooc2 cooc3) nbDocs
147 in nub $ [(getGroupId g2,score),(getGroupId g3,score)]) pairsOfCandidates
148 --------------------------------------
149 pairsOfCandidates :: [(PhyloGroup,PhyloGroup)]
150 pairsOfCandidates = listToFullCombi $ filter (\g -> elem (getGroupPeriod g) nextPeriods) candidates
151 --------------------------------------
152 cooc1 :: Map (Int,Int) Double
153 cooc1 = getGroupCooc g1
154 --------------------------------------
155 nextPeriods :: [(Date,Date)]
156 nextPeriods = take depth periods
157 --------------------------------------
160 -- | To add some Pointer to a PhyloGroup
161 addPointers' :: Filiation -> [Pointer] -> PhyloGroup -> PhyloGroup
162 addPointers' fil pts g = g & case fil of
163 Descendant -> phylo_groupPeriodChilds %~ (++ pts)
164 Ascendant -> phylo_groupPeriodParents %~ (++ pts)
165 _ -> panic ("[ERR][Viz.Phylo.LinkMaker.addPointers] Wrong type of filiation")
169 -- | To update a list of phyloGroups with some Pointers
170 updateGroups :: Filiation -> Level -> Map PhyloGroupId [Pointer] -> Phylo -> Phylo
171 updateGroups fil lvl m p = alterPhyloGroups (\gs -> map (\g -> if (getGroupLevel g) == lvl
172 then addPointers' fil (m ! (getGroupId g)) g
177 -- | Optimisation : to keep only the groups that have at least one ngrams in commons with the target
178 filterCandidates :: PhyloGroup -> [PhyloGroup] -> [PhyloGroup]
179 filterCandidates g gs = filter (\g' -> (not . null) $ intersect (getGroupNgrams g) (getGroupNgrams g'))
183 -- | a init avec la [[head groups]] et la tail groups
184 toBranches :: [[PhyloGroup]] -> [PhyloGroup] -> [[PhyloGroup]]
187 | otherwise = toBranches mem' $ tail gs
189 --------------------------------------
190 mem' :: [[PhyloGroup]]
191 mem' = if (null withHead)
192 then mem ++ [[head' "toBranches" gs]]
193 else (filter (\gs' -> not $ elem gs' withHead) mem)
195 [(concat withHead) ++ [head' "toBranches" gs]]
196 --------------------------------------
197 withHead :: [[PhyloGroup]]
198 withHead = filter (\gs' -> (not . null)
199 $ intersect (concat $ map getGroupNgrams gs')
200 (getGroupNgrams $ (head' "toBranches" gs))
202 --------------------------------------
205 -- | To process an intertemporal matching task to a Phylo at a given level
206 -- | 1) split all groups (of the level) in branches (ie:related components sharing at least one ngram)
207 -- | 2) for each branch, for each group find the best candidates (by Filiation and Proximity) and create the corresponding pointers
208 -- | 3) update all the groups with the new pointers if they exist
209 interTempoMatching :: Filiation -> Level -> Proximity -> Phylo -> Phylo
210 interTempoMatching fil lvl prox p = traceMatching fil lvl (getThreshold prox) debug $ updateGroups fil lvl pointersMap p
212 --------------------------------------
214 debug = sort $ concat $ map (snd . snd) pointers
215 --------------------------------------
216 pointersMap :: Map PhyloGroupId [Pointer]
217 pointersMap = Map.fromList $ map (\(id,x) -> (id,fst x)) pointers
218 --------------------------------------
219 pointers :: [(PhyloGroupId,([Pointer],[Double]))]
222 map (\g -> ( getGroupId g
223 , findBestCandidates fil 1 5 prox (getNextPeriods fil (getGroupPeriod g) (getPhyloPeriods p)) (filterCandidates g branche) g p )
225 --------------------------------------
226 branches :: [[PhyloGroup]]
227 branches = tracePreBranches
228 $ toBranches [[head' "interTempoMatching" (getGroupsWithLevel lvl p)]]
229 $ tail (getGroupsWithLevel lvl p)
230 --------------------------------------
233 ------------------------------------------------------------------------
234 -- | Make links from Period to Period after level 1
236 toLevelUp :: [Pointer] -> Phylo -> [Pointer]
237 toLevelUp lst p = Map.toList
238 $ map (\ws -> maximum ws)
239 $ fromListWith (++) [(id, [w]) | (id, w) <- pointers]
241 --------------------------------------
242 pointers :: [Pointer]
243 pointers = map (\(id,v) -> (getGroupLevelParentId $ getGroupFromId id p, v)) lst
244 --------------------------------------
247 -- | Transpose the parent/child pointers from one level to another
248 transposePeriodLinks :: Level -> Phylo -> Phylo
249 transposePeriodLinks lvl p = alterGroupWithLevel
251 --------------------------------------
252 let childs = getGroupsFromIds (map fst $ getGroupLevelChilds g) p
253 ascLink = toLevelUp (concat $ map getGroupPeriodParents childs) p
254 desLink = toLevelUp (concat $ map getGroupPeriodChilds childs) p
255 --------------------------------------
256 in g & phylo_groupPeriodParents %~ (++ ascLink)
257 & phylo_groupPeriodChilds %~ (++ desLink)
258 --------------------------------------
267 traceMatching :: Filiation -> Level -> Double -> [Double] -> Phylo -> Phylo
268 traceMatching fil lvl thr lst p = trace ( "----\n" <> show (fil) <> " unfiltered temporal Matching in Phylo" <> show (lvl) <> " :\n"
269 <> "count : " <> show (length lst) <> " potential pointers (" <> show (length $ filter (>= thr) lst) <> " >= " <> show (thr) <> ")\n"
270 <> "similarity : " <> show (percentile 25 (VS.fromList lst)) <> " (25%) "
271 <> show (percentile 50 (VS.fromList lst)) <> " (50%) "
272 <> show (percentile 75 (VS.fromList lst)) <> " (75%) "
273 <> show (percentile 90 (VS.fromList lst)) <> " (90%)\n") p
275 tracePreBranches :: [[PhyloGroup]] -> [[PhyloGroup]]
276 tracePreBranches bs = trace (show (length bs) <> " pre-branches" <> "\n"
277 <> "with sizes : " <> show (map length bs) <> "\n") bs