2 Module : Gargantext.Core.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.SynchronicClustering where
14 -- import Debug.Trace (trace)
15 import Control.Lens hiding (Level)
16 import Control.Monad (sequence)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, groupBy)
19 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member, unionWith)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloExport (processDynamics)
22 import Gargantext.Core.Viz.Phylo.PhyloTools
23 import Gargantext.Core.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
24 import Gargantext.Prelude
25 -- import Debug.Trace (trace)
26 import qualified Data.Map as Map
29 -------------------------
30 -- | New Level Maker | --
31 -------------------------
33 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
34 mergeGroups coocs id mapIds childs =
35 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
36 counts = foldl (\acc count -> unionWith (+) acc count) empty $ map _phylo_groupRootsCount childs
37 in PhyloGroup (fst $ fst id) (_phylo_groupPeriod' $ head' "mergeGroups" childs)
38 (snd $ fst id) (snd id) ""
39 (sum $ map _phylo_groupSupport childs)
41 $ map _phylo_groupWeight childs)
42 (concat $ map _phylo_groupSources childs)
44 (ngramsToCooc ngrams coocs)
45 (ngramsToDensity ngrams coocs counts)
46 -- todo add density here
48 (mergeMeta bId childs)
50 [] (map (\g -> (getGroupId g, 1)) childs)
51 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
52 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
53 (mergeAncestors $ concat $ map _phylo_groupAncestors childs)
54 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryParents childs)
55 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryChilds childs)
59 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
61 updatePointers :: [Pointer] -> [Pointer]
62 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
63 updatePointers' :: [Pointer'] -> [Pointer']
64 updatePointers' pointers = map (\(pId,(t,w)) -> (mapIds ! pId,(t,w))) pointers
66 mergeAncestors :: [Pointer] -> [Pointer]
67 mergeAncestors pointers = Map.toList $ fromListWith max pointers
69 addPhyloScale :: Scale -> Phylo -> Phylo
70 addPhyloScale lvl phylo =
71 over ( phylo_periods . traverse )
72 (\phyloPrd -> phyloPrd & phylo_periodScales
73 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl)
74 (PhyloScale (phyloPrd ^. phylo_periodPeriod) (phyloPrd ^. phylo_periodPeriodStr) lvl empty))) phylo
77 toNextScale :: Phylo -> [PhyloGroup] -> Phylo
78 toNextScale phylo groups =
79 let curLvl = getLastLevel phylo
80 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
81 newGroups = concat $ groupsToBranches'
82 $ fromList $ map (\g -> (getGroupId g, g))
83 $ foldlWithKey (\acc id groups' ->
84 -- 4) create the parent group
85 let parent = mergeGroups (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
86 in acc ++ [parent]) []
87 -- 3) group the current groups by parentId
88 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
90 newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
92 $ over ( phylo_periods . traverse . phylo_periodScales . traverse
93 -- 6) update each period at curLvl + 1
94 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == (curLvl + 1)))
95 -- 7) by adding the parents
97 if member (phyloLvl ^. phylo_scalePeriod) newPeriods
98 then phyloLvl & phylo_scaleGroups
99 .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_scalePeriod))
101 -- 2) add the curLvl + 1 PhyloScale to the phylo
102 $ addPhyloScale (curLvl + 1)
103 -- 1) update the current groups (with level parent pointers) in the phylo
104 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
110 toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
111 toPairs strategy groups = case strategy of
112 MergeRegularGroups -> pairs
113 $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
114 MergeAllGroups -> pairs groups
116 pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
117 pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
120 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
121 toDiamonds groups = foldl' (\acc groups' ->
123 $ Map.filter (\v -> length v > 1)
126 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
128 $ Map.filter (\v -> length v > 1)
130 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
133 groupsToEdges :: PhyloSimilarity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
134 groupsToEdges prox sync nbDocs diago groups =
136 ByProximityThreshold thr sens _ strat ->
137 filter (\(_,w) -> w >= thr)
139 $ toPairs strat groups
140 ByProximityDistribution sens strat ->
141 let diamonds = sortOn snd
142 $ toEdges sens $ concat
143 $ map (\gs -> toPairs strat gs) $ toDiamonds groups
144 in take (div (length diamonds) 2) diamonds
146 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
149 WeightedLogJaccard _ _ -> map (\(g,g') ->
150 ((g,g'), weightedLogJaccard' (sens) nbDocs diago
151 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
152 WeightedLogSim _ _ -> map (\(g,g') ->
153 ((g,g'), weightedLogJaccard' (1 / sens) nbDocs diago
154 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
158 toParentId :: PhyloGroup -> PhyloGroupId
159 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupScale + 1), child ^. phylo_groupIndex)
162 reduceGroups :: PhyloSimilarity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
163 reduceGroups prox sync docs diagos branch =
164 -- 1) reduce a branch as a set of periods & groups
165 let periods = fromListWith (++)
166 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
167 in (concat . concat . elems)
168 -- TODO : ajouter un parallelisme
169 $ mapWithKey (\prd groups ->
170 -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
171 let diago = reduceDiagos $ filterDiago diagos [prd]
172 edgesLeft = fromList $ groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
173 edgesRight = fromList $ map (\((k1,k2),v) -> ((k2,k1),v))
174 $ groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago (reverse groups)
175 mergedEdges = Map.toList
176 $ unionWith (\v1 v2 -> if v1 >= v2
178 else v2) edgesLeft edgesRight
179 -- 3) reduce the graph a a set of related components
180 clusters = toRelatedComponents groups mergedEdges
182 -- 4) add to each groups their futur level parent group
183 let parentId = toParentId (head' "parentId" comp)
184 in map (\g -> g & phylo_groupScaleParents %~ (++ [(parentId,1)]) ) comp )
187 chooseClusteringStrategy :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
188 chooseClusteringStrategy sync branches = case sync of
189 ByProximityThreshold _ _ scope _ -> case scope of
190 SingleBranch -> branches
191 SiblingBranches -> groupBy (\g g' -> (last' "chooseClusteringStrategy" $ (g ^. phylo_groupMeta) ! "breaks")
192 == (last' "chooseClusteringStrategy" $ (g' ^. phylo_groupMeta) ! "breaks"))
193 $ sortOn _phylo_groupBranchId $ concat branches
194 AllBranches -> [concat branches]
195 ByProximityDistribution _ _ -> branches
198 levelUpAncestors :: [PhyloGroup] -> [PhyloGroup]
199 levelUpAncestors groups =
200 -- 1) create an associative map of (old,new) ids
201 let ids' = fromList $ map (\g -> (getGroupId g, fst $ head' "levelUpAncestors" ( g ^. phylo_groupScaleParents))) groups
203 let id' = ids' ! (getGroupId g)
204 ancestors = g ^. phylo_groupAncestors
205 -- 2) level up the ancestors ids and filter the ones that will be merged
206 ancestors' = filter (\(id,_) -> id /= id') $ map (\(id,w) -> (ids' ! id,w)) ancestors
207 in g & phylo_groupAncestors .~ ancestors'
210 synchronicClustering :: Phylo -> Phylo
211 synchronicClustering phylo =
212 let prox = similarity $ getConfig phylo
213 sync = phyloSynchrony $ getConfig phylo
214 docs = getDocsByDate phylo
215 diagos = map coocToDiago $ getCoocByDate phylo
216 newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
217 $ map processDynamics
218 $ chooseClusteringStrategy sync
220 $ traceSynchronyStart phylo
221 newBranches' = newBranches `using` parList rdeepseq
222 in toNextScale phylo $ levelUpAncestors $ concat newBranches'
225 -- synchronicDistance :: Phylo -> Level -> String
226 -- synchronicDistance phylo lvl =
227 -- foldl' (\acc branch ->
228 -- acc <> (foldl' (\acc' period ->
229 -- acc' <> let prox = phyloProximity $ getConfig phylo
230 -- sync = phyloSynchrony $ getConfig phylo
231 -- docs = _phylo_timeDocs phylo
232 -- prd = _phylo_groupPeriod $ head' "distance" period
233 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
234 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
235 -- in foldl' (\mem (_,w) ->
241 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
242 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo