2 Module : Gargantext.Core.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.SynchronicClustering where
14 -- import Debug.Trace (trace)
15 import Control.Lens hiding (Level)
16 import Control.Monad (sequence)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, groupBy)
19 import Data.Map.Strict (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloExport (processDynamics)
22 import Gargantext.Core.Viz.Phylo.PhyloTools
23 import Gargantext.Core.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
24 import Gargantext.Prelude
25 import qualified Data.Map.Strict as Map
28 -------------------------
29 -- | New Level Maker | --
30 -------------------------
32 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
33 mergeGroups coocs id mapIds childs =
34 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
35 in PhyloGroup (fst $ fst id) (_phylo_groupPeriod' $ head' "mergeGroups" childs)
36 (snd $ fst id) (snd id) ""
37 (sum $ map _phylo_groupSupport childs)
39 $ map _phylo_groupWeight childs)
40 (concat $ map _phylo_groupSources childs)
42 (ngramsToCooc ngrams coocs)
44 (mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
45 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
46 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
47 (mergeAncestors $ concat $ map _phylo_groupAncestors childs)
48 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryParents childs)
49 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryChilds childs)
53 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
55 updatePointers :: [Pointer] -> [Pointer]
56 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
57 updatePointers' :: [Pointer'] -> [Pointer']
58 updatePointers' pointers = map (\(pId,(t,w)) -> (mapIds ! pId,(t,w))) pointers
60 mergeAncestors :: [Pointer] -> [Pointer]
61 mergeAncestors pointers = Map.toList $ fromListWith max pointers
63 addPhyloScale :: Scale -> Phylo -> Phylo
64 addPhyloScale lvl phylo =
65 over ( phylo_periods . traverse )
66 (\phyloPrd -> phyloPrd & phylo_periodScales
67 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl)
68 (PhyloScale (phyloPrd ^. phylo_periodPeriod) (phyloPrd ^. phylo_periodPeriodStr) lvl empty))) phylo
71 toNextScale :: Phylo -> [PhyloGroup] -> Phylo
72 toNextScale phylo groups =
73 let curLvl = getLastLevel phylo
74 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
75 newGroups = concat $ groupsToBranches'
76 $ fromList $ map (\g -> (getGroupId g, g))
77 $ foldlWithKey (\acc id groups' ->
78 -- 4) create the parent group
79 let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
80 in acc ++ [parent]) []
81 -- 3) group the current groups by parentId
82 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
84 newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
86 $ over ( phylo_periods . traverse . phylo_periodScales . traverse
87 -- 6) update each period at curLvl + 1
88 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == (curLvl + 1)))
89 -- 7) by adding the parents
91 if member (phyloLvl ^. phylo_scalePeriod) newPeriods
92 then phyloLvl & phylo_scaleGroups
93 .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_scalePeriod))
95 -- 2) add the curLvl + 1 PhyloScale to the phylo
96 $ addPhyloScale (curLvl + 1)
97 -- 1) update the current groups (with level parent pointers) in the phylo
98 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
104 toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
105 toPairs strategy groups = case strategy of
106 MergeRegularGroups -> pairs
107 $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
108 MergeAllGroups -> pairs groups
110 pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
111 pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
114 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
115 toDiamonds groups = foldl' (\acc groups' ->
117 $ Map.filter (\v -> length v > 1)
120 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
122 $ Map.filter (\v -> length v > 1)
124 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
127 groupsToEdges :: Proximity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
128 groupsToEdges prox sync nbDocs diago groups =
130 ByProximityThreshold thr sens _ strat ->
131 filter (\(_,w) -> w >= thr)
133 $ toPairs strat groups
134 ByProximityDistribution sens strat ->
135 let diamonds = sortOn snd
136 $ toEdges sens $ concat
137 $ map (\gs -> toPairs strat gs) $ toDiamonds groups
138 in take (div (length diamonds) 2) diamonds
140 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
143 WeightedLogJaccard _ _ -> map (\(g,g') ->
144 ((g,g'), weightedLogJaccard' (sens) nbDocs diago
145 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
146 WeightedLogSim _ _ -> map (\(g,g') ->
147 ((g,g'), weightedLogJaccard' (1 / sens) nbDocs diago
148 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
152 toParentId :: PhyloGroup -> PhyloGroupId
153 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupScale + 1), child ^. phylo_groupIndex)
156 reduceGroups :: Proximity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
157 reduceGroups prox sync docs diagos branch =
158 -- 1) reduce a branch as a set of periods & groups
159 let periods = fromListWith (++)
160 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
161 in (concat . concat . elems)
162 -- TODO : ajouter un parallelisme
163 $ mapWithKey (\prd groups ->
164 -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
165 let diago = reduceDiagos $ filterDiago diagos [prd]
166 edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
168 -- 4) add to each groups their futur level parent group
169 let parentId = toParentId (head' "parentId" comp)
170 in map (\g -> g & phylo_groupScaleParents %~ (++ [(parentId,1)]) ) comp )
171 -- 3) reduce the graph a a set of related components
172 $ toRelatedComponents groups edges) periods
175 chooseClusteringStrategy :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
176 chooseClusteringStrategy sync branches = case sync of
177 ByProximityThreshold _ _ scope _ -> case scope of
178 SingleBranch -> branches
179 SiblingBranches -> groupBy (\g g' -> (last' "chooseClusteringStrategy" $ (g ^. phylo_groupMeta) ! "breaks")
180 == (last' "chooseClusteringStrategy" $ (g' ^. phylo_groupMeta) ! "breaks"))
181 $ sortOn _phylo_groupBranchId $ concat branches
182 AllBranches -> [concat branches]
183 ByProximityDistribution _ _ -> branches
186 levelUpAncestors :: [PhyloGroup] -> [PhyloGroup]
187 levelUpAncestors groups =
188 -- 1) create an associative map of (old,new) ids
189 let ids' = fromList $ map (\g -> (getGroupId g, fst $ head' "levelUpAncestors" ( g ^. phylo_groupScaleParents))) groups
191 let id' = ids' ! (getGroupId g)
192 ancestors = g ^. phylo_groupAncestors
193 -- 2) level up the ancestors ids and filter the ones that will be merged
194 ancestors' = filter (\(id,_) -> id /= id') $ map (\(id,w) -> (ids' ! id,w)) ancestors
195 in g & phylo_groupAncestors .~ ancestors'
198 synchronicClustering :: Phylo -> Phylo
199 synchronicClustering phylo =
200 let prox = phyloProximity $ getConfig phylo
201 sync = phyloSynchrony $ getConfig phylo
202 docs = phylo ^. phylo_timeDocs
203 diagos = map coocToDiago $ phylo ^. phylo_timeCooc
204 newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
205 $ map processDynamics
206 $ chooseClusteringStrategy sync
208 $ traceSynchronyStart phylo
209 newBranches' = newBranches `using` parList rdeepseq
210 in toNextScale phylo $ levelUpAncestors $ concat newBranches'
213 -- synchronicSimilarity :: Phylo -> Level -> String
214 -- synchronicSimilarity phylo lvl =
215 -- foldl' (\acc branch ->
216 -- acc <> (foldl' (\acc' period ->
217 -- acc' <> let prox = phyloProximity $ getConfig phylo
218 -- sync = phyloSynchrony $ getConfig phylo
219 -- docs = _phylo_timeDocs phylo
220 -- prd = _phylo_groupPeriod $ head' "distance" period
221 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
222 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
223 -- in foldl' (\mem (_,w) ->
229 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
230 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo