]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/SynchronicClustering.hs
Merge remote-tracking branch 'origin/dev-phylo' into dev
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / SynchronicClustering.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.SynchronicClustering where
13
14 -- import Debug.Trace (trace)
15 import Control.Lens hiding (Level)
16 import Control.Monad (sequence)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, groupBy)
19 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member, unionWith)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloExport (processDynamics)
22 import Gargantext.Core.Viz.Phylo.PhyloTools
23 import Gargantext.Core.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
24 import Gargantext.Prelude
25 -- import Debug.Trace (trace)
26 import qualified Data.Map as Map
27
28
29 -------------------------
30 -- | New Level Maker | --
31 -------------------------
32
33 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
34 mergeGroups coocs id mapIds childs =
35 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
36 counts = foldl (\acc count -> unionWith (+) acc count) empty $ map _phylo_groupRootsCount childs
37 in PhyloGroup (fst $ fst id) (_phylo_groupPeriod' $ head' "mergeGroups" childs)
38 (snd $ fst id) (snd id) ""
39 (sum $ map _phylo_groupSupport childs)
40 (fmap sum $ sequence
41 $ map _phylo_groupWeight childs)
42 (concat $ map _phylo_groupSources childs)
43 ngrams
44 (ngramsToCooc ngrams coocs)
45 (ngramsToDensity ngrams coocs counts)
46 -- todo add density here
47 ((snd $ fst id),bId)
48 (mergeMeta bId childs)
49 counts
50 [] (map (\g -> (getGroupId g, 1)) childs)
51 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
52 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
53 (mergeAncestors $ concat $ map _phylo_groupAncestors childs)
54 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryParents childs)
55 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryChilds childs)
56 where
57 --------------------
58 bId :: [Int]
59 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
60 --------------------
61 updatePointers :: [Pointer] -> [Pointer]
62 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
63 updatePointers' :: [Pointer'] -> [Pointer']
64 updatePointers' pointers = map (\(pId,(t,w)) -> (mapIds ! pId,(t,w))) pointers
65 --------------------
66 mergeAncestors :: [Pointer] -> [Pointer]
67 mergeAncestors pointers = Map.toList $ fromListWith max pointers
68
69 addPhyloScale :: Scale -> Phylo -> Phylo
70 addPhyloScale lvl phylo =
71 over ( phylo_periods . traverse )
72 (\phyloPrd -> phyloPrd & phylo_periodScales
73 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl)
74 (PhyloScale (phyloPrd ^. phylo_periodPeriod) (phyloPrd ^. phylo_periodPeriodStr) lvl empty))) phylo
75
76
77 toNextScale :: Phylo -> [PhyloGroup] -> Phylo
78 toNextScale phylo groups =
79 let curLvl = getLastLevel phylo
80 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
81 newGroups = concat $ groupsToBranches'
82 $ fromList $ map (\g -> (getGroupId g, g))
83 $ foldlWithKey (\acc id groups' ->
84 -- 4) create the parent group
85 let parent = mergeGroups (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
86 in acc ++ [parent]) []
87 -- 3) group the current groups by parentId
88 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
89
90 newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
91 in traceSynchronyEnd
92 $ over ( phylo_periods . traverse . phylo_periodScales . traverse
93 -- 6) update each period at curLvl + 1
94 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == (curLvl + 1)))
95 -- 7) by adding the parents
96 (\phyloLvl ->
97 if member (phyloLvl ^. phylo_scalePeriod) newPeriods
98 then phyloLvl & phylo_scaleGroups
99 .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_scalePeriod))
100 else phyloLvl)
101 -- 2) add the curLvl + 1 PhyloScale to the phylo
102 $ addPhyloScale (curLvl + 1)
103 -- 1) update the current groups (with level parent pointers) in the phylo
104 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
105
106 --------------------
107 -- | Clustering | --
108 --------------------
109
110 toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
111 toPairs strategy groups = case strategy of
112 MergeRegularGroups -> pairs
113 $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
114 MergeAllGroups -> pairs groups
115 where
116 pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
117 pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
118
119
120 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
121 toDiamonds groups = foldl' (\acc groups' ->
122 acc ++ ( elems
123 $ Map.filter (\v -> length v > 1)
124 $ fromListWith (++)
125 $ foldl' (\acc' g ->
126 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
127 $ elems
128 $ Map.filter (\v -> length v > 1)
129 $ fromListWith (++)
130 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
131
132
133 groupsToEdges :: PhyloSimilarity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
134 groupsToEdges prox sync nbDocs diago groups =
135 case sync of
136 ByProximityThreshold thr sens _ strat ->
137 filter (\(_,w) -> w >= thr)
138 $ toEdges sens
139 $ toPairs strat groups
140 ByProximityDistribution sens strat ->
141 let diamonds = sortOn snd
142 $ toEdges sens $ concat
143 $ map (\gs -> toPairs strat gs) $ toDiamonds groups
144 in take (div (length diamonds) 2) diamonds
145 where
146 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
147 toEdges sens edges =
148 case prox of
149 WeightedLogJaccard _ _ -> map (\(g,g') ->
150 ((g,g'), weightedLogJaccard' (sens) nbDocs diago
151 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
152 WeightedLogSim _ _ -> map (\(g,g') ->
153 ((g,g'), weightedLogJaccard' (1 / sens) nbDocs diago
154 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
155
156 _ -> undefined
157
158 toParentId :: PhyloGroup -> PhyloGroupId
159 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupScale + 1), child ^. phylo_groupIndex)
160
161
162 reduceGroups :: PhyloSimilarity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
163 reduceGroups prox sync docs diagos branch =
164 -- 1) reduce a branch as a set of periods & groups
165 let periods = fromListWith (++)
166 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
167 in (concat . concat . elems)
168 -- TODO : ajouter un parallelisme
169 $ mapWithKey (\prd groups ->
170 -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
171 let diago = reduceDiagos $ filterDiago diagos [prd]
172 edgesLeft = fromList $ groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
173 edgesRight = fromList $ map (\((k1,k2),v) -> ((k2,k1),v))
174 $ groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago (reverse groups)
175 mergedEdges = Map.toList
176 $ unionWith (\v1 v2 -> if v1 >= v2
177 then v1
178 else v2) edgesLeft edgesRight
179 -- 3) reduce the graph a a set of related components
180 clusters = toRelatedComponents groups mergedEdges
181 in map (\comp ->
182 -- 4) add to each groups their futur level parent group
183 let parentId = toParentId (head' "parentId" comp)
184 in map (\g -> g & phylo_groupScaleParents %~ (++ [(parentId,1)]) ) comp )
185 $ clusters) periods
186
187 chooseClusteringStrategy :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
188 chooseClusteringStrategy sync branches = case sync of
189 ByProximityThreshold _ _ scope _ -> case scope of
190 SingleBranch -> branches
191 SiblingBranches -> groupBy (\g g' -> (last' "chooseClusteringStrategy" $ (g ^. phylo_groupMeta) ! "breaks")
192 == (last' "chooseClusteringStrategy" $ (g' ^. phylo_groupMeta) ! "breaks"))
193 $ sortOn _phylo_groupBranchId $ concat branches
194 AllBranches -> [concat branches]
195 ByProximityDistribution _ _ -> branches
196
197
198 levelUpAncestors :: [PhyloGroup] -> [PhyloGroup]
199 levelUpAncestors groups =
200 -- 1) create an associative map of (old,new) ids
201 let ids' = fromList $ map (\g -> (getGroupId g, fst $ head' "levelUpAncestors" ( g ^. phylo_groupScaleParents))) groups
202 in map (\g ->
203 let id' = ids' ! (getGroupId g)
204 ancestors = g ^. phylo_groupAncestors
205 -- 2) level up the ancestors ids and filter the ones that will be merged
206 ancestors' = filter (\(id,_) -> id /= id') $ map (\(id,w) -> (ids' ! id,w)) ancestors
207 in g & phylo_groupAncestors .~ ancestors'
208 ) groups
209
210 synchronicClustering :: Phylo -> Phylo
211 synchronicClustering phylo =
212 let prox = similarity $ getConfig phylo
213 sync = phyloSynchrony $ getConfig phylo
214 docs = getDocsByDate phylo
215 diagos = map coocToDiago $ getCoocByDate phylo
216 newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
217 $ map processDynamics
218 $ chooseClusteringStrategy sync
219 $ phyloLastScale
220 $ traceSynchronyStart phylo
221 newBranches' = newBranches `using` parList rdeepseq
222 in toNextScale phylo $ levelUpAncestors $ concat newBranches'
223
224
225 -- synchronicDistance :: Phylo -> Level -> String
226 -- synchronicDistance phylo lvl =
227 -- foldl' (\acc branch ->
228 -- acc <> (foldl' (\acc' period ->
229 -- acc' <> let prox = phyloProximity $ getConfig phylo
230 -- sync = phyloSynchrony $ getConfig phylo
231 -- docs = _phylo_timeDocs phylo
232 -- prd = _phylo_groupPeriod $ head' "distance" period
233 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
234 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
235 -- in foldl' (\mem (_,w) ->
236 -- mem <> show (prd)
237 -- <> "\t"
238 -- <> show (w)
239 -- <> "\n"
240 -- ) "" edges
241 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
242 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo