]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/SynchronicClustering.hs
Merge branch 'dev-phylo' of https://gitlab.iscpif.fr/gargantext/haskell-gargantext...
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / SynchronicClustering.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.SynchronicClustering where
13
14 -- import Debug.Trace (trace)
15 import Control.Lens hiding (Level)
16 import Control.Monad (sequence)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, groupBy)
19 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloExport (processDynamics)
22 import Gargantext.Core.Viz.Phylo.PhyloTools
23 import Gargantext.Core.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
24 import Gargantext.Prelude
25 import qualified Data.Map as Map
26
27
28 -------------------------
29 -- | New Level Maker | --
30 -------------------------
31
32 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
33 mergeGroups coocs id mapIds childs =
34 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
35 in PhyloGroup (fst $ fst id) (_phylo_groupPeriod' $ head' "mergeGroups" childs)
36 (snd $ fst id) (snd id) ""
37 (sum $ map _phylo_groupSupport childs)
38 (fmap sum $ sequence
39 $ map _phylo_groupWeight childs)
40 (concat $ map _phylo_groupSources childs)
41 ngrams
42 (ngramsToCooc ngrams coocs)
43 ((snd $ fst id),bId)
44 (mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
45 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
46 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
47 (mergeAncestors $ concat $ map _phylo_groupAncestors childs)
48 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryParents childs)
49 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryChilds childs)
50 where
51 --------------------
52 bId :: [Int]
53 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
54 --------------------
55 updatePointers :: [Pointer] -> [Pointer]
56 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
57 updatePointers' :: [Pointer'] -> [Pointer']
58 updatePointers' pointers = map (\(pId,(t,w)) -> (mapIds ! pId,(t,w))) pointers
59 --------------------
60 mergeAncestors :: [Pointer] -> [Pointer]
61 mergeAncestors pointers = Map.toList $ fromListWith max pointers
62
63 addPhyloScale :: Scale -> Phylo -> Phylo
64 addPhyloScale lvl phylo =
65 over ( phylo_periods . traverse )
66 (\phyloPrd -> phyloPrd & phylo_periodScales
67 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl)
68 (PhyloScale (phyloPrd ^. phylo_periodPeriod) (phyloPrd ^. phylo_periodPeriodStr) lvl empty))) phylo
69
70
71 toNextScale :: Phylo -> [PhyloGroup] -> Phylo
72 toNextScale phylo groups =
73 let curLvl = getLastLevel phylo
74 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
75 newGroups = concat $ groupsToBranches'
76 $ fromList $ map (\g -> (getGroupId g, g))
77 $ foldlWithKey (\acc id groups' ->
78 -- 4) create the parent group
79 let parent = mergeGroups (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
80 in acc ++ [parent]) []
81 -- 3) group the current groups by parentId
82 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
83
84 newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
85 in traceSynchronyEnd
86 $ over ( phylo_periods . traverse . phylo_periodScales . traverse
87 -- 6) update each period at curLvl + 1
88 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == (curLvl + 1)))
89 -- 7) by adding the parents
90 (\phyloLvl ->
91 if member (phyloLvl ^. phylo_scalePeriod) newPeriods
92 then phyloLvl & phylo_scaleGroups
93 .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_scalePeriod))
94 else phyloLvl)
95 -- 2) add the curLvl + 1 PhyloScale to the phylo
96 $ addPhyloScale (curLvl + 1)
97 -- 1) update the current groups (with level parent pointers) in the phylo
98 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
99
100 --------------------
101 -- | Clustering | --
102 --------------------
103
104 toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
105 toPairs strategy groups = case strategy of
106 MergeRegularGroups -> pairs
107 $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
108 MergeAllGroups -> pairs groups
109 where
110 pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
111 pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
112
113
114 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
115 toDiamonds groups = foldl' (\acc groups' ->
116 acc ++ ( elems
117 $ Map.filter (\v -> length v > 1)
118 $ fromListWith (++)
119 $ foldl' (\acc' g ->
120 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
121 $ elems
122 $ Map.filter (\v -> length v > 1)
123 $ fromListWith (++)
124 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
125
126
127 groupsToEdges :: Similarity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
128 groupsToEdges prox sync nbDocs diago groups =
129 case sync of
130 ByProximityThreshold thr sens _ strat ->
131 filter (\(_,w) -> w >= thr)
132 $ toEdges sens
133 $ toPairs strat groups
134 ByProximityDistribution sens strat ->
135 let diamonds = sortOn snd
136 $ toEdges sens $ concat
137 $ map (\gs -> toPairs strat gs) $ toDiamonds groups
138 in take (div (length diamonds) 2) diamonds
139 where
140 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
141 toEdges sens edges =
142 case prox of
143 WeightedLogJaccard _ _ -> map (\(g,g') ->
144 ((g,g'), weightedLogJaccard' (sens) nbDocs diago
145 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
146 WeightedLogSim _ _ -> map (\(g,g') ->
147 ((g,g'), weightedLogJaccard' (1 / sens) nbDocs diago
148 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
149
150 _ -> undefined
151
152 toParentId :: PhyloGroup -> PhyloGroupId
153 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupScale + 1), child ^. phylo_groupIndex)
154
155
156 reduceGroups :: Similarity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
157 reduceGroups prox sync docs diagos branch =
158 -- 1) reduce a branch as a set of periods & groups
159 let periods = fromListWith (++)
160 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
161 in (concat . concat . elems)
162 -- TODO : ajouter un parallelisme
163 $ mapWithKey (\prd groups ->
164 -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
165 let diago = reduceDiagos $ filterDiago diagos [prd]
166 edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
167 in map (\comp ->
168 -- 4) add to each groups their futur level parent group
169 let parentId = toParentId (head' "parentId" comp)
170 in map (\g -> g & phylo_groupScaleParents %~ (++ [(parentId,1)]) ) comp )
171 -- 3) reduce the graph a a set of related components
172 $ toRelatedComponents groups edges) periods
173
174
175 chooseClusteringStrategy :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
176 chooseClusteringStrategy sync branches = case sync of
177 ByProximityThreshold _ _ scope _ -> case scope of
178 SingleBranch -> branches
179 SiblingBranches -> groupBy (\g g' -> (last' "chooseClusteringStrategy" $ (g ^. phylo_groupMeta) ! "breaks")
180 == (last' "chooseClusteringStrategy" $ (g' ^. phylo_groupMeta) ! "breaks"))
181 $ sortOn _phylo_groupBranchId $ concat branches
182 AllBranches -> [concat branches]
183 ByProximityDistribution _ _ -> branches
184
185
186 levelUpAncestors :: [PhyloGroup] -> [PhyloGroup]
187 levelUpAncestors groups =
188 -- 1) create an associative map of (old,new) ids
189 let ids' = fromList $ map (\g -> (getGroupId g, fst $ head' "levelUpAncestors" ( g ^. phylo_groupScaleParents))) groups
190 in map (\g ->
191 let id' = ids' ! (getGroupId g)
192 ancestors = g ^. phylo_groupAncestors
193 -- 2) level up the ancestors ids and filter the ones that will be merged
194 ancestors' = filter (\(id,_) -> id /= id') $ map (\(id,w) -> (ids' ! id,w)) ancestors
195 in g & phylo_groupAncestors .~ ancestors'
196 ) groups
197
198 synchronicClustering :: Phylo -> Phylo
199 synchronicClustering phylo =
200 let prox = similarity $ getConfig phylo
201 sync = phyloSynchrony $ getConfig phylo
202 docs = getDocsByDate phylo
203 diagos = map coocToDiago $ getCoocByDate phylo
204 newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
205 $ map processDynamics
206 $ chooseClusteringStrategy sync
207 $ phyloLastScale
208 $ traceSynchronyStart phylo
209 newBranches' = newBranches `using` parList rdeepseq
210 in toNextScale phylo $ levelUpAncestors $ concat newBranches'
211
212
213 -- synchronicDistance :: Phylo -> Level -> String
214 -- synchronicDistance phylo lvl =
215 -- foldl' (\acc branch ->
216 -- acc <> (foldl' (\acc' period ->
217 -- acc' <> let prox = phyloProximity $ getConfig phylo
218 -- sync = phyloSynchrony $ getConfig phylo
219 -- docs = _phylo_timeDocs phylo
220 -- prd = _phylo_groupPeriod $ head' "distance" period
221 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
222 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
223 -- in foldl' (\mem (_,w) ->
224 -- mem <> show (prd)
225 -- <> "\t"
226 -- <> show (w)
227 -- <> "\n"
228 -- ) "" edges
229 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
230 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo