]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/SynchronicClustering.hs
Merge branch 'dev' into 571-dev-node-corpus-api-search-fixes-take-2
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / SynchronicClustering.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.SynchronicClustering where
13
14 -- import Debug.Trace (trace)
15 import Control.Lens hiding (Level)
16 import Control.Monad (sequence)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, groupBy)
19 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member, unionWith)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloExport (processDynamics)
22 import Gargantext.Core.Viz.Phylo.PhyloTools
23 import Gargantext.Core.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
24 import Gargantext.Prelude
25 import qualified Data.Map as Map
26
27
28 -------------------------
29 -- | New Level Maker | --
30 -------------------------
31
32 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
33 mergeGroups coocs id mapIds childs =
34 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
35 counts = foldl (\acc count -> unionWith (+) acc count) empty $ map _phylo_groupRootsCount childs
36 in PhyloGroup (fst $ fst id) (_phylo_groupPeriod' $ head' "mergeGroups" childs)
37 (snd $ fst id) (snd id) ""
38 (sum $ map _phylo_groupSupport childs)
39 (fmap sum $ sequence
40 $ map _phylo_groupWeight childs)
41 (concat $ map _phylo_groupSources childs)
42 ngrams
43 (ngramsToCooc ngrams coocs)
44 (ngramsToDensity ngrams coocs counts)
45 -- todo add density here
46 ((snd $ fst id),bId)
47 (mergeMeta bId childs)
48 counts
49 [] (map (\g -> (getGroupId g, 1)) childs)
50 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
51 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
52 (mergeAncestors $ concat $ map _phylo_groupAncestors childs)
53 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryParents childs)
54 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryChilds childs)
55 where
56 --------------------
57 bId :: [Int]
58 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
59 --------------------
60 updatePointers :: [Pointer] -> [Pointer]
61 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
62 updatePointers' :: [Pointer'] -> [Pointer']
63 updatePointers' pointers = map (\(pId,(t,w)) -> (mapIds ! pId,(t,w))) pointers
64 --------------------
65 mergeAncestors :: [Pointer] -> [Pointer]
66 mergeAncestors pointers = Map.toList $ fromListWith max pointers
67
68 addPhyloScale :: Scale -> Phylo -> Phylo
69 addPhyloScale lvl phylo =
70 over ( phylo_periods . traverse )
71 (\phyloPrd -> phyloPrd & phylo_periodScales
72 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl)
73 (PhyloScale (phyloPrd ^. phylo_periodPeriod) (phyloPrd ^. phylo_periodPeriodStr) lvl empty))) phylo
74
75
76 toNextScale :: Phylo -> [PhyloGroup] -> Phylo
77 toNextScale phylo groups =
78 let curLvl = getLastLevel phylo
79 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
80 newGroups = concat $ groupsToBranches'
81 $ fromList $ map (\g -> (getGroupId g, g))
82 $ foldlWithKey (\acc id groups' ->
83 -- 4) create the parent group
84 let parent = mergeGroups (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
85 in acc ++ [parent]) []
86 -- 3) group the current groups by parentId
87 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
88
89 newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
90 in traceSynchronyEnd
91 $ over ( phylo_periods . traverse . phylo_periodScales . traverse
92 -- 6) update each period at curLvl + 1
93 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == (curLvl + 1)))
94 -- 7) by adding the parents
95 (\phyloLvl ->
96 if member (phyloLvl ^. phylo_scalePeriod) newPeriods
97 then phyloLvl & phylo_scaleGroups
98 .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_scalePeriod))
99 else phyloLvl)
100 -- 2) add the curLvl + 1 PhyloScale to the phylo
101 $ addPhyloScale (curLvl + 1)
102 -- 1) update the current groups (with level parent pointers) in the phylo
103 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
104
105 --------------------
106 -- | Clustering | --
107 --------------------
108
109 toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
110 toPairs strategy groups = case strategy of
111 MergeRegularGroups -> pairs
112 $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
113 MergeAllGroups -> pairs groups
114 where
115 pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
116 pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
117
118
119 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
120 toDiamonds groups = foldl' (\acc groups' ->
121 acc ++ ( elems
122 $ Map.filter (\v -> length v > 1)
123 $ fromListWith (++)
124 $ foldl' (\acc' g ->
125 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
126 $ elems
127 $ Map.filter (\v -> length v > 1)
128 $ fromListWith (++)
129 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
130
131
132 groupsToEdges :: PhyloSimilarity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
133 groupsToEdges prox sync nbDocs diago groups =
134 case sync of
135 ByProximityThreshold thr sens _ strat ->
136 filter (\(_,w) -> w >= thr)
137 $ toEdges sens
138 $ toPairs strat groups
139 ByProximityDistribution sens strat ->
140 let diamonds = sortOn snd
141 $ toEdges sens $ concat
142 $ map (\gs -> toPairs strat gs) $ toDiamonds groups
143 in take (div (length diamonds) 2) diamonds
144 where
145 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
146 toEdges sens edges =
147 case prox of
148 WeightedLogJaccard _ _ -> map (\(g,g') ->
149 ((g,g'), weightedLogJaccard' (sens) nbDocs diago
150 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
151 WeightedLogSim _ _ -> map (\(g,g') ->
152 ((g,g'), weightedLogJaccard' (1 / sens) nbDocs diago
153 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
154
155 _ -> undefined
156
157 toParentId :: PhyloGroup -> PhyloGroupId
158 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupScale + 1), child ^. phylo_groupIndex)
159
160
161 reduceGroups :: PhyloSimilarity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
162 reduceGroups prox sync docs diagos branch =
163 -- 1) reduce a branch as a set of periods & groups
164 let periods = fromListWith (++)
165 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
166 in (concat . concat . elems)
167 -- TODO : ajouter un parallelisme
168 $ mapWithKey (\prd groups ->
169 -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
170 let diago = reduceDiagos $ filterDiago diagos [prd]
171 edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
172 in map (\comp ->
173 -- 4) add to each groups their futur level parent group
174 let parentId = toParentId (head' "parentId" comp)
175 in map (\g -> g & phylo_groupScaleParents %~ (++ [(parentId,1)]) ) comp )
176 -- 3) reduce the graph a a set of related components
177 $ toRelatedComponents groups edges) periods
178
179
180 chooseClusteringStrategy :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
181 chooseClusteringStrategy sync branches = case sync of
182 ByProximityThreshold _ _ scope _ -> case scope of
183 SingleBranch -> branches
184 SiblingBranches -> groupBy (\g g' -> (last' "chooseClusteringStrategy" $ (g ^. phylo_groupMeta) ! "breaks")
185 == (last' "chooseClusteringStrategy" $ (g' ^. phylo_groupMeta) ! "breaks"))
186 $ sortOn _phylo_groupBranchId $ concat branches
187 AllBranches -> [concat branches]
188 ByProximityDistribution _ _ -> branches
189
190
191 levelUpAncestors :: [PhyloGroup] -> [PhyloGroup]
192 levelUpAncestors groups =
193 -- 1) create an associative map of (old,new) ids
194 let ids' = fromList $ map (\g -> (getGroupId g, fst $ head' "levelUpAncestors" ( g ^. phylo_groupScaleParents))) groups
195 in map (\g ->
196 let id' = ids' ! (getGroupId g)
197 ancestors = g ^. phylo_groupAncestors
198 -- 2) level up the ancestors ids and filter the ones that will be merged
199 ancestors' = filter (\(id,_) -> id /= id') $ map (\(id,w) -> (ids' ! id,w)) ancestors
200 in g & phylo_groupAncestors .~ ancestors'
201 ) groups
202
203 synchronicClustering :: Phylo -> Phylo
204 synchronicClustering phylo =
205 let prox = similarity $ getConfig phylo
206 sync = phyloSynchrony $ getConfig phylo
207 docs = getDocsByDate phylo
208 diagos = map coocToDiago $ getCoocByDate phylo
209 newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
210 $ map processDynamics
211 $ chooseClusteringStrategy sync
212 $ phyloLastScale
213 $ traceSynchronyStart phylo
214 newBranches' = newBranches `using` parList rdeepseq
215 in toNextScale phylo $ levelUpAncestors $ concat newBranches'
216
217
218 -- synchronicDistance :: Phylo -> Level -> String
219 -- synchronicDistance phylo lvl =
220 -- foldl' (\acc branch ->
221 -- acc <> (foldl' (\acc' period ->
222 -- acc' <> let prox = phyloProximity $ getConfig phylo
223 -- sync = phyloSynchrony $ getConfig phylo
224 -- docs = _phylo_timeDocs phylo
225 -- prd = _phylo_groupPeriod $ head' "distance" period
226 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
227 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
228 -- in foldl' (\mem (_,w) ->
229 -- mem <> show (prd)
230 -- <> "\t"
231 -- <> show (w)
232 -- <> "\n"
233 -- ) "" edges
234 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
235 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo