]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/SynchronicClustering.hs
[upload zip] some more work on zipfile parsing
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / SynchronicClustering.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.SynchronicClustering where
13
14 import Gargantext.Prelude
15 import Gargantext.Core.Viz.AdaptativePhylo
16 import Gargantext.Core.Viz.Phylo.PhyloTools
17 import Gargantext.Core.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
18 import Gargantext.Core.Viz.Phylo.PhyloExport (processDynamics)
19
20 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, groupBy)
21 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
22
23 import Control.Lens hiding (Level)
24 import Control.Parallel.Strategies (parList, rdeepseq, using)
25 import Control.Monad (sequence)
26 -- import Debug.Trace (trace)
27
28 import qualified Data.Map as Map
29
30
31 -------------------------
32 -- | New Level Maker | --
33 -------------------------
34
35
36 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
37 mergeGroups coocs id mapIds childs =
38 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
39 in PhyloGroup (fst $ fst id) (_phylo_groupPeriod' $ head' "mergeGroups" childs)
40 (snd $ fst id) (snd id) ""
41 (sum $ map _phylo_groupSupport childs)
42 (fmap sum $ sequence
43 $ map _phylo_groupWeight childs)
44 (concat $ map _phylo_groupSources childs)
45 ngrams
46 (ngramsToCooc ngrams coocs)
47 ((snd $ fst id),bId)
48 (mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
49 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
50 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
51 (mergeAncestors $ concat $ map _phylo_groupAncestors childs)
52 where
53 --------------------
54 bId :: [Int]
55 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
56 --------------------
57 updatePointers :: [Pointer] -> [Pointer]
58 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
59 --------------------
60 mergeAncestors :: [Pointer] -> [Pointer]
61 mergeAncestors pointers = Map.toList $ fromListWith max pointers
62
63 addPhyloLevel :: Level -> Phylo -> Phylo
64 addPhyloLevel lvl phylo =
65 over ( phylo_periods . traverse )
66 (\phyloPrd -> phyloPrd & phylo_periodLevels
67 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl)
68 (PhyloLevel (phyloPrd ^. phylo_periodPeriod) (phyloPrd ^. phylo_periodPeriod') lvl empty))) phylo
69
70
71 toNextLevel' :: Phylo -> [PhyloGroup] -> Phylo
72 toNextLevel' phylo groups =
73 let curLvl = getLastLevel phylo
74 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
75 newGroups = concat $ groupsToBranches
76 $ fromList $ map (\g -> (getGroupId g, g))
77 $ foldlWithKey (\acc id groups' ->
78 -- 4) create the parent group
79 let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
80 in acc ++ [parent]) []
81 -- 3) group the current groups by parentId
82 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
83
84 newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
85 in traceSynchronyEnd
86 $ over ( phylo_periods . traverse . phylo_periodLevels . traverse
87 -- 6) update each period at curLvl + 1
88 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
89 -- 7) by adding the parents
90 (\phyloLvl ->
91 if member (phyloLvl ^. phylo_levelPeriod) newPeriods
92 then phyloLvl & phylo_levelGroups
93 .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_levelPeriod))
94 else phyloLvl)
95 -- 2) add the curLvl + 1 phyloLevel to the phylo
96 $ addPhyloLevel (curLvl + 1)
97 -- 1) update the current groups (with level parent pointers) in the phylo
98 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
99
100 --------------------
101 -- | Clustering | --
102 --------------------
103
104 toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
105 toPairs strategy groups = case strategy of
106 MergeRegularGroups -> pairs
107 $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
108 MergeAllGroups -> pairs groups
109 where
110 pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
111 pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
112
113
114 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
115 toDiamonds groups = foldl' (\acc groups' ->
116 acc ++ ( elems
117 $ Map.filter (\v -> length v > 1)
118 $ fromListWith (++)
119 $ foldl' (\acc' g ->
120 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
121 $ elems
122 $ Map.filter (\v -> length v > 1)
123 $ fromListWith (++)
124 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
125
126
127 groupsToEdges :: Proximity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
128 groupsToEdges prox sync nbDocs diago groups =
129 case sync of
130 ByProximityThreshold thr sens _ strat ->
131 filter (\(_,w) -> w >= thr)
132 $ toEdges sens
133 $ toPairs strat groups
134 ByProximityDistribution sens strat ->
135 let diamonds = sortOn snd
136 $ toEdges sens $ concat
137 $ map (\gs -> toPairs strat gs) $ toDiamonds groups
138 in take (div (length diamonds) 2) diamonds
139 where
140 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
141 toEdges sens edges =
142 case prox of
143 WeightedLogJaccard _ -> map (\(g,g') ->
144 ((g,g'), weightedLogJaccard' (sens) nbDocs diago
145 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
146 WeightedLogSim _ -> map (\(g,g') ->
147 ((g,g'), weightedLogJaccard' (1 / sens) nbDocs diago
148 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
149
150 _ -> undefined
151
152 toParentId :: PhyloGroup -> PhyloGroupId
153 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
154
155
156 reduceGroups :: Proximity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
157 reduceGroups prox sync docs diagos branch =
158 -- 1) reduce a branch as a set of periods & groups
159 let periods = fromListWith (++)
160 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
161 in (concat . concat . elems)
162 $ mapWithKey (\prd groups ->
163 -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
164 let diago = reduceDiagos $ filterDiago diagos [prd]
165 edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
166 in map (\comp ->
167 -- 4) add to each groups their futur level parent group
168 let parentId = toParentId (head' "parentId" comp)
169 in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
170 -- 3) reduce the graph a a set of related components
171 $ toRelatedComponents groups edges) periods
172
173
174 adjustClustering :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
175 adjustClustering sync branches = case sync of
176 ByProximityThreshold _ _ scope _ -> case scope of
177 SingleBranch -> branches
178 SiblingBranches -> groupBy (\g g' -> (last' "adjustClustering" $ (g ^. phylo_groupMeta) ! "breaks")
179 == (last' "adjustClustering" $ (g' ^. phylo_groupMeta) ! "breaks"))
180 $ sortOn _phylo_groupBranchId $ concat branches
181 AllBranches -> [concat branches]
182 ByProximityDistribution _ _ -> branches
183
184
185 levelUpAncestors :: [PhyloGroup] -> [PhyloGroup]
186 levelUpAncestors groups =
187 -- 1) create an associative map of (old,new) ids
188 let ids' = fromList $ map (\g -> (getGroupId g, fst $ head' "levelUpAncestors" ( g ^. phylo_groupLevelParents))) groups
189 in map (\g ->
190 let id' = ids' ! (getGroupId g)
191 ancestors = g ^. phylo_groupAncestors
192 -- 2) level up the ancestors ids and filter the ones that will be merged
193 ancestors' = filter (\(id,_) -> id /= id') $ map (\(id,w) -> (ids' ! id,w)) ancestors
194 in g & phylo_groupAncestors .~ ancestors'
195 ) groups
196
197 synchronicClustering :: Phylo -> Phylo
198 synchronicClustering phylo =
199 let prox = phyloProximity $ getConfig phylo
200 sync = phyloSynchrony $ getConfig phylo
201 docs = phylo ^. phylo_timeDocs
202 diagos = map coocToDiago $ phylo ^. phylo_timeCooc
203 newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
204 $ map processDynamics
205 $ adjustClustering sync
206 $ phyloToLastBranches
207 $ traceSynchronyStart phylo
208 newBranches' = newBranches `using` parList rdeepseq
209 in toNextLevel' phylo $ levelUpAncestors $ concat newBranches'
210
211
212 -- synchronicDistance :: Phylo -> Level -> String
213 -- synchronicDistance phylo lvl =
214 -- foldl' (\acc branch ->
215 -- acc <> (foldl' (\acc' period ->
216 -- acc' <> let prox = phyloProximity $ getConfig phylo
217 -- sync = phyloSynchrony $ getConfig phylo
218 -- docs = _phylo_timeDocs phylo
219 -- prd = _phylo_groupPeriod $ head' "distance" period
220 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
221 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
222 -- in foldl' (\mem (_,w) ->
223 -- mem <> show (prd)
224 -- <> "\t"
225 -- <> show (w)
226 -- <> "\n"
227 -- ) "" edges
228 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
229 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo