]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/SynchronicClustering.hs
add adjustable clustering by threshold
[gargantext.git] / src / Gargantext / Viz / Phylo / SynchronicClustering.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.SynchronicClustering where
17
18 import Gargantext.Prelude
19 import Gargantext.Viz.AdaptativePhylo
20 import Gargantext.Viz.Phylo.PhyloTools
21 import Gargantext.Viz.Phylo.TemporalMatching (weightedLogJaccard)
22 import Gargantext.Viz.Phylo.PhyloExport (processDynamics)
23
24 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, init, all, group, maximum, groupBy)
25 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member, singleton)
26
27 import Control.Lens hiding (Level)
28 import Control.Parallel.Strategies (parList, rdeepseq, using)
29 -- import Debug.Trace (trace)
30
31 import qualified Data.Map as Map
32 import qualified Data.Set as Set
33
34
35 -------------------------
36 -- | New Level Maker | --
37 -------------------------
38
39 mergeBranchIds :: [[Int]] -> [Int]
40 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq) ids
41 where
42 -- | 2) find the most Up Left ids in the hierarchy of similarity
43 -- mostUpLeft :: [[Int]] -> [[Int]]
44 -- mostUpLeft ids' =
45 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
46 -- inf = (fst . minimum) groupIds
47 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
48 -- | 1) find the most frequent ids
49 mostFreq :: [[Int]] -> [[Int]]
50 mostFreq ids' =
51 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
52 sup = (fst . maximum) groupIds
53 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
54
55
56 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
57 groupsToBranches' groups =
58 -- | run the related component algorithm
59 let egos = groupBy (\gs gs' -> (fst $ fst $ head' "egos" gs) == (fst $ fst $ head' "egos" gs'))
60 $ sortOn (\gs -> fst $ fst $ head' "egos" gs)
61 $ map (\g -> [getGroupId g]
62 ++ (map fst $ g ^. phylo_groupPeriodParents)
63 ++ (map fst $ g ^. phylo_groupPeriodChilds) ) $ elems groups
64 -- | first find the related components by inside each ego's period
65 graph' = map relatedComponents egos
66 -- | then run it for the all the periods
67 graph = relatedComponents $ concat (graph' `using` parList rdeepseq)
68 -- | update each group's branch id
69 in map (\ids ->
70 -- intervenir ici
71 let groups' = elems $ restrictKeys groups (Set.fromList ids)
72 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
73 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl + 1,bId))) groups') graph
74
75
76 -- toBranchId :: PhyloGroup -> PhyloBranchId
77 -- toBranchId child = ((child ^. phylo_groupLevel) + 1, snd (child ^. phylo_groupBranchId))
78
79 getLastThr :: [PhyloGroup] -> Double
80 getLastThr childs = maximum $ concat $ map (\g -> (g ^. phylo_groupMeta) ! "thr") childs
81
82 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
83 mergeGroups coocs id mapIds childs =
84 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
85 in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
86 (sum $ map _phylo_groupSupport childs) ngrams
87 (ngramsToCooc ngrams coocs)
88 ((snd $ fst id),(mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs))
89 (singleton "thr" [getLastThr childs]) [] (map (\g -> (getGroupId g, 1)) childs)
90 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
91 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
92 where
93 updatePointers :: [Pointer] -> [Pointer]
94 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
95
96
97 addPhyloLevel :: Level -> Phylo -> Phylo
98 addPhyloLevel lvl phylo =
99 over ( phylo_periods . traverse )
100 (\phyloPrd -> phyloPrd & phylo_periodLevels
101 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo
102
103
104 toNextLevel' :: Phylo -> [PhyloGroup] -> Phylo
105 toNextLevel' phylo groups =
106 let curLvl = getLastLevel phylo
107 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
108 newGroups = concat $ groupsToBranches'
109 $ fromList $ map (\g -> (getGroupId g, g))
110 $ foldlWithKey (\acc id groups' ->
111 -- | 4) create the parent group
112 let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
113 in acc ++ [parent]) []
114 -- | 3) group the current groups by parentId
115 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
116
117 newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
118 in traceSynchronyEnd
119 $ over ( phylo_periods . traverse . phylo_periodLevels . traverse
120 -- | 6) update each period at curLvl + 1
121 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
122 -- | 7) by adding the parents
123 (\phyloLvl ->
124 if member (phyloLvl ^. phylo_levelPeriod) newPeriods
125 then phyloLvl & phylo_levelGroups
126 .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_levelPeriod))
127 else phyloLvl)
128 -- | 2) add the curLvl + 1 phyloLevel to the phylo
129 $ addPhyloLevel (curLvl + 1)
130 -- | 1) update the current groups (with level parent pointers) in the phylo
131 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
132
133 --------------------
134 -- | Clustering | --
135 --------------------
136
137 toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
138 toPairs strategy groups = case strategy of
139 MergeRegularGroups -> pairs
140 $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
141 MergeAllGroups -> pairs groups
142 where
143 pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
144 pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
145
146
147 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
148 toDiamonds groups = foldl' (\acc groups' ->
149 acc ++ ( elems
150 $ Map.filter (\v -> length v > 1)
151 $ fromListWith (++)
152 $ foldl' (\acc' g ->
153 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
154 $ elems
155 $ Map.filter (\v -> length v > 1)
156 $ fromListWith (++)
157 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
158
159
160 groupsToEdges :: Proximity -> Synchrony -> Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
161 groupsToEdges prox sync docs groups =
162 case sync of
163 ByProximityThreshold thr sens _ strat ->
164 filter (\(_,w) -> w >= thr)
165 $ toEdges sens
166 $ toPairs strat groups
167 ByProximityDistribution sens strat ->
168 let diamonds = sortOn snd
169 $ toEdges sens $ concat
170 $ map (\gs -> toPairs strat gs) $ toDiamonds groups
171 in take (div (length diamonds) 2) diamonds
172 where
173 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
174 toEdges sens edges =
175 case prox of
176 WeightedLogJaccard _ _ _ -> map (\(g,g') ->
177 ((g,g'), weightedLogJaccard sens docs
178 (g ^. phylo_groupCooc) (g' ^. phylo_groupCooc)
179 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
180 _ -> undefined
181
182
183
184 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
185 toRelatedComponents nodes edges =
186 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
187 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
188 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
189
190 toParentId :: PhyloGroup -> PhyloGroupId
191 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
192
193
194 reduceGroups :: Proximity -> Synchrony -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
195 reduceGroups prox sync docs branch =
196 -- | 1) reduce a branch as a set of periods & groups
197 let periods = fromListWith (++)
198 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
199 in (concat . concat . elems)
200 $ mapWithKey (\prd groups ->
201 -- | 2) for each period, transform the groups as a proximity graph filtered by a threshold
202 let edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) groups
203 in map (\comp ->
204 -- | 4) add to each groups their futur level parent group
205 let parentId = toParentId (head' "parentId" comp)
206 in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
207 -- |3) reduce the graph a a set of related components
208 $ toRelatedComponents groups edges) periods
209
210
211 getGroupRealBId :: Double -> PhyloGroup -> [Int]
212 getGroupRealBId step g =
213 let nb = round(getGroupThr g / step) + 2
214 in take nb (snd $ g ^. phylo_groupBranchId)
215
216
217
218 adjustClustering :: Synchrony -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
219 adjustClustering sync step branches = case sync of
220 ByProximityThreshold _ _ scope _ ->
221 case scope of
222 SingleBranch -> branches
223 SiblingBranches -> groupBy (\g g' -> (init $ getGroupRealBId step g) == (init $ getGroupRealBId step g'))
224 $ sortOn _phylo_groupBranchId $ concat branches
225 -- SiblingBranches -> elems $ fromListWith (++) $ map (\b -> ((init . snd . _phylo_groupBranchId) $ head' "adjustClustering" b,b)) branches
226 AllBranches -> [concat branches]
227 ByProximityDistribution _ _ -> branches
228
229
230
231 synchronicClustering :: Phylo -> Phylo
232 synchronicClustering phylo =
233 let prox = phyloProximity $ getConfig phylo
234 sync = phyloSynchrony $ getConfig phylo
235 docs = phylo ^. phylo_timeDocs
236 newBranches = map (\branch -> reduceGroups prox sync docs branch)
237 $ map processDynamics
238 $ adjustClustering sync (getPhyloThresholdStep phylo)
239 $ phyloToLastBranches
240 $ traceSynchronyStart phylo
241 newBranches' = newBranches `using` parList rdeepseq
242 in toNextLevel' phylo $ concat newBranches'
243
244
245 ----------------
246 -- | probes | --
247 ----------------
248
249 -- synchronicDistance :: Phylo -> Level -> String
250 -- synchronicDistance phylo lvl =
251 -- foldl' (\acc branch ->
252 -- acc <> (foldl' (\acc' period ->
253 -- acc' <> let prox = phyloProximity $ getConfig phylo
254 -- sync = phyloSynchrony $ getConfig phylo
255 -- docs = _phylo_timeDocs phylo
256 -- prd = _phylo_groupPeriod $ head' "distance" period
257 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
258 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
259 -- in foldl' (\mem (_,w) ->
260 -- mem <> show (prd)
261 -- <> "\t"
262 -- <> show (w)
263 -- <> "\n"
264 -- ) "" edges
265 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
266 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo