]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/SynchronicClustering.hs
[FEAT] Distributional (WIP)
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / SynchronicClustering.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.SynchronicClustering where
13
14 import Gargantext.Prelude
15 import Gargantext.Core.Viz.AdaptativePhylo
16 import Gargantext.Core.Viz.Phylo.PhyloTools
17 import Gargantext.Core.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
18 import Gargantext.Core.Viz.Phylo.PhyloExport (processDynamics)
19
20 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, groupBy)
21 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
22
23 import Control.Lens hiding (Level)
24 import Control.Parallel.Strategies (parList, rdeepseq, using)
25 -- import Debug.Trace (trace)
26
27 import qualified Data.Map as Map
28
29
30 -------------------------
31 -- | New Level Maker | --
32 -------------------------
33
34
35 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
36 mergeGroups coocs id mapIds childs =
37 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
38 in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
39 (sum $ map _phylo_groupSupport childs) ngrams
40 (ngramsToCooc ngrams coocs)
41 ((snd $ fst id),bId)
42 (mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
43 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
44 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
45 []
46 where
47 --------------------
48 bId :: [Int]
49 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
50 --------------------
51 updatePointers :: [Pointer] -> [Pointer]
52 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
53
54
55 addPhyloLevel :: Level -> Phylo -> Phylo
56 addPhyloLevel lvl phylo =
57 over ( phylo_periods . traverse )
58 (\phyloPrd -> phyloPrd & phylo_periodLevels
59 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo
60
61
62 toNextLevel' :: Phylo -> [PhyloGroup] -> Phylo
63 toNextLevel' phylo groups =
64 let curLvl = getLastLevel phylo
65 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
66 newGroups = concat $ groupsToBranches
67 $ fromList $ map (\g -> (getGroupId g, g))
68 $ foldlWithKey (\acc id groups' ->
69 -- 4) create the parent group
70 let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
71 in acc ++ [parent]) []
72 -- 3) group the current groups by parentId
73 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
74
75 newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
76 in traceSynchronyEnd
77 $ over ( phylo_periods . traverse . phylo_periodLevels . traverse
78 -- 6) update each period at curLvl + 1
79 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
80 -- 7) by adding the parents
81 (\phyloLvl ->
82 if member (phyloLvl ^. phylo_levelPeriod) newPeriods
83 then phyloLvl & phylo_levelGroups
84 .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_levelPeriod))
85 else phyloLvl)
86 -- 2) add the curLvl + 1 phyloLevel to the phylo
87 $ addPhyloLevel (curLvl + 1)
88 -- 1) update the current groups (with level parent pointers) in the phylo
89 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
90
91 --------------------
92 -- | Clustering | --
93 --------------------
94
95 toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
96 toPairs strategy groups = case strategy of
97 MergeRegularGroups -> pairs
98 $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
99 MergeAllGroups -> pairs groups
100 where
101 pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
102 pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
103
104
105 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
106 toDiamonds groups = foldl' (\acc groups' ->
107 acc ++ ( elems
108 $ Map.filter (\v -> length v > 1)
109 $ fromListWith (++)
110 $ foldl' (\acc' g ->
111 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
112 $ elems
113 $ Map.filter (\v -> length v > 1)
114 $ fromListWith (++)
115 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
116
117
118 groupsToEdges :: Proximity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
119 groupsToEdges prox sync nbDocs diago groups =
120 case sync of
121 ByProximityThreshold thr sens _ strat ->
122 filter (\(_,w) -> w >= thr)
123 $ toEdges sens
124 $ toPairs strat groups
125 ByProximityDistribution sens strat ->
126 let diamonds = sortOn snd
127 $ toEdges sens $ concat
128 $ map (\gs -> toPairs strat gs) $ toDiamonds groups
129 in take (div (length diamonds) 2) diamonds
130 where
131 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
132 toEdges sens edges =
133 case prox of
134 WeightedLogJaccard _ -> map (\(g,g') ->
135 ((g,g'), weightedLogJaccard' sens nbDocs diago
136 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
137 _ -> undefined
138
139 toParentId :: PhyloGroup -> PhyloGroupId
140 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
141
142
143 reduceGroups :: Proximity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
144 reduceGroups prox sync docs diagos branch =
145 -- 1) reduce a branch as a set of periods & groups
146 let periods = fromListWith (++)
147 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
148 in (concat . concat . elems)
149 $ mapWithKey (\prd groups ->
150 -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
151 let diago = reduceDiagos $ filterDiago diagos [prd]
152 edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
153 in map (\comp ->
154 -- 4) add to each groups their futur level parent group
155 let parentId = toParentId (head' "parentId" comp)
156 in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
157 -- 3) reduce the graph a a set of related components
158 $ toRelatedComponents groups edges) periods
159
160
161 adjustClustering :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
162 adjustClustering sync branches = case sync of
163 ByProximityThreshold _ _ scope _ -> case scope of
164 SingleBranch -> branches
165 SiblingBranches -> groupBy (\g g' -> (last' "adjustClustering" $ (g ^. phylo_groupMeta) ! "breaks")
166 == (last' "adjustClustering" $ (g' ^. phylo_groupMeta) ! "breaks"))
167 $ sortOn _phylo_groupBranchId $ concat branches
168 AllBranches -> [concat branches]
169 ByProximityDistribution _ _ -> branches
170
171
172
173 synchronicClustering :: Phylo -> Phylo
174 synchronicClustering phylo =
175 let prox = phyloProximity $ getConfig phylo
176 sync = phyloSynchrony $ getConfig phylo
177 docs = phylo ^. phylo_timeDocs
178 diagos = map coocToDiago $ phylo ^. phylo_timeCooc
179 newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
180 $ map processDynamics
181 $ adjustClustering sync
182 $ phyloToLastBranches
183 $ traceSynchronyStart phylo
184 newBranches' = newBranches `using` parList rdeepseq
185 in toNextLevel' phylo $ concat newBranches'
186
187
188 -- synchronicDistance :: Phylo -> Level -> String
189 -- synchronicDistance phylo lvl =
190 -- foldl' (\acc branch ->
191 -- acc <> (foldl' (\acc' period ->
192 -- acc' <> let prox = phyloProximity $ getConfig phylo
193 -- sync = phyloSynchrony $ getConfig phylo
194 -- docs = _phylo_timeDocs phylo
195 -- prd = _phylo_groupPeriod $ head' "distance" period
196 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
197 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
198 -- in foldl' (\mem (_,w) ->
199 -- mem <> show (prd)
200 -- <> "\t"
201 -- <> show (w)
202 -- <> "\n"
203 -- ) "" edges
204 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
205 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo