]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/SynchronicClustering.hs
[FIX] merge dev-phylo and dev
[gargantext.git] / src / Gargantext / Viz / Phylo / SynchronicClustering.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Viz.Phylo.SynchronicClustering where
13
14 import Gargantext.Prelude
15 import Gargantext.Viz.AdaptativePhylo
16 import Gargantext.Viz.Phylo.PhyloTools
17 import Gargantext.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
18 import Gargantext.Viz.Phylo.PhyloExport (processDynamics)
19
20 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, all, groupBy, group, maximum)
21 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
22 import Data.Text (Text)
23
24 import Control.Lens hiding (Level)
25 import Control.Parallel.Strategies (parList, rdeepseq, using)
26 -- import Debug.Trace (trace)
27
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30
31
32 -------------------------
33 -- | New Level Maker | --
34 -------------------------
35
-- | Elect one branch id out of a list of candidate branch ids.
--   Only the candidates with the highest number of occurrences are kept;
--   among those, the smallest one (in list ordering) wins.
--   The result is extracted with @head'@, so an empty candidate list is
--   presumably an error (depends on @head'@, defined elsewhere).
mergeBranchIds :: [[Int]] -> [Int]
mergeBranchIds ids = head' "mergeBranchIds" $ sort $ mostFreq' ids
  where
    -- 2) find the most Up Left ids in the hierarchy of similarity (disabled)
    -- mostUpLeft :: [[Int]] -> [[Int]]
    -- mostUpLeft ids' =
    --      let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
    --          inf      = (fst . minimum) groupIds
    --       in map snd $ filter (\gIds -> fst gIds == inf) groupIds
    -- 1) find the most frequent ids
    mostFreq' :: [[Int]] -> [[Int]]
    mostFreq' ids' = map snd $ filter (\(occurrences, _) -> occurrences == best) counted
      where
        -- one (number of occurrences, id) pair per distinct id
        counted = map (\same -> (length same, head' "gIds" same)) $ group $ sort ids'
        -- highest number of occurrences; only evaluated when counted is non-empty
        best    = fst $ maximum counted
51
52
-- | Build the meta map of a merged group: the "breaks" and "seaLevels"
--   entries are copied from the first group of the list whose branch id
--   (second component of 'phylo_groupBranchId') equals @bId@.
--   Both keys are read with '!', so they must exist in that group's meta.
mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
mergeMeta bId groups = fromList $ map (\key -> (key, egoMeta ! key)) ["breaks", "seaLevels"]
  where
    -- the reference group carrying the elected branch id
    ego     = head' "mergeMeta" $ filter (\g -> snd (g ^. phylo_groupBranchId) == bId) groups
    egoMeta = ego ^. phylo_groupMeta
57
58
-- | Split a set of groups into branches and give every member of a branch
--   the same branch id.
--   Each group contributes the list made of its own id followed by the ids
--   targeted by its period parents and period childs; these lists are handed
--   to @relatedComponents@ (defined elsewhere, presumably a connected
--   components computation). For every returned list of ids, the matching
--   groups receive the branch id elected by 'mergeBranchIds'; the first
--   component of their previous branch id is kept.
groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
groupsToBranches' groups = map relabel components
  where
    -- a group's id together with the ids of its temporal neighbours
    neighbourhood :: PhyloGroup -> [PhyloGroupId]
    neighbourhood g =
      getGroupId g
        : map fst (g ^. phylo_groupPeriodParents)
       ++ map fst (g ^. phylo_groupPeriodChilds)
    -- run the related component algorithm
    components :: [[PhyloGroupId]]
    components = relatedComponents $ map neighbourhood $ elems groups
    -- update each group's branch id
    relabel :: [PhyloGroupId] -> [PhyloGroup]
    relabel ids =
      let members = elems $ restrictKeys groups (Set.fromList ids)
          bId     = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) members
       in map (\g -> g & phylo_groupBranchId %~ (\(lvl, _) -> (lvl, bId))) members
71
72
-- | Merge several child groups into the single parent group identified by @id@.
--   The parent is built positionally with the 'PhyloGroup' constructor from:
--   the period, level and index found in @id@; an empty label; the sum of the
--   childs' supports; the sorted, duplicate-free union of their ngrams; the
--   cooc computed by @ngramsToCooc@ on these ngrams; a branch id elected by
--   'mergeBranchIds'; the meta built by 'mergeMeta'; an empty list; one
--   pointer of weight 1 towards each child; and the childs' period parent and
--   period child pointers, re-targeted through @mapIds@.
--   Every pointer target must be a key of @mapIds@ (looked up with '!').
mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
mergeGroups coocs id mapIds childs =
  PhyloGroup prd lvl idx ""
             (sum $ map _phylo_groupSupport childs) ngrams
             (ngramsToCooc ngrams coocs)
             (lvl, bId)
             (mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
             (retarget _phylo_groupPeriodParents)
             (retarget _phylo_groupPeriodChilds)
  where
    --------------------
    -- components of the parent id: ((period, level), index)
    prd = fst (fst id)
    lvl = snd (fst id)
    idx = snd id
    --------------------
    ngrams :: [Int]
    ngrams = sort $ nub $ concat $ map _phylo_groupNgrams childs
    --------------------
    bId :: [Int]
    bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
    --------------------
    -- gather one kind of pointers over all the childs and translate every
    -- target id through mapIds, keeping the weights
    retarget :: (PhyloGroup -> [Pointer]) -> [Pointer]
    retarget pointersOf = map (\(pId, w) -> (mapIds ! pId, w)) $ concat $ map pointersOf childs
90
91
-- | Insert, in every period of the phylo, a 'PhyloLevel' for the level @lvl@
--   created with an empty third argument. The level is stored under the key
--   @(period, lvl)@; an entry already present at that key is replaced
--   ('insert' semantics).
addPhyloLevel :: Level -> Phylo -> Phylo
addPhyloLevel lvl phylo = phylo & phylo_periods . traverse %~ withNewLevel
  where
    withNewLevel phyloPrd =
      let prd = phyloPrd ^. phylo_periodPeriod
       in over phylo_periodLevels (insert (prd, lvl) (PhyloLevel prd lvl empty)) phyloPrd
97
98
-- | Add the level @curLvl + 1@ to the phylo, where @curLvl@ is the value
--   returned by @getLastLevel@. The given groups are expected to already
--   carry their level parent id (read through @getLevelParentId@).
--   The numbered comments below give the order in which the steps apply.
toNextLevel' :: Phylo -> [PhyloGroup] -> Phylo
toNextLevel' phylo groups =
  traceSynchronyEnd
    -- 6) update each period at curLvl + 1, 7) by adding the parents
    $ over (phylo_periods . traverse . phylo_periodLevels . traverse . filtered atNextLvl) addParents
    -- 2) add the curLvl + 1 phyloLevel to the phylo
    $ addPhyloLevel nextLvl
    -- 1) update the current groups (with level parent pointers) in the phylo
    $ updatePhyloGroups curLvl (indexById groups) phylo
  where
    curLvl  = getLastLevel phylo
    nextLvl = curLvl + 1
    -- index a list of groups by their own id
    indexById :: [PhyloGroup] -> Map PhyloGroupId PhyloGroup
    indexById = fromList . map (\g -> (getGroupId g, g))
    -- maps each current group id to the id of its level parent
    oldGroups :: Map PhyloGroupId PhyloGroupId
    oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
    -- 3) group the current groups by parentId
    byParentId = fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
    -- 4) create the parent group, using the coocs of the years of its period
    toParent pId members =
      mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) pId])
                  pId oldGroups members
    -- 5) presumably: recompute the branches of the freshly created parents
    newGroups = concat $ groupsToBranches' $ indexById
              $ foldlWithKey (\acc pId members -> acc ++ [toParent pId members]) [] byParentId
    newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
    atNextLvl phyloLvl = phyloLvl ^. phylo_levelLevel == nextLvl
    -- a level whose period received no parent is left untouched
    addParents phyloLvl =
      let prd = phyloLvl ^. phylo_levelPeriod
       in if member prd newPeriods
            then phyloLvl & phylo_levelGroups .~ indexById (newPeriods ! prd)
            else phyloLvl
127
128 --------------------
129 -- | Clustering | --
130 --------------------
131
-- | List the candidate pairs of groups to be compared.
--   With 'MergeRegularGroups' only the groups whose "dynamics" meta values
--   are all equal to 3 are considered; with 'MergeAllGroups' every group is.
--   Combinations come from @listToCombi'@ (defined elsewhere) and a pair is
--   kept only when the two groups share at least one ngram.
toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
toPairs strategy groups = pairs candidates
  where
    candidates :: [PhyloGroup]
    candidates = case strategy of
      MergeRegularGroups -> filter isRegular groups
      MergeAllGroups     -> groups
    -- the "dynamics" key must be present in the group's meta ('!' lookup)
    isRegular g = all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics"
    sharesNgrams (g, g') = not $ null $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)
    pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
    pairs gs = filter sharesNgrams (listToCombi' gs)
140
141
-- | Collect the sets of groups that share both a period parent and a period
--   child. Groups are first bucketed by the id of each of their period
--   parents; every bucket holding more than one group is then bucketed again
--   by period child id, and the buckets holding more than one group are
--   returned. A group with several parents or childs may appear in several
--   of the returned lists.
--
--   The pairs fed to 'fromListWith' are gathered with 'concat' over 'map'
--   rather than by growing an accumulator with left-nested @acc ++ ...@,
--   which re-walked the accumulated list at every step; the produced lists
--   and their order are the same.
toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
toDiamonds groups =
  concat
    $ map (sharing (\g -> g ^. phylo_groupPeriodChilds))
    $ sharing (\g -> g ^. phylo_groupPeriodParents) groups
  where
    -- bucket the groups by the ids their pointers target and keep the
    -- buckets containing more than one group
    sharing :: (PhyloGroup -> [Pointer]) -> [PhyloGroup] -> [[PhyloGroup]]
    sharing pointersOf gs =
      elems
        $ Map.filter (\v -> length v > 1)
        $ fromListWith (++)
        $ concat
        $ map (\g -> map (\(pId, _) -> (pId, [g])) $ pointersOf g) gs
153
154
-- | Turn the groups of a period into weighted edges between pairs of groups.
--
--   * 'ByProximityThreshold': the pairs given by 'toPairs' are weighted and
--     only the edges whose weight is at least the threshold are kept.
--   * 'ByProximityDistribution': the pairs are taken inside each set returned
--     by 'toDiamonds', weighted, sorted by increasing weight, and the first
--     half (integer division of the number of edges by 2) is returned.
--
--   Weights are computed by @weightedLogJaccard'@ on the ngrams of both groups.
--   NOTE(review): any proximity other than 'WeightedLogJaccard' ends in a bare
--   'undefined' as soon as the edges are evaluated.
groupsToEdges :: Proximity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
groupsToEdges prox sync nbDocs diago groups =
  case sync of
    ByProximityThreshold thr sens _ strat ->
      filter ((>= thr) . snd) (weigh sens (toPairs strat groups))
    ByProximityDistribution sens strat ->
      let ranked = sortOn snd $ weigh sens $ concat $ map (toPairs strat) (toDiamonds groups)
       in take (length ranked `div` 2) ranked
  where
    -- attach to every pair the proximity of its two groups
    weigh :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
    weigh sens candidates =
      case prox of
        WeightedLogJaccard _ ->
          map (\pair@(g, g') ->
                 ( pair
                 , weightedLogJaccard' sens nbDocs diago
                     (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams) ))
              candidates
        _ -> undefined
175
-- | Id of the level parent of a group: same period and same index as the
--   child, one level above.
toParentId :: PhyloGroup -> PhyloGroupId
toParentId child = ((prd, lvl + 1), idx)
  where
    prd = child ^. phylo_groupPeriod
    lvl = child ^. phylo_groupLevel
    idx = child ^. phylo_groupIndex
178
179
-- | Cluster the groups of one branch, period by period, and record on every
--   group a pointer of weight 1 towards its future level parent.
--   The parent id of a cluster is derived by 'toParentId' from the first
--   group of that cluster.
reduceGroups :: Proximity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
reduceGroups prox sync docs diagos branch =
  concat $ concat $ elems $ mapWithKey clusterPeriod byPeriod
  where
    -- 1) reduce a branch as a set of periods & groups
    byPeriod = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) branch
    -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
    clusterPeriod prd groups =
      let diago  = reduceDiagos $ filterDiago diagos [prd]
          -- number of docs summed over the years of the period
          nbDocs = sum $ elems $ restrictKeys docs $ periodsToYears [prd]
          edges  = groupsToEdges prox sync nbDocs diago groups
       -- 3) reduce the graph as a set of related components
       in map attachParent $ toRelatedComponents groups edges
    -- 4) add to each group its future level parent group
    attachParent :: [PhyloGroup] -> [PhyloGroup]
    attachParent comp =
      let parentId = toParentId (head' "parentId" comp)
       in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId, 1)])) comp
196
197
-- | Choose which groups are clustered together, depending on the scope of
--   the synchrony:
--
--   * 'ByProximityDistribution', or a threshold with 'SingleBranch': the
--     branches are returned unchanged;
--   * 'AllBranches': all the groups are put in one single list;
--   * 'SiblingBranches': all the groups are sorted by branch id, then
--     consecutive groups having the same last "breaks" meta value are put
--     together.
--
--   NOTE(review): 'groupBy' only merges adjacent elements, while the sort key
--   is the branch id and not the compared "breaks" value — confirm that
--   sibling branches always end up next to each other.
adjustClustering :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
adjustClustering sync branches = case sync of
  ByProximityDistribution _ _              -> branches
  ByProximityThreshold _ _ SingleBranch _  -> branches
  ByProximityThreshold _ _ AllBranches _   -> [concat branches]
  ByProximityThreshold _ _ SiblingBranches _ ->
    groupBy sameLastBreak $ sortOn _phylo_groupBranchId $ concat branches
  where
    -- last value stored under the "breaks" key of a group's meta
    lastBreak g = last' "adjustClustering" $ (g ^. phylo_groupMeta) ! "breaks"
    sameLastBreak g g' = lastBreak g == lastBreak g'
207
208
209
-- | Run one step of synchronic clustering on a phylo.
--   The branches returned by @phyloToLastBranches@ are regrouped by
--   'adjustClustering', passed through @processDynamics@ and reduced by
--   'reduceGroups', using the proximity and synchrony found in the phylo's
--   config. The reduced branches are evaluated with the @parList rdeepseq@
--   strategy, then handed to 'toNextLevel''.
synchronicClustering :: Phylo -> Phylo
synchronicClustering phylo = toNextLevel' phylo (concat reducedBranches)
  where
    cfg  = getConfig phylo
    sync = phyloSynchrony cfg
    -- reduceGroups specialised to this phylo's proximity, docs and diagonals
    reduceBranch =
      reduceGroups (phyloProximity cfg) sync
                   (phylo ^. phylo_timeDocs)
                   (map coocToDiago $ phylo ^. phylo_timeCooc)
    reducedBranches =
      ( map (reduceBranch . processDynamics)
          $ adjustClustering sync
          $ phyloToLastBranches
          $ traceSynchronyStart phylo
      ) `using` parList rdeepseq
223
224
225 -- synchronicDistance :: Phylo -> Level -> String
226 -- synchronicDistance phylo lvl =
227 -- foldl' (\acc branch ->
228 -- acc <> (foldl' (\acc' period ->
229 -- acc' <> let prox = phyloProximity $ getConfig phylo
230 -- sync = phyloSynchrony $ getConfig phylo
231 -- docs = _phylo_timeDocs phylo
232 -- prd = _phylo_groupPeriod $ head' "distance" period
233 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
234 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
235 -- in foldl' (\mem (_,w) ->
236 -- mem <> show (prd)
237 -- <> "\t"
238 -- <> show (w)
239 -- <> "\n"
240 -- ) "" edges
241 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
242 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo