]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/SynchronicClustering.hs
Merge branch '104-dev-john-snow-nlp' of ssh://gitlab.iscpif.fr:20022/cgenie/haskell...
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / SynchronicClustering.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.SynchronicClustering where
13
14 -- import Debug.Trace (trace)
15 import Control.Lens hiding (Level)
16 import Control.Monad (sequence)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List ((++), null, intersect, nub, concat, sort, sortOn, groupBy)
19 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
20 import Gargantext.Core.Viz.Phylo
21 import Gargantext.Core.Viz.Phylo.PhyloExport (processDynamics)
22 import Gargantext.Core.Viz.Phylo.PhyloTools
23 import Gargantext.Core.Viz.Phylo.TemporalMatching (weightedLogJaccard', filterDiago, reduceDiagos)
24 import Gargantext.Prelude
25 import qualified Data.Map as Map
26
27
28 -------------------------
29 -- | New Level Maker | --
30 -------------------------
31
32 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
33 mergeGroups coocs id mapIds childs =
34 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
35 in PhyloGroup (fst $ fst id) (_phylo_groupPeriod' $ head' "mergeGroups" childs)
36 (snd $ fst id) (snd id) ""
37 (sum $ map _phylo_groupSupport childs)
38 (fmap sum $ sequence
39 $ map _phylo_groupWeight childs)
40 (concat $ map _phylo_groupSources childs)
41 ngrams
42 (ngramsToCooc ngrams coocs)
43 ((snd $ fst id),bId)
44 (mergeMeta bId childs) [] (map (\g -> (getGroupId g, 1)) childs)
45 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
46 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
47 (mergeAncestors $ concat $ map _phylo_groupAncestors childs)
48 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryParents childs)
49 (updatePointers' $ concat $ map _phylo_groupPeriodMemoryChilds childs)
50 where
51 --------------------
52 bId :: [Int]
53 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) childs
54 --------------------
55 updatePointers :: [Pointer] -> [Pointer]
56 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
57 updatePointers' :: [Pointer'] -> [Pointer']
58 updatePointers' pointers = map (\(pId,(t,w)) -> (mapIds ! pId,(t,w))) pointers
59 --------------------
60 mergeAncestors :: [Pointer] -> [Pointer]
61 mergeAncestors pointers = Map.toList $ fromListWith max pointers
62
63 addPhyloLevel :: Level -> Phylo -> Phylo
64 addPhyloLevel lvl phylo =
65 over ( phylo_periods . traverse )
66 (\phyloPrd -> phyloPrd & phylo_periodLevels
67 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl)
68 (PhyloLevel (phyloPrd ^. phylo_periodPeriod) (phyloPrd ^. phylo_periodPeriod') lvl empty))) phylo
69
70
71 toNextLevel' :: Phylo -> [PhyloGroup] -> Phylo
72 toNextLevel' phylo groups =
73 let curLvl = getLastLevel phylo
74 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
75 newGroups = concat $ groupsToBranches
76 $ fromList $ map (\g -> (getGroupId g, g))
77 $ foldlWithKey (\acc id groups' ->
78 -- 4) create the parent group
79 let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
80 in acc ++ [parent]) []
81 -- 3) group the current groups by parentId
82 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
83
84 newPeriods = fromListWith (++) $ map (\g -> (g ^. phylo_groupPeriod, [g])) newGroups
85 in traceSynchronyEnd
86 $ over ( phylo_periods . traverse . phylo_periodLevels . traverse
87 -- 6) update each period at curLvl + 1
88 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
89 -- 7) by adding the parents
90 (\phyloLvl ->
91 if member (phyloLvl ^. phylo_levelPeriod) newPeriods
92 then phyloLvl & phylo_levelGroups
93 .~ fromList (map (\g -> (getGroupId g, g)) $ newPeriods ! (phyloLvl ^. phylo_levelPeriod))
94 else phyloLvl)
95 -- 2) add the curLvl + 1 phyloLevel to the phylo
96 $ addPhyloLevel (curLvl + 1)
97 -- 1) update the current groups (with level parent pointers) in the phylo
98 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
99
100 --------------------
101 -- | Clustering | --
102 --------------------
103
104 toPairs :: SynchronyStrategy -> [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
105 toPairs strategy groups = case strategy of
106 MergeRegularGroups -> pairs
107 $ filter (\g -> all (== 3) $ (g ^. phylo_groupMeta) ! "dynamics") groups
108 MergeAllGroups -> pairs groups
109 where
110 pairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
111 pairs gs = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams)) (listToCombi' gs)
112
113
114 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
115 toDiamonds groups = foldl' (\acc groups' ->
116 acc ++ ( elems
117 $ Map.filter (\v -> length v > 1)
118 $ fromListWith (++)
119 $ foldl' (\acc' g ->
120 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
121 $ elems
122 $ Map.filter (\v -> length v > 1)
123 $ fromListWith (++)
124 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
125
126
127 groupsToEdges :: Proximity -> Synchrony -> Double -> Map Int Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
128 groupsToEdges prox sync nbDocs diago groups =
129 case sync of
130 ByProximityThreshold thr sens _ strat ->
131 filter (\(_,w) -> w >= thr)
132 $ toEdges sens
133 $ toPairs strat groups
134 ByProximityDistribution sens strat ->
135 let diamonds = sortOn snd
136 $ toEdges sens $ concat
137 $ map (\gs -> toPairs strat gs) $ toDiamonds groups
138 in take (div (length diamonds) 2) diamonds
139 where
140 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
141 toEdges sens edges =
142 case prox of
143 WeightedLogJaccard _ -> map (\(g,g') ->
144 ((g,g'), weightedLogJaccard' (sens) nbDocs diago
145 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
146 WeightedLogSim _ -> map (\(g,g') ->
147 ((g,g'), weightedLogJaccard' (1 / sens) nbDocs diago
148 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
149
150 _ -> undefined
151
152 toParentId :: PhyloGroup -> PhyloGroupId
153 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
154
155
156 reduceGroups :: Proximity -> Synchrony -> Map Date Double -> Map Date Cooc -> [PhyloGroup] -> [PhyloGroup]
157 reduceGroups prox sync docs diagos branch =
158 -- 1) reduce a branch as a set of periods & groups
159 let periods = fromListWith (++)
160 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
161 in (concat . concat . elems)
162 $ mapWithKey (\prd groups ->
163 -- 2) for each period, transform the groups as a proximity graph filtered by a threshold
164 let diago = reduceDiagos $ filterDiago diagos [prd]
165 edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) diago groups
166 in map (\comp ->
167 -- 4) add to each groups their futur level parent group
168 let parentId = toParentId (head' "parentId" comp)
169 in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
170 -- 3) reduce the graph a a set of related components
171 $ toRelatedComponents groups edges) periods
172
173
174 adjustClustering :: Synchrony -> [[PhyloGroup]] -> [[PhyloGroup]]
175 adjustClustering sync branches = case sync of
176 ByProximityThreshold _ _ scope _ -> case scope of
177 SingleBranch -> branches
178 SiblingBranches -> groupBy (\g g' -> (last' "adjustClustering" $ (g ^. phylo_groupMeta) ! "breaks")
179 == (last' "adjustClustering" $ (g' ^. phylo_groupMeta) ! "breaks"))
180 $ sortOn _phylo_groupBranchId $ concat branches
181 AllBranches -> [concat branches]
182 ByProximityDistribution _ _ -> branches
183
184
185 levelUpAncestors :: [PhyloGroup] -> [PhyloGroup]
186 levelUpAncestors groups =
187 -- 1) create an associative map of (old,new) ids
188 let ids' = fromList $ map (\g -> (getGroupId g, fst $ head' "levelUpAncestors" ( g ^. phylo_groupLevelParents))) groups
189 in map (\g ->
190 let id' = ids' ! (getGroupId g)
191 ancestors = g ^. phylo_groupAncestors
192 -- 2) level up the ancestors ids and filter the ones that will be merged
193 ancestors' = filter (\(id,_) -> id /= id') $ map (\(id,w) -> (ids' ! id,w)) ancestors
194 in g & phylo_groupAncestors .~ ancestors'
195 ) groups
196
197 synchronicClustering :: Phylo -> Phylo
198 synchronicClustering phylo =
199 let prox = phyloProximity $ getConfig phylo
200 sync = phyloSynchrony $ getConfig phylo
201 docs = phylo ^. phylo_timeDocs
202 diagos = map coocToDiago $ phylo ^. phylo_timeCooc
203 newBranches = map (\branch -> reduceGroups prox sync docs diagos branch)
204 $ map processDynamics
205 $ adjustClustering sync
206 $ phyloToLastBranches
207 $ traceSynchronyStart phylo
208 newBranches' = newBranches `using` parList rdeepseq
209 in toNextLevel' phylo $ levelUpAncestors $ concat newBranches'
210
211
212 -- synchronicDistance :: Phylo -> Level -> String
213 -- synchronicDistance phylo lvl =
214 -- foldl' (\acc branch ->
215 -- acc <> (foldl' (\acc' period ->
216 -- acc' <> let prox = phyloProximity $ getConfig phylo
217 -- sync = phyloSynchrony $ getConfig phylo
218 -- docs = _phylo_timeDocs phylo
219 -- prd = _phylo_groupPeriod $ head' "distance" period
220 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
221 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
222 -- in foldl' (\mem (_,w) ->
223 -- mem <> show (prd)
224 -- <> "\t"
225 -- <> show (w)
226 -- <> "\n"
227 -- ) "" edges
228 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
229 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo