]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/SynchronicClustering.hs
Merge branch 'dev-corpus-add-file' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell...
[gargantext.git] / src / Gargantext / Viz / Phylo / SynchronicClustering.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.SynchronicClustering
3 Description : Module dedicated to the adaptative synchronic clustering of a Phylo.
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.SynchronicClustering where
17
18 import Gargantext.Prelude
19 import Gargantext.Viz.AdaptativePhylo
20 import Gargantext.Viz.Phylo.PhyloTools
21 import Gargantext.Viz.Phylo.TemporalMatching (weightedLogJaccard)
22
23 import Data.List ((++), null, intersect, nub, concat, sort, sortOn)
24 import Data.Map (Map, fromList, fromListWith, foldlWithKey, (!), insert, empty, restrictKeys, elems, mapWithKey, member)
25
26 import Control.Lens hiding (Level)
27 import Control.Parallel.Strategies (parList, rdeepseq, using)
28
29 import qualified Data.Map as Map
30
31
32 -------------------------
33 -- | New Level Maker | --
34 -------------------------
35
36 toBranchId :: PhyloGroup -> PhyloBranchId
37 toBranchId child = ((child ^. phylo_groupLevel) + 1, snd (child ^. phylo_groupBranchId))
38
39 mergeGroups :: [Cooc] -> PhyloGroupId -> Map PhyloGroupId PhyloGroupId -> [PhyloGroup] -> PhyloGroup
40 mergeGroups coocs id mapIds childs =
41 let ngrams = (sort . nub . concat) $ map _phylo_groupNgrams childs
42 in PhyloGroup (fst $ fst id) (snd $ fst id) (snd id) ""
43 (sum $ map _phylo_groupSupport childs) ngrams
44 (ngramsToCooc ngrams coocs) (toBranchId (head' "mergeGroups" childs))
45 empty [] (map (\g -> (getGroupId g, 1)) childs)
46 (updatePointers $ concat $ map _phylo_groupPeriodParents childs)
47 (updatePointers $ concat $ map _phylo_groupPeriodChilds childs)
48 where
49 updatePointers :: [Pointer] -> [Pointer]
50 updatePointers pointers = map (\(pId,w) -> (mapIds ! pId,w)) pointers
51
52
53 addPhyloLevel :: Level -> Phylo -> Phylo
54 addPhyloLevel lvl phylo =
55 over ( phylo_periods . traverse )
56 (\phyloPrd -> phyloPrd & phylo_periodLevels
57 %~ (insert (phyloPrd ^. phylo_periodPeriod, lvl) (PhyloLevel (phyloPrd ^. phylo_periodPeriod) lvl empty))) phylo
58
59
60 toNextLevel :: Phylo -> [PhyloGroup] -> Phylo
61 toNextLevel phylo groups =
62 let curLvl = getLastLevel phylo
63 oldGroups = fromList $ map (\g -> (getGroupId g, getLevelParentId g)) groups
64 newGroups = fromListWith (++)
65 -- | 5) group the parents by periods
66 $ foldlWithKey (\acc id groups' ->
67 -- | 4) create the parent group
68 let parent = mergeGroups (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [(fst . fst) id]) id oldGroups groups'
69 in acc ++ [(parent ^. phylo_groupPeriod, [parent])]) []
70 -- | 3) group the current groups by parentId
71 $ fromListWith (++) $ map (\g -> (getLevelParentId g, [g])) groups
72 in traceSynchronyEnd
73 $ over ( phylo_periods . traverse . phylo_periodLevels . traverse
74 -- | 6) update each period at curLvl + 1
75 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == (curLvl + 1)))
76 -- | 7) by adding the parents
77 (\phyloLvl ->
78 if member (phyloLvl ^. phylo_levelPeriod) newGroups
79 then phyloLvl & phylo_levelGroups
80 .~ fromList (map (\g -> (getGroupId g, g)) $ newGroups ! (phyloLvl ^. phylo_levelPeriod))
81 else phyloLvl)
82 -- | 2) add the curLvl + 1 phyloLevel to the phylo
83 $ addPhyloLevel (curLvl + 1)
84 -- | 1) update the current groups (with level parent pointers) in the phylo
85 $ updatePhyloGroups curLvl (fromList $ map (\g -> (getGroupId g, g)) groups) phylo
86
87
88 --------------------
89 -- | Clustering | --
90 --------------------
91
92
93 toPairs :: [PhyloGroup] -> [(PhyloGroup,PhyloGroup)]
94 toPairs groups = filter (\(g,g') -> (not . null) $ intersect (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))
95 $ listToCombi' groups
96
97
98 toDiamonds :: [PhyloGroup] -> [[PhyloGroup]]
99 toDiamonds groups = foldl' (\acc groups' ->
100 acc ++ ( elems
101 $ Map.filter (\v -> length v > 1)
102 $ fromListWith (++)
103 $ foldl' (\acc' g ->
104 acc' ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodChilds)) [] groups')) []
105 $ elems
106 $ Map.filter (\v -> length v > 1)
107 $ fromListWith (++)
108 $ foldl' (\acc g -> acc ++ (map (\(id,_) -> (id,[g]) ) $ g ^. phylo_groupPeriodParents) ) [] groups
109
110
111 groupsToEdges :: Proximity -> Synchrony -> Double -> [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)]
112 groupsToEdges prox sync docs groups =
113 case sync of
114 ByProximityThreshold t s -> filter (\(_,w) -> w >= t)
115 $ toEdges s
116 $ toPairs groups
117
118 ByProximityDistribution s ->
119 let diamonds = sortOn snd
120 $ toEdges s $ concat
121 $ map toPairs $ toDiamonds groups
122 in take (div (length diamonds) 2) diamonds
123 where
124 toEdges :: Double -> [(PhyloGroup,PhyloGroup)] -> [((PhyloGroup,PhyloGroup),Double)]
125 toEdges sens edges =
126 case prox of
127 WeightedLogJaccard _ _ _ -> map (\(g,g') ->
128 ((g,g'), weightedLogJaccard sens docs
129 (g ^. phylo_groupCooc) (g' ^. phylo_groupCooc)
130 (g ^. phylo_groupNgrams) (g' ^. phylo_groupNgrams))) edges
131 _ -> undefined
132
133
134
135 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
136 toRelatedComponents nodes edges =
137 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
138 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
139 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
140
141 toParentId :: PhyloGroup -> PhyloGroupId
142 toParentId child = ((child ^. phylo_groupPeriod, child ^. phylo_groupLevel + 1), child ^. phylo_groupIndex)
143
144
145 reduceBranch :: Proximity -> Synchrony -> Map Date Double -> [PhyloGroup] -> [PhyloGroup]
146 reduceBranch prox sync docs branch =
147 -- | 1) reduce a branch as a set of periods & groups
148 let periods = fromListWith (++)
149 $ map (\g -> (g ^. phylo_groupPeriod,[g])) branch
150 in (concat . concat . elems)
151 $ mapWithKey (\prd groups ->
152 -- | 2) for each period, transform the groups as a proximity graph filtered by a threshold
153 let edges = groupsToEdges prox sync ((sum . elems) $ restrictKeys docs $ periodsToYears [prd]) groups
154 in map (\comp ->
155 -- | 4) add to each groups their futur level parent group
156 let parentId = toParentId (head' "parentId" comp)
157 in map (\g -> g & phylo_groupLevelParents %~ (++ [(parentId,1)]) ) comp )
158 -- |3) reduce the graph a a set of related components
159 $ toRelatedComponents groups edges) periods
160
161
162 synchronicClustering :: Phylo -> Phylo
163 synchronicClustering phylo =
164 let prox = phyloProximity $ getConfig phylo
165 sync = phyloSynchrony $ getConfig phylo
166 docs = phylo ^. phylo_timeDocs
167 branches = map (\branch -> reduceBranch prox sync docs branch)
168 $ phyloToLastBranches
169 $ traceSynchronyStart phylo
170 branches' = branches `using` parList rdeepseq
171 in toNextLevel phylo $ concat branches'
172
173
174 ----------------
175 -- | probes | --
176 ----------------
177
178 -- synchronicDistance :: Phylo -> Level -> String
179 -- synchronicDistance phylo lvl =
180 -- foldl' (\acc branch ->
181 -- acc <> (foldl' (\acc' period ->
182 -- acc' <> let prox = phyloProximity $ getConfig phylo
183 -- sync = phyloSynchrony $ getConfig phylo
184 -- docs = _phylo_timeDocs phylo
185 -- prd = _phylo_groupPeriod $ head' "distance" period
186 -- edges = groupsToEdges prox 0.1 (_bpt_sensibility sync)
187 -- ((sum . elems) $ restrictKeys docs $ periodsToYears [_phylo_groupPeriod $ head' "distance" period]) period
188 -- in foldl' (\mem (_,w) ->
189 -- mem <> show (prd)
190 -- <> "\t"
191 -- <> show (w)
192 -- <> "\n"
193 -- ) "" edges
194 -- ) "" $ elems $ groupByField _phylo_groupPeriod branch)
195 -- ) "period\tdistance\n" $ elems $ groupByField _phylo_groupBranchId $ getGroupsFromLevel lvl phylo