]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
[REFACT] Group fun and types
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
17
18 import Gargantext.Prelude
19 import Gargantext.Core.Viz.AdaptativePhylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Core.Text.Context (TermList)
24 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Core.Viz.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Viz.Graph.Distances (Distance(Conditional))
27
28 import Control.DeepSeq (NFData)
29 import Control.Parallel.Strategies (parList, rdeepseq, using)
30 import Debug.Trace (trace)
31 import Control.Lens hiding (Level)
32
33 import qualified Data.Vector as Vector
34 import qualified Data.Set as Set
35
36 ------------------
37 -- | To Phylo | --
38 ------------------
39
40
41 toPhylo :: [Document] -> TermList -> Config -> Phylo
42 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
43 $ traceToPhylo (phyloLevel conf) $
44 if (phyloLevel conf) > 1
45 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
46 else phylo1
47 where
48 --------------------------------------
49 phylo1 :: Phylo
50 phylo1 = toPhylo1 docs phyloBase
51 --------------------------------------
52 phyloBase :: Phylo
53 phyloBase = toPhyloBase docs lst conf
54 --------------------------------------
55
56
57
58 --------------------
59 -- | To Phylo 1 | --
60 --------------------
61
62 toGroupsProxi :: Level -> Phylo -> Phylo
63 toGroupsProxi lvl phylo =
64 let proximity = phyloProximity $ getConfig phylo
65 groupsProxi = foldlWithKey (\acc pId pds ->
66 -- 1) process period by period
67 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
68 $ elems
69 $ view ( phylo_periodLevels
70 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
71 . phylo_levelGroups ) pds
72 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
73 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
74 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
75 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
76 -- 2) compute the pairs in parallel
77 pairs = map (\(id,ngrams) ->
78 map (\(id',ngrams') ->
79 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
80 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
81 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
82 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
83 ) egos
84 pairs' = pairs `using` parList rdeepseq
85 in acc ++ (concat pairs')
86 ) [] $ phylo ^. phylo_periods
87 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
88
89
90 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
91 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
92 $ over ( phylo_periods
93 . traverse
94 . phylo_periodLevels
95 . traverse)
96 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
97 then
98 let pId = phyloLvl ^. phylo_levelPeriod
99 phyloCUnit = m ! pId
100 in phyloLvl
101 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
102 groups ++ [ (((pId,lvl),length groups)
103 , f obj pId lvl (length groups)
104 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
105 ] ) [] phyloCUnit)
106 else
107 phyloLvl )
108 phylo
109
110
111 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
112 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
113 (fis ^. phyloClique_support)
114 (fis ^. phyloClique_nodes)
115 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
116 (1,[0]) -- branchid (lvl,[path in the branching tree])
117 (fromList [("breaks",[0]),("seaLevels",[0])])
118 [] [] [] [] []
119
120
121 toPhylo1 :: [Document] -> Phylo -> Phylo
122 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
123 Constante start gap -> constanteTemporalMatching start gap
124 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
125 Adaptative steps -> adaptativeTemporalMatching steps
126 $ toGroupsProxi 1
127 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
128 where
129 --------------------------------------
130 phyloClique :: Map (Date,Date) [PhyloClique]
131 phyloClique = toPhyloClique phyloBase docs'
132 --------------------------------------
133 docs' :: Map (Date,Date) [Document]
134 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
135 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
136 --------------------------------------
137
138
139 ---------------------------
140 -- | Frequent Item Set | --
141 ---------------------------
142
143
144 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
145 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
146 filterClique keep thr f m = case keep of
147 False -> map (\l -> f thr l) m
148 True -> map (\l -> keepFilled (f) thr l) m
149
150
151 -- To filter Fis with small Support
152 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
153 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
154
155
156 -- To filter Fis with small Clique size
157 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
158 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
159
160
161 -- To filter nested Fis
162 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
163 filterCliqueByNested m =
164 let clq = map (\l ->
165 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
166 then mem
167 else
168 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
169 in fMax ++ [f] ) [] l)
170 $ elems m
171 clq' = clq `using` parList rdeepseq
172 in fromList $ zip (keys m) clq'
173
174
175 -- | To transform a time map of docs into a time map of Fis with some filters
176 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
177 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
178 Fis s s' -> -- traceFis "Filtered Fis"
179 filterCliqueByNested
180 {- \$ traceFis "Filtered by clique size" -}
181 $ filterClique True s' (filterCliqueBySize)
182 {- \$ traceFis "Filtered by support" -}
183 $ filterClique True s (filterCliqueBySupport)
184 {- \$ traceFis "Unfiltered Fis" -}
185 phyloClique
186 MaxClique s -> filterClique True s (filterCliqueBySize)
187 phyloClique
188 where
189 --------------------------------------
190 phyloClique :: Map (Date,Date) [PhyloClique]
191 phyloClique = case (clique $ getConfig phylo) of
192 Fis _ _ ->
193 let fis = map (\(prd,docs) ->
194 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
195 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
196 $ toList phyloDocs
197 fis' = fis `using` parList rdeepseq
198 in fromList fis'
199 MaxClique _ ->
200 let mcl = map (\(prd,docs) ->
201 let cooc = map round
202 $ foldl sumCooc empty
203 $ map listToMatrix
204 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
205 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.01 cooc))
206 $ toList phyloDocs
207 mcl' = mcl `using` parList rdeepseq
208 in fromList mcl'
209 --------------------------------------
210
211 -- dev viz graph maxClique getMaxClique
212
213
214 --------------------
215 -- | Coocurency | --
216 --------------------
217
218
219 -- To transform the docs into a time map of coocurency matrix
220 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
221 docsToTimeScaleCooc docs fdt =
222 let mCooc = fromListWith sumCooc
223 $ map (\(_d,l) -> (_d, listToMatrix l))
224 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
225 mCooc' = fromList
226 $ map (\t -> (t,empty))
227 $ toTimeScale (map date docs) 1
228 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
229 $ unionWith sumCooc mCooc mCooc'
230
231
232 -----------------------
233 -- | to Phylo Base | --
234 -----------------------
235
236 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
237 groupDocsByPeriodRec f prds docs acc =
238 if ((null prds) || (null docs))
239 then acc
240 else
241 let prd = head' "groupBy" prds
242 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
243 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
244
245
246 -- To group a list of Documents by fixed periods
247 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
248 groupDocsByPeriod' f pds docs =
249 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
250 periods = map (inPeriode f docs') pds
251 periods' = periods `using` parList rdeepseq
252 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
253 $ fromList $ zip pds periods'
254 where
255 --------------------------------------
256 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
257 inPeriode f' h (start,end) =
258 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
259
260
261
262 -- To group a list of Documents by fixed periods
263 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
264 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
265 groupDocsByPeriod f pds es =
266 let periods = map (inPeriode f es) pds
267 periods' = periods `using` parList rdeepseq
268
269 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
270 $ fromList $ zip pds periods'
271 where
272 --------------------------------------
273 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
274 inPeriode f' h (start,end) =
275 fst $ partition (\d -> f' d >= start && f' d <= end) h
276 --------------------------------------
277
278
279 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
280 docsToTermFreq docs fdt =
281 let nbDocs = fromIntegral $ length docs
282 freqs = map (/(nbDocs))
283 $ fromList
284 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
285 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
286 sumFreqs = sum $ elems freqs
287 in map (/sumFreqs) freqs
288
289 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
290 docsToLastTermFreq n docs fdt =
291 let last = take n $ reverse $ sort $ map date docs
292 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
293 freqs = map (/(nbDocs))
294 $ fromList
295 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
296 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
297 sumFreqs = sum $ elems freqs
298 in map (/sumFreqs) freqs
299
300
301 -- To count the number of docs by unit of time
302 docsToTimeScaleNb :: [Document] -> Map Date Double
303 docsToTimeScaleNb docs =
304 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
305 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
306 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
307 $ unionWith (+) time docs'
308
309
310 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
311 initPhyloLevels lvlMax pId =
312 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
313
314
315 -- To init the basic elements of a Phylo
316 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
317 toPhyloBase docs lst conf =
318 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
319 params = defaultPhyloParam { _phyloParam_config = conf }
320 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
321 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
322 $ Phylo foundations
323 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
324 (docsToTimeScaleNb docs)
325 (docsToTermFreq docs (foundations ^. foundations_roots))
326 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
327 empty
328 empty
329 params
330 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)