]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
refactoring
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13
14
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
20 import Data.Set (Set)
21 import Data.Text (Text)
22 import Data.Vector (Vector)
23 import Debug.Trace (trace)
24 import Prelude (floor)
25
26 import Gargantext.Core.Methods.Distances (Distance(Conditional))
27 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
28 import Gargantext.Core.Text.Context (TermList)
29 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
30 import Gargantext.Core.Viz.Phylo
31 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
32 import Gargantext.Core.Viz.Phylo.PhyloTools
33 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
34 import Gargantext.Core.Viz.Phylo.TemporalMatching (temporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toSimilarity)
35 import Gargantext.Prelude
36
37 import qualified Data.Set as Set
38 import qualified Data.List as List
39 import qualified Data.Vector as Vector
40
41 ------------------
42 -- | To Phylo | --
43 ------------------
44
45 {-
46 -- TODO AD
47 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
48 | PhyloN { _phylo'_flatPhylo :: Phylo}
49
50
51 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
52 toPhylo' (PhyloN phylo) = toPhylo'
53 toPhylo' (PhyloBase phylo) = toPhylo
54 -}
55
56 -- TODO an adaptative synchronic clustering with a slider
57
58 toPhylo :: Phylo -> Phylo
59 toPhylo phylowithoutLink = trace ("# flatPhylo groups " <> show(length $ getGroupsFromScale 1 flatPhylo))
60 $ traceToPhylo (phyloScale $ getConfig phylowithoutLink) $
61 if (phyloScale $ getConfig phylowithoutLink) > 1
62 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloScale $ getConfig phylowithoutLink)]
63 else phyloAncestors
64 where
65 --------------------------------------
66 phyloAncestors :: Phylo
67 phyloAncestors =
68 if (findAncestors $ getConfig phylowithoutLink)
69 then toHorizon flatPhylo
70 else flatPhylo
71 --------------------------------------
72 flatPhylo :: Phylo
73 flatPhylo = addTemporalLinksToPhylo phylowithoutLink
74 --------------------------------------
75
76
77 -----------------------------
78 -- | Create a flat Phylo | --
79 -----------------------------
80
81
82 {-
83 -- create a square ladder
84 -}
85 squareLadder :: [Double] -> [Double]
86 squareLadder ladder = List.map (\x -> x * x) ladder
87
88
89 {-
90 -- create an adaptative diachronic 'sea elevation' ladder
91 -}
92 adaptDiachronicLadder :: Double -> Set Double -> Set Double -> [Double]
93 adaptDiachronicLadder curr similarities ladder =
94 if curr <= 0 || Set.null similarities
95 then Set.toList ladder
96 else
97 let idx = ((Set.size similarities) `div` (floor curr)) - 1
98 thr = Set.elemAt idx similarities
99 -- we use a sliding methods 1/10, then 1/9, then ... 1/2
100 in adaptDiachronicLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
101
102
103 {-
104 -- create a constante diachronic 'sea elevation' ladder
105 -}
106 constDiachronicLadder :: Double -> Double -> Set Double -> [Double]
107 constDiachronicLadder curr step ladder =
108 if curr > 1
109 then Set.toList ladder
110 else constDiachronicLadder (curr + step) step (Set.insert curr ladder)
111
112
113 {-
114 -- process an initial scanning of the kinship links
115 -}
116 scanSimilarity :: Scale -> Phylo -> Phylo
117 scanSimilarity lvl phylo =
118 let proximity = similarity $ getConfig phylo
119 scanning = foldlWithKey (\acc pId pds ->
120 -- 1) process period by period
121 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
122 $ elems
123 $ view ( phylo_periodScales
124 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
125 . phylo_scaleGroups ) pds
126 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
127 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromScalePeriods lvl next phylo
128 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
129 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
130 -- 2) compute the pairs in parallel
131 pairs = map (\(id,ngrams) ->
132 map (\(id',ngrams') ->
133 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
134 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
135 in ((id,id'),toSimilarity nbDocs diago proximity ngrams ngrams' ngrams')
136 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
137 ) egos
138 pairs' = pairs `using` parList rdeepseq
139 in acc ++ (concat pairs')
140 ) [] $ phylo ^. phylo_periods
141 in phylo & phylo_diaSimScan .~ Set.fromList (traceGroupsProxi $ map snd scanning)
142
143
144
145 appendGroups :: (a -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup) -> Scale -> Map (Date,Date) [a] -> Phylo -> Phylo
146 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
147 $ over ( phylo_periods
148 . traverse
149 . phylo_periodScales
150 . traverse)
151 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_scaleScale)
152 then
153 let pId = phyloLvl ^. phylo_scalePeriod
154 pId' = phyloLvl ^. phylo_scalePeriodStr
155 phyloCUnit = m ! pId
156 in phyloLvl
157 & phylo_scaleGroups .~ (fromList $ foldl (\groups obj ->
158 groups ++ [ (((pId,lvl),length groups)
159 , f obj pId pId' lvl (length groups)
160 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
161 ] ) [] phyloCUnit)
162 else
163 phyloLvl )
164 phylo
165
166
167 clusterToGroup :: Clustering -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup
168 clusterToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
169 (fis ^. clustering_support )
170 (fis ^. clustering_visWeighting)
171 (fis ^. clustering_visFiltering)
172 (fis ^. clustering_roots)
173 (ngramsToCooc (fis ^. clustering_roots) coocs)
174 (1,[0]) -- branchid (lvl,[path in the branching tree])
175 (fromList [("breaks",[0]),("seaLevels",[0])])
176 [] [] [] [] [] [] []
177
178 {-
179 -- enhance the phylo with temporal links
180 -}
181 addTemporalLinksToPhylo :: Phylo -> Phylo
182 addTemporalLinksToPhylo phylowithoutLink = case strategy of
183 Constante start gap -> temporalMatching (constDiachronicLadder start gap Set.empty) phylowithoutLink
184 Adaptative steps -> temporalMatching (squareLadder $ adaptDiachronicLadder steps (phylowithoutLink ^. phylo_diaSimScan) Set.empty) phylowithoutLink
185 where
186 strategy :: SeaElevation
187 strategy = getSeaElevation phylowithoutLink
188
189 -----------------------
190 -- | To Phylo Step | --
191 -----------------------
192
193
194 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
195 indexDates' m = map (\docs ->
196 let ds = map (\d -> date' d) docs
197 f = if (null ds)
198 then ""
199 else toFstDate ds
200 l = if (null ds)
201 then ""
202 else toLstDate ds
203 in (f,l)) m
204
205
206 -- To build the first phylo step from docs and terms
207 -- QL: backend entre phyloBase et Clustering
208 -- tophylowithoutLink
209 toPhyloWithoutLink :: [Document] -> TermList -> PhyloConfig -> Phylo
210 toPhyloWithoutLink docs lst conf = case (getSeaElevation phyloBase) of
211 Constante _ _ -> appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
212 Adaptative _ -> scanSimilarity 1
213 $ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
214 where
215 --------------------------------------
216 seriesOfClustering :: Map (Date,Date) [Clustering]
217 seriesOfClustering = toSeriesOfClustering phyloBase docs'
218 --------------------------------------
219 docs' :: Map (Date,Date) [Document]
220 -- QL: Time Consuming here
221 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
222 --------------------------------------
223 phyloBase :: Phylo
224 phyloBase = initPhylo docs lst conf
225 --------------------------------------
226
227 ---------------------------
228 -- | Frequent Item Set | --
229 ---------------------------
230
231
232 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
233 filterClique :: Bool -> Int -> (Int -> [Clustering] -> [Clustering]) -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
234 filterClique keep thr f m = case keep of
235 False -> map (\l -> f thr l) m
236 True -> map (\l -> keepFilled (f) thr l) m
237
238
239 -- To filter Fis with small Support
240 filterCliqueBySupport :: Int -> [Clustering] -> [Clustering]
241 filterCliqueBySupport thr l = filter (\clq -> (clq ^. clustering_support ) >= thr) l
242
243
244 -- To filter Fis with small Clique size
245 filterCliqueBySize :: Int -> [Clustering] -> [Clustering]
246 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. clustering_roots) >= thr) l
247
248
249 -- To filter nested Fis
250 filterCliqueByNested :: Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
251 filterCliqueByNested m =
252 let clq = map (\l ->
253 foldl (\mem f -> if (any (\f' -> isNested (f' ^. clustering_roots) (f ^. clustering_roots)) mem)
254 then mem
255 else
256 let fMax = filter (\f' -> not $ isNested (f ^. clustering_roots) (f' ^. clustering_roots)) mem
257 in fMax ++ [f] ) [] l)
258 $ elems m
259 clq' = clq `using` parList rdeepseq
260 in fromList $ zip (keys m) clq'
261
262
263 -- | To transform a time map of docs into a time map of Fis with some filters
264 toSeriesOfClustering :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [Clustering]
265 toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
266 Fis s s' -> -- traceFis "Filtered Fis"
267 filterCliqueByNested
268 {- \$ traceFis "Filtered by clique size" -}
269 $ filterClique True s' (filterCliqueBySize)
270 {- \$ traceFis "Filtered by support" -}
271 $ filterClique True s (filterCliqueBySupport)
272 {- \$ traceFis "Unfiltered Fis" -}
273 seriesOfClustering
274 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
275 seriesOfClustering
276 where
277 --------------------------------------
278 seriesOfClustering :: Map (Date,Date) [Clustering]
279 seriesOfClustering = case (clique $ getConfig phylo) of
280 Fis _ _ ->
281 let fis = map (\(prd,docs) ->
282 case (corpusParser $ getConfig phylo) of
283 Csv' _ -> let lst = toList
284 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
285 in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
286 _ -> let lst = toList
287 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
288 in (prd, map (\f -> Clustering (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
289 )
290 $ toList phyloDocs
291 fis' = fis `using` parList rdeepseq
292 in fromList fis'
293 MaxClique _ thr filterType ->
294 let mcl = map (\(prd,docs) ->
295 let cooc = map round
296 $ foldl sumCooc empty
297 $ map listToMatrix
298 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
299 in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
300 $ toList phyloDocs
301 mcl' = mcl `using` parList rdeepseq
302 in fromList mcl'
303 --------------------------------------
304
305 -- dev viz graph maxClique getMaxClique
306
307
308 --------------------
309 -- | Coocurency | --
310 --------------------
311
312
313 -- To transform the docs into a time map of coocurency matrix
314 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
315 docsToTimeScaleCooc docs fdt =
316 let mCooc = fromListWith sumCooc
317 $ map (\(_d,l) -> (_d, listToMatrix l))
318 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
319 mCooc' = fromList
320 $ map (\t -> (t,empty))
321 $ toTimeScale (map date docs) 1
322 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
323 $ unionWith sumCooc mCooc mCooc'
324
325
326 -----------------------
327 -- | to Phylo Base | --
328 -----------------------
329 -- TODO anoe
330 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
331 groupDocsByPeriodRec f prds docs acc =
332 if ((null prds) || (null docs))
333 then acc
334 else
335 let prd = head' "groupBy" prds
336 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
337 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
338
339
340 -- To group a list of Documents by fixed periods
341 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
342 groupDocsByPeriod' f pds docs =
343 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
344 periods = map (inPeriode f docs') pds
345 periods' = periods `using` parList rdeepseq
346 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
347 $ fromList $ zip pds periods'
348 where
349 --------------------------------------
350 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
351 inPeriode f' h (start,end) =
352 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
353
354
355
356 -- To group a list of Documents by fixed periods
357 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
358 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
359 groupDocsByPeriod f pds es =
360 let periods = map (inPeriode f es) pds
361 periods' = periods `using` parList rdeepseq
362
363 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
364 $ fromList $ zip pds periods'
365 where
366 --------------------------------------
367 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
368 inPeriode f' h (start,end) =
369 fst $ partition (\d -> f' d >= start && f' d <= end) h
370 --------------------------------------
371
372
373 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
374 docsToTermFreq docs fdt =
375 let nbDocs = fromIntegral $ length docs
376 freqs = map (/(nbDocs))
377 $ fromList
378 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
379 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
380 sumFreqs = sum $ elems freqs
381 in map (/sumFreqs) freqs
382
383 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
384 docsToLastTermFreq n docs fdt =
385 let last = take n $ reverse $ sort $ map date docs
386 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
387 freqs = map (/(nbDocs))
388 $ fromList
389 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
390 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
391 sumFreqs = sum $ elems freqs
392 in map (/sumFreqs) freqs
393
394
395 -- To count the number of docs by unit of time
396 docsToTimeScaleNb :: [Document] -> Map Date Double
397 docsToTimeScaleNb docs =
398 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
399 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
400 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
401 $ unionWith (+) time docs'
402
403
404 initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
405 initPhyloScales lvlMax pId =
406 fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
407
408
409
410 -- Init the basic elements of a Phylo
411 --
412 initPhylo :: [Document] -> TermList -> PhyloConfig -> Phylo
413 initPhylo docs lst conf =
414 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
415 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
416 params = defaultPhyloParam { _phyloParam_config = conf }
417 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
418 in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
419 $ Phylo foundations
420 docsSources
421 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
422 (docsToTimeScaleNb docs)
423 (docsToTermFreq docs (foundations ^. foundations_roots))
424 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
425 Set.empty
426 params
427 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)
428 0