2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, insert)
21 import Data.Text (Text)
22 import Data.Vector (Vector)
23 import Debug.Trace (trace)
24 import Prelude (floor)
26 import Gargantext.Core.Methods.Similarities (Similarity(Conditional))
27 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
28 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
29 import Gargantext.Core.Viz.Phylo
30 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
31 import Gargantext.Core.Viz.Phylo.PhyloTools
32 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
33 import Gargantext.Core.Viz.Phylo.TemporalMatching (toPhyloQuality, temporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toSimilarity)
34 import Gargantext.Prelude
36 import qualified Data.Set as Set
37 import qualified Data.Map as Map
38 import qualified Data.List as List
39 import qualified Data.Vector as Vector
47 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
48 | PhyloN { _phylo'_flatPhylo :: Phylo}
51 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
52 toPhylo' (PhyloN phylo) = toPhylo'
53 toPhylo' (PhyloBase phylo) = toPhylo
56 -- TODO an adaptative synchronic clustering with a slider
58 toPhylo :: Phylo -> Phylo
59 toPhylo phylowithoutLink = traceToPhylo (phyloScale $ getConfig phylowithoutLink) $
60 if (phyloScale $ getConfig phylowithoutLink) > 1
61 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloScale $ getConfig phylowithoutLink)]
64 --------------------------------------
65 phyloAncestors :: Phylo
67 if (findAncestors $ getConfig phylowithoutLink)
68 then toHorizon phyloWithLinks
70 --------------------------------------
71 phyloWithLinks :: Phylo
72 phyloWithLinks = temporalMatching (getLadder phylowithoutLink) phylowithoutLink
73 --------------------------------------
76 -----------------------------
77 -- | Create a flat Phylo | --
78 -----------------------------
82 -- create a square ladder
84 squareLadder :: [Double] -> [Double]
85 squareLadder ladder = List.map (\x -> x * x) ladder
89 -- create an adaptative 'sea elevation' ladder
91 adaptSeaLadder :: Double -> Set Double -> Set Double -> [Double]
92 adaptSeaLadder curr similarities ladder =
93 if curr <= 0 || Set.null similarities
94 then Set.toList ladder
96 let idx = ((Set.size similarities) `div` (floor curr)) - 1
97 thr = Set.elemAt idx similarities
98 -- we use a sliding methods 1/10, then 1/9, then ... 1/2
99 in adaptSeaLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
103 -- create a constante 'sea elevation' ladder
105 constSeaLadder :: Double -> Double -> Set Double -> [Double]
106 constSeaLadder curr step ladder =
108 then Set.toList ladder
109 else constSeaLadder (curr + step) step (Set.insert curr ladder)
114 -- create an evolving 'sea elevation' ladder based on estimated & local quality maxima
116 evolvSeaLadder :: Double -> Double -> Map Int Double -> Set Double -> [((PhyloGroup,PhyloGroup),Double)] -> [Double]
117 evolvSeaLadder nbFdt lambda freq similarities graph = map snd
119 $ zip maxima (map fst qua')
120 -- 3) find the corresponding measures of similarity and create the ladder
123 -- 2) find the local maxima in the quality distribution
125 maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
128 qua' :: [(Double,Double)]
129 qua' = foldl (\acc (s,q) ->
132 else if (snd (List.last acc)) == q
135 ) [] $ zip (Set.toList similarities) qua
137 -- 1.1) for each measure of similarity, prune the flat phylo, compute the branches and estimate the quality
140 let edges = filter (\edge -> snd edge >= thr) graph
141 nodes = nub $ concat $ map (\((n,n'),_) -> [n,n']) edges
142 branches = toRelatedComponents nodes edges
143 in toPhyloQuality nbFdt lambda freq branches
144 ) $ (Set.toList similarities)
148 -- find a similarity ladder regarding the "sea elevation" strategy
150 findSeaLadder :: Phylo -> Phylo
151 findSeaLadder phylo = case getSeaElevation phylo of
152 Constante start gap -> phylo & phylo_seaLadder .~ (constSeaLadder start gap Set.empty)
153 Adaptative steps -> phylo & phylo_seaLadder .~ (squareLadder $ adaptSeaLadder steps similarities Set.empty)
154 Evolving _ -> let ladder = evolvSeaLadder
155 (fromIntegral $ Vector.length $ getRoots phylo)
158 similarities simGraph
159 in phylo & phylo_seaLadder .~ (if length ladder > 0
161 -- if we don't find any local maxima with the evolving strategy
162 else constSeaLadder 0.1 0.1 Set.empty)
165 -- 2) extract the values of the kinship links
166 similarities :: Set Double
167 similarities = Set.fromList $ sort $ map snd simGraph
169 -- 1) we process an initial calculation of the kinship links
170 -- this initial calculation is used to estimate the real sea ladder
171 simGraph :: [((PhyloGroup,PhyloGroup),Double)]
172 simGraph = foldl' (\acc period ->
173 -- 1.1) process period by period
174 let sources = getGroupsFromScalePeriods 1 [period] phylo
175 next = getNextPeriods ToParents 3 period (keys $ phylo ^. phylo_periods)
176 targets = getGroupsFromScalePeriods 1 next phylo
177 docs = filterDocs (getDocsByDate phylo) ([period] ++ next)
178 diagos = filterDiago (getCoocByDate phylo) ([period] ++ next)
179 -- 1.2) compute the kinship similarities between pairs of source & target in parallel
180 pairs = map (\source ->
181 let candidates = filter (\target -> (> 2) $ length
182 $ intersect (getGroupNgrams source) (getGroupNgrams target)) targets
184 let nbDocs = (sum . elems)
185 $ filterDocs docs ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
187 $ filterDiago diagos ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
188 in ((source,target),toSimilarity nbDocs diago (getSimilarity phylo) (getGroupNgrams source) (getGroupNgrams target) (getGroupNgrams target))
191 pairs' = pairs `using` parList rdeepseq
192 in acc ++ (concat pairs')
193 ) [] $ keys $ phylo ^. phylo_periods
195 appendGroups :: (a -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> Map Int Double -> PhyloGroup) -> Scale -> Map (Date,Date) [a] -> Phylo -> Phylo
196 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
197 $ over ( phylo_periods
201 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_scaleScale)
203 let pId = phyloLvl ^. phylo_scalePeriod
204 pId' = phyloLvl ^. phylo_scalePeriodStr
207 & phylo_scaleGroups .~ (fromList $ foldl (\groups obj ->
208 groups ++ [ (((pId,lvl),length groups)
209 , f obj pId pId' lvl (length groups)
210 -- select the cooc of the periods
211 (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [pId])
212 -- select and merge the roots count of the periods
213 (foldl (\acc count -> unionWith (+) acc count) empty
214 $ elems $ restrictKeys (getRootsCountByDate phylo) $ periodsToYears [pId]))
221 clusterToGroup :: Clustering -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> Map Int Double -> PhyloGroup
222 clusterToGroup fis pId pId' lvl idx coocs rootsCount = PhyloGroup pId pId' lvl idx ""
223 (fis ^. clustering_support )
224 (fis ^. clustering_visWeighting)
225 (fis ^. clustering_visFiltering)
226 (fis ^. clustering_roots)
227 (ngramsToCooc (fis ^. clustering_roots) coocs)
228 (ngramsToDensity (fis ^. clustering_roots) coocs rootsCount)
229 (1,[0]) -- branchid (lvl,[path in the branching tree])
230 (fromList [("breaks",[0]),("seaLevels",[0])])
231 rootsCount [] [] [] [] [] [] []
234 -----------------------
235 -- | To Phylo Step | --
236 -----------------------
239 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
240 indexDates' m = map (\docs ->
241 let ds = map (\d -> date' d) docs
251 -- create a map of roots and group ids
252 joinRoots :: Phylo -> Phylo
253 joinRoots phylo = set (phylo_foundations . foundations_rootsInGroups) rootsMap phylo
255 --------------------------------------
256 rootsMap :: Map Int [PhyloGroupId]
257 rootsMap = fromListWith (++)
260 map (\n -> (n,[getGroupId g])) $ _phylo_groupNgrams g)
261 $ getGroupsFromScale 1 phylo
264 maybeDefaultParams :: Phylo -> Phylo
265 maybeDefaultParams phylo = if (defaultMode (getConfig phylo))
266 then findDefaultLevel phylo
270 -- To build the first phylo step from docs and terms
271 -- QL: backend entre phyloBase et Clustering
272 toPhyloWithoutLink :: [Document] -> PhyloConfig -> Phylo
273 toPhyloWithoutLink docs conf = joinRoots
276 $ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
278 --------------------------------------
279 seriesOfClustering :: Map (Date,Date) [Clustering]
280 seriesOfClustering = toSeriesOfClustering phyloBase docs'
281 --------------------------------------
282 docs' :: Map (Date,Date) [Document]
283 -- QL: Time Consuming here
284 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
285 --------------------------------------
287 phyloBase = initPhylo docs conf
288 --------------------------------------
290 ---------------------------
291 -- | Frequent Item Set | --
292 ---------------------------
295 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
296 filterClique :: Bool -> Int -> (Int -> [Clustering] -> [Clustering]) -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
297 filterClique keep thr f m = case keep of
298 False -> map (\l -> f thr l) m
299 True -> map (\l -> keepFilled (f) thr l) m
302 -- To filter Fis with small Support
303 filterCliqueBySupport :: Int -> [Clustering] -> [Clustering]
304 filterCliqueBySupport thr l = filter (\clq -> (clq ^. clustering_support ) >= thr) l
307 -- To filter Fis with small Clique size
308 filterCliqueBySize :: Int -> [Clustering] -> [Clustering]
309 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. clustering_roots) >= thr) l
312 -- To filter nested Fis
313 filterCliqueByNested :: Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
314 filterCliqueByNested m =
316 foldl (\mem f -> if (any (\f' -> isNested (f' ^. clustering_roots) (f ^. clustering_roots)) mem)
319 let fMax = filter (\f' -> not $ isNested (f ^. clustering_roots) (f' ^. clustering_roots)) mem
320 in fMax ++ [f] ) [] l)
322 clq' = clq `using` parList rdeepseq
323 in fromList $ zip (keys m) clq'
326 -- | To transform a time map of docs into a time map of Fis with some filters
327 toSeriesOfClustering :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [Clustering]
328 toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
329 Fis s s' -> -- traceFis "Filtered Fis"
331 {- \$ traceFis "Filtered by clique size" -}
332 $ filterClique True s' (filterCliqueBySize)
333 {- \$ traceFis "Filtered by support" -}
334 $ filterClique True s (filterCliqueBySupport)
335 {- \$ traceFis "Unfiltered Fis" -}
337 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
340 --------------------------------------
341 seriesOfClustering :: Map (Date,Date) [Clustering]
342 seriesOfClustering = case (clique $ getConfig phylo) of
344 let fis = map (\(prd,docs) ->
345 case (corpusParser $ getConfig phylo) of
346 Csv' _ -> let lst = toList
347 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
348 in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
349 _ -> let lst = toList
350 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
351 in (prd, map (\f -> Clustering (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
354 fis' = fis `using` parList rdeepseq
356 MaxClique _ thr filterType ->
357 let mcl = map (\(prd,docs) ->
359 $ foldl sumCooc empty
361 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
362 in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
364 mcl' = mcl `using` parList rdeepseq
366 --------------------------------------
368 -- dev viz graph maxClique getMaxClique
376 -- To transform the docs into a time map of coocurency matrix
377 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
378 docsToTimeScaleCooc docs fdt =
379 let mCooc = fromListWith sumCooc
380 $ map (\(_d,l) -> (_d, listToMatrix l))
381 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
383 $ map (\t -> (t,empty))
384 $ toTimeScale (map date docs) 1
385 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
386 $ unionWith sumCooc mCooc mCooc'
389 -----------------------
390 -- | to Phylo Base | --
391 -----------------------
394 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
395 groupDocsByPeriodRec f prds docs acc =
396 if ((null prds) || (null docs))
399 let prd = head' "groupBy" prds
400 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
401 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
404 -- To group a list of Documents by fixed periods
405 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
406 groupDocsByPeriod' f pds docs =
407 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
408 periods = map (inPeriode f docs') pds
409 periods' = periods `using` parList rdeepseq
410 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
411 $ fromList $ zip pds periods'
413 --------------------------------------
414 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
415 inPeriode f' h (start,end) =
416 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
420 -- To group a list of Documents by fixed periods
421 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
422 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
423 groupDocsByPeriod f pds es =
424 let periods = map (inPeriode f es) pds
425 periods' = periods `using` parList rdeepseq
427 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
428 $ fromList $ zip pds periods'
430 --------------------------------------
431 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
432 inPeriode f' h (start,end) =
433 fst $ partition (\d -> f' d >= start && f' d <= end) h
434 --------------------------------------
437 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
438 docsToTermFreq docs fdt =
439 let nbDocs = fromIntegral $ length docs
440 freqs = map (/(nbDocs))
442 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
443 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
444 sumFreqs = sum $ elems freqs
445 in map (/sumFreqs) freqs
448 docsToTermCount :: [Document] -> Vector Ngrams -> Map Int Double
449 docsToTermCount docs roots = fromList
450 $ map (\lst -> (head' "docsToTermCount" lst, fromIntegral $ length lst))
451 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) roots) docs
455 docsToTimeTermCount :: [Document] -> Vector Ngrams -> (Map Date (Map Int Double))
456 docsToTimeTermCount docs roots =
457 let docs' = Map.map (\l -> fromList $ map (\lst -> (head' "docsToTimeTermCount" lst, fromIntegral $ length lst))
460 $ map (\d -> (date d, nub $ ngramsToIdx (text d) roots)) docs
461 time = fromList $ map (\t -> (t,Map.empty)) $ toTimeScale (keys docs') 1
462 in unionWith (Map.union) time docs'
465 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
466 docsToLastTermFreq n docs fdt =
467 let last = take n $ reverse $ sort $ map date docs
468 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
469 freqs = map (/(nbDocs))
471 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
472 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
473 sumFreqs = sum $ elems freqs
474 in map (/sumFreqs) freqs
477 -- To count the number of docs by unit of time
478 docsToTimeScaleNb :: [Document] -> Map Date Double
479 docsToTimeScaleNb docs =
480 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
481 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
482 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
483 $ unionWith (+) time docs'
486 initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
487 initPhyloScales lvlMax pId =
488 fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
491 setDefault :: PhyloConfig -> TimeUnit -> PhyloConfig
492 setDefault conf timeScale = conf {
494 similarity = WeightedLogJaccard 0.5 2,
495 findAncestors = True,
496 phyloSynchrony = ByProximityThreshold 0.6 0 SiblingBranches MergeAllGroups,
497 phyloQuality = Quality 0.5 3,
498 timeUnit = timeScale,
500 exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2],
501 exportSort = ByHierarchy Desc,
502 exportFilter = [ByBranchSize 3]
506 -- Init the basic elements of a Phylo
508 initPhylo :: [Document] -> PhyloConfig -> Phylo
509 initPhylo docs conf =
510 let roots = Vector.fromList $ nub $ concat $ map text docs
511 timeScale = head' "initPhylo" $ map docTime docs
512 foundations = PhyloFoundations roots empty
513 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
514 docsCounts = PhyloCounts (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
515 (docsToTimeScaleNb docs)
516 (docsToTimeTermCount docs (foundations ^. foundations_roots))
517 (docsToTermCount docs (foundations ^. foundations_roots))
518 (docsToTermFreq docs (foundations ^. foundations_roots))
519 (docsToLastTermFreq (getTimePeriod timeScale) docs (foundations ^. foundations_roots))
520 params = if (defaultMode conf)
521 then defaultPhyloParam { _phyloParam_config = setDefault conf timeScale }
522 else defaultPhyloParam { _phyloParam_config = conf }
523 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod timeScale) (getTimeStep timeScale)
524 in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
525 $ trace ("\n" <> "-- | lambda " <> show(_qua_granularity $ phyloQuality $ _phyloParam_config params))
531 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)
533 (_qua_granularity $ phyloQuality $ _phyloParam_config params)