2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, insert)
21 import Data.Text (Text)
22 import Data.Vector (Vector)
23 import Debug.Trace (trace)
24 import Prelude (floor)
26 import Gargantext.Core.Methods.Similarities (Similarity(Conditional))
27 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
28 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
29 import Gargantext.Core.Viz.Phylo
30 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
31 import Gargantext.Core.Viz.Phylo.PhyloTools
32 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
33 import Gargantext.Core.Viz.Phylo.TemporalMatching (toPhyloQuality, temporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toSimilarity)
34 import Gargantext.Prelude
36 import qualified Data.Set as Set
37 import qualified Data.List as List
38 import qualified Data.Vector as Vector
46 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
47 | PhyloN { _phylo'_flatPhylo :: Phylo}
50 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
51 toPhylo' (PhyloN phylo) = toPhylo'
52 toPhylo' (PhyloBase phylo) = toPhylo
55 -- TODO an adaptative synchronic clustering with a slider
57 toPhylo :: Phylo -> Phylo
58 toPhylo phylowithoutLink = traceToPhylo (phyloScale $ getConfig phylowithoutLink) $
59 if (phyloScale $ getConfig phylowithoutLink) > 1
60 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloScale $ getConfig phylowithoutLink)]
63 --------------------------------------
64 phyloAncestors :: Phylo
66 if (findAncestors $ getConfig phylowithoutLink)
67 then toHorizon phyloWithLinks
69 --------------------------------------
70 phyloWithLinks :: Phylo
71 phyloWithLinks = temporalMatching (getLadder phylowithoutLink) phylowithoutLink
72 --------------------------------------
75 -----------------------------
76 -- | Create a flat Phylo | --
77 -----------------------------
81 -- create a square ladder
83 squareLadder :: [Double] -> [Double]
84 squareLadder ladder = List.map (\x -> x * x) ladder
88 -- create an adaptative 'sea elevation' ladder
90 adaptSeaLadder :: Double -> Set Double -> Set Double -> [Double]
91 adaptSeaLadder curr similarities ladder =
92 if curr <= 0 || Set.null similarities
93 then Set.toList ladder
95 let idx = ((Set.size similarities) `div` (floor curr)) - 1
96 thr = Set.elemAt idx similarities
97 -- we use a sliding methods 1/10, then 1/9, then ... 1/2
98 in adaptSeaLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
102 -- create a constante 'sea elevation' ladder
104 constSeaLadder :: Double -> Double -> Set Double -> [Double]
105 constSeaLadder curr step ladder =
107 then Set.toList ladder
108 else constSeaLadder (curr + step) step (Set.insert curr ladder)
113 -- create an evolving 'sea elevation' ladder based on estimated & local quality maxima
115 evolvSeaLadder :: Double -> Double -> Map Int Double -> Set Double -> [((PhyloGroup,PhyloGroup),Double)] -> [Double]
116 evolvSeaLadder nbFdt lambda freq similarities graph = map snd
118 $ zip maxima (map fst qua')
119 -- 3) find the corresponding measures of similarity and create the ladder
122 -- 2) find the local maxima in the quality distribution
124 maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
127 qua' :: [(Double,Double)]
128 qua' = foldl (\acc (s,q) ->
131 else if (snd (List.last acc)) == q
134 ) [] $ zip (Set.toList similarities) qua
136 -- 1.1) for each measure of similarity, prune the flat phylo, compute the branches and estimate the quality
139 let edges = filter (\edge -> snd edge >= thr) graph
140 nodes = nub $ concat $ map (\((n,n'),_) -> [n,n']) edges
141 branches = toRelatedComponents nodes edges
142 in toPhyloQuality nbFdt lambda freq branches
143 ) $ (Set.toList similarities)
147 -- find a similarity ladder regarding the "sea elevation" strategy
149 findSeaLadder :: Phylo -> Phylo
150 findSeaLadder phylo = case getSeaElevation phylo of
151 Constante start gap -> phylo & phylo_seaLadder .~ (constSeaLadder start gap Set.empty)
152 Adaptative steps -> phylo & phylo_seaLadder .~ (squareLadder $ adaptSeaLadder steps similarities Set.empty)
153 Evolving _ -> let ladder = evolvSeaLadder
154 (fromIntegral $ Vector.length $ getRoots phylo)
157 similarities simGraph
158 in phylo & phylo_seaLadder .~ (if length ladder > 0
160 -- if we don't find any local maxima with the evolving strategy
161 else constSeaLadder 0.1 0.1 Set.empty)
164 -- 2) extract the values of the kinship links
165 similarities :: Set Double
166 similarities = Set.fromList $ sort $ map snd simGraph
168 -- 1) we process an initial calculation of the kinship links
169 -- this initial calculation is used to estimate the real sea ladder
170 simGraph :: [((PhyloGroup,PhyloGroup),Double)]
171 simGraph = foldl' (\acc period ->
172 -- 1.1) process period by period
173 let sources = getGroupsFromScalePeriods 1 [period] phylo
174 next = getNextPeriods ToParents 3 period (keys $ phylo ^. phylo_periods)
175 targets = getGroupsFromScalePeriods 1 next phylo
176 docs = filterDocs (getDocsByDate phylo) ([period] ++ next)
177 diagos = filterDiago (getCoocByDate phylo) ([period] ++ next)
178 -- 1.2) compute the kinship similarities between pairs of source & target in parallel
179 pairs = map (\source ->
180 let candidates = filter (\target -> (> 2) $ length
181 $ intersect (getGroupNgrams source) (getGroupNgrams target)) targets
183 let nbDocs = (sum . elems)
184 $ filterDocs docs ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
186 $ filterDiago diagos ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
187 in ((source,target),toSimilarity nbDocs diago (getSimilarity phylo) (getGroupNgrams source) (getGroupNgrams target) (getGroupNgrams target))
190 pairs' = pairs `using` parList rdeepseq
191 in acc ++ (concat pairs')
192 ) [] $ keys $ phylo ^. phylo_periods
194 appendGroups :: (a -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup) -> Scale -> Map (Date,Date) [a] -> Phylo -> Phylo
195 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
196 $ over ( phylo_periods
200 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_scaleScale)
202 let pId = phyloLvl ^. phylo_scalePeriod
203 pId' = phyloLvl ^. phylo_scalePeriodStr
206 & phylo_scaleGroups .~ (fromList $ foldl (\groups obj ->
207 groups ++ [ (((pId,lvl),length groups)
208 , f obj pId pId' lvl (length groups)
209 (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [pId]))
216 clusterToGroup :: Clustering -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup
217 clusterToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
218 (fis ^. clustering_support )
219 (fis ^. clustering_visWeighting)
220 (fis ^. clustering_visFiltering)
221 (fis ^. clustering_roots)
222 (ngramsToCooc (fis ^. clustering_roots) coocs)
223 (1,[0]) -- branchid (lvl,[path in the branching tree])
224 (fromList [("breaks",[0]),("seaLevels",[0])])
228 -----------------------
229 -- | To Phylo Step | --
230 -----------------------
233 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
234 indexDates' m = map (\docs ->
235 let ds = map (\d -> date' d) docs
245 -- create a map of roots and group ids
246 joinRoots :: Phylo -> Phylo
247 joinRoots phylo = set (phylo_foundations . foundations_rootsInGroups) rootsMap phylo
249 --------------------------------------
250 rootsMap :: Map Int [PhyloGroupId]
251 rootsMap = fromListWith (++)
254 map (\n -> (n,[getGroupId g])) $ _phylo_groupNgrams g)
255 $ getGroupsFromScale 1 phylo
258 maybeDefaultParams :: Phylo -> Phylo
259 maybeDefaultParams phylo = if (defaultMode (getConfig phylo))
260 then findDefaultLevel phylo
264 -- To build the first phylo step from docs and terms
265 -- QL: backend entre phyloBase et Clustering
266 -- tophylowithoutLink
267 toPhyloWithoutLink :: [Document] -> PhyloConfig -> Phylo
268 toPhyloWithoutLink docs conf = joinRoots
271 $ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
273 --------------------------------------
274 seriesOfClustering :: Map (Date,Date) [Clustering]
275 seriesOfClustering = toSeriesOfClustering phyloBase docs'
276 --------------------------------------
277 docs' :: Map (Date,Date) [Document]
278 -- QL: Time Consuming here
279 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
280 --------------------------------------
282 phyloBase = initPhylo docs conf
283 --------------------------------------
285 ---------------------------
286 -- | Frequent Item Set | --
287 ---------------------------
290 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
291 filterClique :: Bool -> Int -> (Int -> [Clustering] -> [Clustering]) -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
292 filterClique keep thr f m = case keep of
293 False -> map (\l -> f thr l) m
294 True -> map (\l -> keepFilled (f) thr l) m
297 -- To filter Fis with small Support
298 filterCliqueBySupport :: Int -> [Clustering] -> [Clustering]
299 filterCliqueBySupport thr l = filter (\clq -> (clq ^. clustering_support ) >= thr) l
302 -- To filter Fis with small Clique size
303 filterCliqueBySize :: Int -> [Clustering] -> [Clustering]
304 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. clustering_roots) >= thr) l
307 -- To filter nested Fis
308 filterCliqueByNested :: Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
309 filterCliqueByNested m =
311 foldl (\mem f -> if (any (\f' -> isNested (f' ^. clustering_roots) (f ^. clustering_roots)) mem)
314 let fMax = filter (\f' -> not $ isNested (f ^. clustering_roots) (f' ^. clustering_roots)) mem
315 in fMax ++ [f] ) [] l)
317 clq' = clq `using` parList rdeepseq
318 in fromList $ zip (keys m) clq'
321 -- | To transform a time map of docs into a time map of Fis with some filters
322 toSeriesOfClustering :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [Clustering]
323 toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
324 Fis s s' -> -- traceFis "Filtered Fis"
326 {- \$ traceFis "Filtered by clique size" -}
327 $ filterClique True s' (filterCliqueBySize)
328 {- \$ traceFis "Filtered by support" -}
329 $ filterClique True s (filterCliqueBySupport)
330 {- \$ traceFis "Unfiltered Fis" -}
332 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
335 --------------------------------------
336 seriesOfClustering :: Map (Date,Date) [Clustering]
337 seriesOfClustering = case (clique $ getConfig phylo) of
339 let fis = map (\(prd,docs) ->
340 case (corpusParser $ getConfig phylo) of
341 Csv' _ -> let lst = toList
342 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
343 in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
344 _ -> let lst = toList
345 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
346 in (prd, map (\f -> Clustering (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
349 fis' = fis `using` parList rdeepseq
351 MaxClique _ thr filterType ->
352 let mcl = map (\(prd,docs) ->
354 $ foldl sumCooc empty
356 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
357 in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
359 mcl' = mcl `using` parList rdeepseq
361 --------------------------------------
363 -- dev viz graph maxClique getMaxClique
371 -- To transform the docs into a time map of coocurency matrix
372 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
373 docsToTimeScaleCooc docs fdt =
374 let mCooc = fromListWith sumCooc
375 $ map (\(_d,l) -> (_d, listToMatrix l))
376 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
378 $ map (\t -> (t,empty))
379 $ toTimeScale (map date docs) 1
380 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
381 $ unionWith sumCooc mCooc mCooc'
384 -----------------------
385 -- | to Phylo Base | --
386 -----------------------
389 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
390 groupDocsByPeriodRec f prds docs acc =
391 if ((null prds) || (null docs))
394 let prd = head' "groupBy" prds
395 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
396 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
399 -- To group a list of Documents by fixed periods
400 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
401 groupDocsByPeriod' f pds docs =
402 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
403 periods = map (inPeriode f docs') pds
404 periods' = periods `using` parList rdeepseq
405 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
406 $ fromList $ zip pds periods'
408 --------------------------------------
409 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
410 inPeriode f' h (start,end) =
411 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
415 -- To group a list of Documents by fixed periods
416 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
417 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
418 groupDocsByPeriod f pds es =
419 let periods = map (inPeriode f es) pds
420 periods' = periods `using` parList rdeepseq
422 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
423 $ fromList $ zip pds periods'
425 --------------------------------------
426 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
427 inPeriode f' h (start,end) =
428 fst $ partition (\d -> f' d >= start && f' d <= end) h
429 --------------------------------------
432 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
433 docsToTermFreq docs fdt =
434 let nbDocs = fromIntegral $ length docs
435 freqs = map (/(nbDocs))
437 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
438 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
439 sumFreqs = sum $ elems freqs
440 in map (/sumFreqs) freqs
443 docsToTermCount :: [Document] -> Vector Ngrams -> Map Int Double
444 docsToTermCount docs roots = fromList
445 $ map (\lst -> (head' "docsToTermCount" lst, fromIntegral $ length lst))
446 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) roots) docs
450 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
451 docsToLastTermFreq n docs fdt =
452 let last = take n $ reverse $ sort $ map date docs
453 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
454 freqs = map (/(nbDocs))
456 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
457 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
458 sumFreqs = sum $ elems freqs
459 in map (/sumFreqs) freqs
462 -- To count the number of docs by unit of time
463 docsToTimeScaleNb :: [Document] -> Map Date Double
464 docsToTimeScaleNb docs =
465 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
466 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
467 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
468 $ unionWith (+) time docs'
471 initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
472 initPhyloScales lvlMax pId =
473 fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
476 setDefault :: PhyloConfig -> PhyloConfig
477 setDefault conf = conf {
479 similarity = WeightedLogJaccard 0.5 2,
480 findAncestors = True,
481 phyloSynchrony = ByProximityThreshold 0.6 0 SiblingBranches MergeAllGroups,
482 phyloQuality = Quality 0.5 3,
483 timeUnit = Year 3 1 3,
484 clique = MaxClique 5 30 ByNeighbours,
485 exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2],
486 exportSort = ByHierarchy Desc,
487 exportFilter = [ByBranchSize 3]
491 -- Init the basic elements of a Phylo
493 initPhylo :: [Document] -> PhyloConfig -> Phylo
494 initPhylo docs conf =
495 let roots = Vector.fromList $ nub $ concat $ map text docs
496 foundations = PhyloFoundations roots empty
497 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
498 docsCounts = PhyloCounts (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
499 (docsToTimeScaleNb docs)
500 (docsToTermCount docs (foundations ^. foundations_roots))
501 (docsToTermFreq docs (foundations ^. foundations_roots))
502 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
503 params = if (defaultMode conf)
504 then defaultPhyloParam { _phyloParam_config = setDefault conf }
505 else defaultPhyloParam { _phyloParam_config = conf }
506 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
507 in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
513 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)
515 (_qua_granularity $ phyloQuality $ conf)