2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, insert)
21 import Data.Text (Text)
22 import Data.Vector (Vector)
23 import Debug.Trace (trace)
24 import Prelude (floor)
26 import Gargantext.Core.Methods.Similarities (Similarity(Conditional))
27 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
28 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
29 import Gargantext.Core.Viz.Phylo
30 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
31 import Gargantext.Core.Viz.Phylo.PhyloTools
32 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
33 import Gargantext.Core.Viz.Phylo.TemporalMatching (toPhyloQuality, temporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toSimilarity)
34 import Gargantext.Prelude
36 import qualified Data.Set as Set
37 import qualified Data.List as List
38 import qualified Data.Vector as Vector
46 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
47 | PhyloN { _phylo'_flatPhylo :: Phylo}
50 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
51 toPhylo' (PhyloN phylo) = toPhylo'
52 toPhylo' (PhyloBase phylo) = toPhylo
55 -- TODO an adaptative synchronic clustering with a slider
57 toPhylo :: Phylo -> Phylo
58 toPhylo phylowithoutLink = traceToPhylo (phyloScale $ getConfig phylowithoutLink) $
59 if (phyloScale $ getConfig phylowithoutLink) > 1
60 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloScale $ getConfig phylowithoutLink)]
63 --------------------------------------
64 phyloAncestors :: Phylo
66 if (findAncestors $ getConfig phylowithoutLink)
67 then toHorizon phyloWithLinks
69 --------------------------------------
70 phyloWithLinks :: Phylo
71 phyloWithLinks = temporalMatching (getLadder phylowithoutLink) phylowithoutLink
72 --------------------------------------
75 -----------------------------
76 -- | Create a flat Phylo | --
77 -----------------------------
81 -- create a square ladder
83 squareLadder :: [Double] -> [Double]
84 squareLadder ladder = List.map (\x -> x * x) ladder
88 -- create an adaptative 'sea elevation' ladder
90 adaptSeaLadder :: Double -> Set Double -> Set Double -> [Double]
91 adaptSeaLadder curr similarities ladder =
92 if curr <= 0 || Set.null similarities
93 then Set.toList ladder
95 let idx = ((Set.size similarities) `div` (floor curr)) - 1
96 thr = Set.elemAt idx similarities
97 -- we use a sliding methods 1/10, then 1/9, then ... 1/2
98 in adaptSeaLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
102 -- create a constante 'sea elevation' ladder
104 constSeaLadder :: Double -> Double -> Set Double -> [Double]
105 constSeaLadder curr step ladder =
107 then Set.toList ladder
108 else constSeaLadder (curr + step) step (Set.insert curr ladder)
113 -- create an evolving 'sea elevation' ladder based on estimated & local quality maxima
115 evolvSeaLadder :: Double -> Double -> Map Int Double -> Set Double -> [((PhyloGroup,PhyloGroup),Double)] -> [Double]
116 evolvSeaLadder nbFdt lambda freq similarities graph = map snd
118 $ zip maxima (map fst qua')
119 -- 3) find the corresponding measures of similarity and create the ladder
122 -- 2) find the local maxima in the quality distribution
124 maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
127 qua' :: [(Double,Double)]
128 qua' = foldl (\acc (s,q) ->
131 else if (snd (List.last acc)) == q
134 ) [] $ zip (Set.toList similarities) qua
136 -- 1.1) for each measure of similarity, prune the flat phylo, compute the branches and estimate the quality
139 let edges = filter (\edge -> snd edge >= thr) graph
140 nodes = nub $ concat $ map (\((n,n'),_) -> [n,n']) edges
141 branches = toRelatedComponents nodes edges
142 in toPhyloQuality nbFdt lambda freq branches
143 ) $ (Set.toList similarities)
147 -- find a similarity ladder regarding the "sea elevation" strategy
149 findSeaLadder :: Phylo -> Phylo
150 findSeaLadder phylo = case getSeaElevation phylo of
151 Constante start gap -> phylo & phylo_seaLadder .~ (constSeaLadder start gap Set.empty)
152 Adaptative steps -> phylo & phylo_seaLadder .~ (squareLadder $ adaptSeaLadder steps similarities Set.empty)
153 Evolving _ -> let ladder = evolvSeaLadder
154 (fromIntegral $ Vector.length $ getRoots phylo)
157 similarities simGraph
158 in phylo & phylo_seaLadder .~ (if length ladder > 0
160 -- if we don't find any local maxima with the evolving strategy
161 else constSeaLadder 0.1 0.1 Set.empty)
164 -- 2) extract the values of the kinship links
165 similarities :: Set Double
166 similarities = Set.fromList $ sort $ map snd simGraph
168 -- 1) we process an initial calculation of the kinship links
169 -- this initial calculation is used to estimate the real sea ladder
170 simGraph :: [((PhyloGroup,PhyloGroup),Double)]
171 simGraph = foldl' (\acc period ->
172 -- 1.1) process period by period
173 let sources = getGroupsFromScalePeriods 1 [period] phylo
174 next = getNextPeriods ToParents 3 period (keys $ phylo ^. phylo_periods)
175 targets = getGroupsFromScalePeriods 1 next phylo
176 docs = filterDocs (getDocsByDate phylo) ([period] ++ next)
177 diagos = filterDiago (getCoocByDate phylo) ([period] ++ next)
178 -- 1.2) compute the kinship similarities between pairs of source & target in parallel
179 pairs = map (\source ->
180 let candidates = filter (\target -> (> 2) $ length
181 $ intersect (getGroupNgrams source) (getGroupNgrams target)) targets
183 let nbDocs = (sum . elems)
184 $ filterDocs docs ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
186 $ filterDiago diagos ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
187 in ((source,target),toSimilarity nbDocs diago (getSimilarity phylo) (getGroupNgrams source) (getGroupNgrams target) (getGroupNgrams target))
190 pairs' = pairs `using` parList rdeepseq
191 in acc ++ (concat pairs')
192 ) [] $ keys $ phylo ^. phylo_periods
194 appendGroups :: (a -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup) -> Scale -> Map (Date,Date) [a] -> Phylo -> Phylo
195 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
196 $ over ( phylo_periods
200 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_scaleScale)
202 let pId = phyloLvl ^. phylo_scalePeriod
203 pId' = phyloLvl ^. phylo_scalePeriodStr
206 & phylo_scaleGroups .~ (fromList $ foldl (\groups obj ->
207 groups ++ [ (((pId,lvl),length groups)
208 , f obj pId pId' lvl (length groups)
209 (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [pId]))
216 clusterToGroup :: Clustering -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup
217 clusterToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
218 (fis ^. clustering_support )
219 (fis ^. clustering_visWeighting)
220 (fis ^. clustering_visFiltering)
221 (fis ^. clustering_roots)
222 (ngramsToCooc (fis ^. clustering_roots) coocs)
223 (1,[0]) -- branchid (lvl,[path in the branching tree])
224 (fromList [("breaks",[0]),("seaLevels",[0])])
228 -----------------------
229 -- | To Phylo Step | --
230 -----------------------
233 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
234 indexDates' m = map (\docs ->
235 let ds = map (\d -> date' d) docs
245 -- create a map of roots and group ids
246 joinRoots :: Phylo -> Phylo
247 joinRoots phylo = set (phylo_foundations . foundations_rootsInGroups) rootsMap phylo
249 --------------------------------------
250 rootsMap :: Map Int [PhyloGroupId]
251 rootsMap = fromListWith (++)
254 map (\n -> (n,[getGroupId g])) $ _phylo_groupNgrams g)
255 $ getGroupsFromScale 1 phylo
258 maybeDefaultParams :: Phylo -> Phylo
259 maybeDefaultParams phylo = if (defaultMode (getConfig phylo))
260 then findDefaultLevel phylo
264 -- To build the first phylo step from docs and terms
265 -- QL: backend entre phyloBase et Clustering
266 toPhyloWithoutLink :: [Document] -> PhyloConfig -> Phylo
267 toPhyloWithoutLink docs conf = joinRoots
270 $ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
272 --------------------------------------
273 seriesOfClustering :: Map (Date,Date) [Clustering]
274 seriesOfClustering = toSeriesOfClustering phyloBase docs'
275 --------------------------------------
276 docs' :: Map (Date,Date) [Document]
277 -- QL: Time Consuming here
278 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
279 --------------------------------------
281 phyloBase = initPhylo docs conf
282 --------------------------------------
284 ---------------------------
285 -- | Frequent Item Set | --
286 ---------------------------
289 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
290 filterClique :: Bool -> Int -> (Int -> [Clustering] -> [Clustering]) -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
291 filterClique keep thr f m = case keep of
292 False -> map (\l -> f thr l) m
293 True -> map (\l -> keepFilled (f) thr l) m
296 -- To filter Fis with small Support
297 filterCliqueBySupport :: Int -> [Clustering] -> [Clustering]
298 filterCliqueBySupport thr l = filter (\clq -> (clq ^. clustering_support ) >= thr) l
301 -- To filter Fis with small Clique size
302 filterCliqueBySize :: Int -> [Clustering] -> [Clustering]
303 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. clustering_roots) >= thr) l
306 -- To filter nested Fis
307 filterCliqueByNested :: Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
308 filterCliqueByNested m =
310 foldl (\mem f -> if (any (\f' -> isNested (f' ^. clustering_roots) (f ^. clustering_roots)) mem)
313 let fMax = filter (\f' -> not $ isNested (f ^. clustering_roots) (f' ^. clustering_roots)) mem
314 in fMax ++ [f] ) [] l)
316 clq' = clq `using` parList rdeepseq
317 in fromList $ zip (keys m) clq'
320 -- | To transform a time map of docs into a time map of Fis with some filters
321 toSeriesOfClustering :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [Clustering]
322 toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
323 Fis s s' -> -- traceFis "Filtered Fis"
325 {- \$ traceFis "Filtered by clique size" -}
326 $ filterClique True s' (filterCliqueBySize)
327 {- \$ traceFis "Filtered by support" -}
328 $ filterClique True s (filterCliqueBySupport)
329 {- \$ traceFis "Unfiltered Fis" -}
331 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
334 --------------------------------------
335 seriesOfClustering :: Map (Date,Date) [Clustering]
336 seriesOfClustering = case (clique $ getConfig phylo) of
338 let fis = map (\(prd,docs) ->
339 case (corpusParser $ getConfig phylo) of
340 Csv' _ -> let lst = toList
341 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
342 in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
343 _ -> let lst = toList
344 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
345 in (prd, map (\f -> Clustering (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
348 fis' = fis `using` parList rdeepseq
350 MaxClique _ thr filterType ->
351 let mcl = map (\(prd,docs) ->
353 $ foldl sumCooc empty
355 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
356 in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
358 mcl' = mcl `using` parList rdeepseq
360 --------------------------------------
362 -- dev viz graph maxClique getMaxClique
370 -- To transform the docs into a time map of coocurency matrix
371 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
372 docsToTimeScaleCooc docs fdt =
373 let mCooc = fromListWith sumCooc
374 $ map (\(_d,l) -> (_d, listToMatrix l))
375 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
377 $ map (\t -> (t,empty))
378 $ toTimeScale (map date docs) 1
379 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
380 $ unionWith sumCooc mCooc mCooc'
383 -----------------------
384 -- | to Phylo Base | --
385 -----------------------
388 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
389 groupDocsByPeriodRec f prds docs acc =
390 if ((null prds) || (null docs))
393 let prd = head' "groupBy" prds
394 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
395 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
398 -- To group a list of Documents by fixed periods
399 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
400 groupDocsByPeriod' f pds docs =
401 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
402 periods = map (inPeriode f docs') pds
403 periods' = periods `using` parList rdeepseq
404 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
405 $ fromList $ zip pds periods'
407 --------------------------------------
408 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
409 inPeriode f' h (start,end) =
410 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
414 -- To group a list of Documents by fixed periods
415 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
416 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
417 groupDocsByPeriod f pds es =
418 let periods = map (inPeriode f es) pds
419 periods' = periods `using` parList rdeepseq
421 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
422 $ fromList $ zip pds periods'
424 --------------------------------------
425 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
426 inPeriode f' h (start,end) =
427 fst $ partition (\d -> f' d >= start && f' d <= end) h
428 --------------------------------------
431 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
432 docsToTermFreq docs fdt =
433 let nbDocs = fromIntegral $ length docs
434 freqs = map (/(nbDocs))
436 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
437 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
438 sumFreqs = sum $ elems freqs
439 in map (/sumFreqs) freqs
442 docsToTermCount :: [Document] -> Vector Ngrams -> Map Int Double
443 docsToTermCount docs roots = fromList
444 $ map (\lst -> (head' "docsToTermCount" lst, fromIntegral $ length lst))
445 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) roots) docs
449 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
450 docsToLastTermFreq n docs fdt =
451 let last = take n $ reverse $ sort $ map date docs
452 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
453 freqs = map (/(nbDocs))
455 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
456 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
457 sumFreqs = sum $ elems freqs
458 in map (/sumFreqs) freqs
461 -- To count the number of docs by unit of time
462 docsToTimeScaleNb :: [Document] -> Map Date Double
463 docsToTimeScaleNb docs =
464 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
465 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
466 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
467 $ unionWith (+) time docs'
470 initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
471 initPhyloScales lvlMax pId =
472 fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
475 setDefault :: PhyloConfig -> PhyloConfig
476 setDefault conf = conf {
478 similarity = WeightedLogJaccard 0.5 2,
479 findAncestors = True,
480 phyloSynchrony = ByProximityThreshold 0.6 0 SiblingBranches MergeAllGroups,
481 phyloQuality = Quality 0.5 3,
482 timeUnit = Year 3 1 3,
483 clique = MaxClique 5 30 ByNeighbours,
484 exportLabel = [BranchLabel MostEmergentTfIdf 2, GroupLabel MostEmergentInclusive 2],
485 exportSort = ByHierarchy Desc,
486 exportFilter = [ByBranchSize 3]
490 -- Init the basic elements of a Phylo
492 initPhylo :: [Document] -> PhyloConfig -> Phylo
493 initPhylo docs conf =
494 let roots = Vector.fromList $ nub $ concat $ map text docs
495 foundations = PhyloFoundations roots empty
496 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
497 docsCounts = PhyloCounts (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
498 (docsToTimeScaleNb docs)
499 (docsToTermCount docs (foundations ^. foundations_roots))
500 (docsToTermFreq docs (foundations ^. foundations_roots))
501 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
502 params = if (defaultMode conf)
503 then defaultPhyloParam { _phyloParam_config = setDefault conf }
504 else defaultPhyloParam { _phyloParam_config = conf }
505 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
506 in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
512 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)
514 (_qua_granularity $ phyloQuality $ conf)