2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, insert)
21 import Data.Text (Text)
22 import Data.Vector (Vector)
23 import Debug.Trace (trace)
24 import Prelude (floor)
26 import Gargantext.Core.Methods.Similarities (Similarity(Conditional))
27 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
28 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
29 import Gargantext.Core.Viz.Phylo
30 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
31 import Gargantext.Core.Viz.Phylo.PhyloTools
32 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
33 import Gargantext.Core.Viz.Phylo.TemporalMatching (toPhyloQuality, temporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toSimilarity)
34 import Gargantext.Prelude
36 import qualified Data.Set as Set
37 import qualified Data.Map as Map
38 import qualified Data.List as List
39 import qualified Data.Vector as Vector
47 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
48 | PhyloN { _phylo'_flatPhylo :: Phylo}
51 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
52 toPhylo' (PhyloN phylo) = toPhylo'
53 toPhylo' (PhyloBase phylo) = toPhylo
56 -- TODO an adaptative synchronic clustering with a slider
58 toPhylo :: Phylo -> Phylo
59 toPhylo phylowithoutLink = traceToPhylo (phyloScale $ getConfig phylowithoutLink) $
60 if (phyloScale $ getConfig phylowithoutLink) > 1
61 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloScale $ getConfig phylowithoutLink)]
64 --------------------------------------
65 phyloAncestors :: Phylo
67 if (findAncestors $ getConfig phylowithoutLink)
68 then toHorizon phyloWithLinks
70 --------------------------------------
71 phyloWithLinks :: Phylo
72 phyloWithLinks = temporalMatching (getLadder phylowithoutLink) phylowithoutLink
73 --------------------------------------
76 -----------------------------
77 -- | Create a flat Phylo | --
78 -----------------------------
82 -- create a square ladder
84 squareLadder :: [Double] -> [Double]
85 squareLadder ladder = List.map (\x -> x * x) ladder
89 -- create an adaptative 'sea elevation' ladder
91 adaptSeaLadder :: Double -> Set Double -> Set Double -> [Double]
92 adaptSeaLadder curr similarities ladder =
93 if curr <= 0 || Set.null similarities
94 then Set.toList ladder
96 let idx = ((Set.size similarities) `div` (floor curr)) - 1
97 thr = Set.elemAt idx similarities
98 -- we use a sliding methods 1/10, then 1/9, then ... 1/2
99 in adaptSeaLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
103 -- create a constante 'sea elevation' ladder
105 constSeaLadder :: Double -> Double -> Set Double -> [Double]
106 constSeaLadder curr step ladder =
108 then Set.toList ladder
109 else constSeaLadder (curr + step) step (Set.insert curr ladder)
114 -- create an evolving 'sea elevation' ladder based on estimated & local quality maxima
116 evolvSeaLadder :: Double -> Double -> Map Int Double -> Set Double -> [((PhyloGroup,PhyloGroup),Double)] -> [Double]
117 evolvSeaLadder nbFdt lambda freq similarities graph = map snd
119 $ zip maxima (map fst qua')
120 -- 3) find the corresponding measures of similarity and create the ladder
123 -- 2) find the local maxima in the quality distribution
125 maxima = [snd (List.head qua') > snd (List.head $ List.tail qua')] ++ (findMaxima qua') ++ [snd (List.head $ reverse qua') > snd (List.head $ List.tail $ reverse qua')]
128 qua' :: [(Double,Double)]
129 qua' = foldl (\acc (s,q) ->
132 else if (snd (List.last acc)) == q
135 ) [] $ zip (Set.toList similarities) qua
137 -- 1.1) for each measure of similarity, prune the flat phylo, compute the branches and estimate the quality
140 let edges = filter (\edge -> snd edge >= thr) graph
141 nodes = nub $ concat $ map (\((n,n'),_) -> [n,n']) edges
142 branches = toRelatedComponents nodes edges
143 in toPhyloQuality nbFdt lambda freq branches
144 ) $ (Set.toList similarities)
148 -- find a similarity ladder regarding the "sea elevation" strategy
150 findSeaLadder :: Phylo -> Phylo
151 findSeaLadder phylo = case getSeaElevation phylo of
152 Constante start gap -> phylo & phylo_seaLadder .~ (constSeaLadder start gap Set.empty)
153 Adaptative steps -> phylo & phylo_seaLadder .~ (squareLadder $ adaptSeaLadder steps similarities Set.empty)
154 Evolving _ -> let ladder = evolvSeaLadder
155 (fromIntegral $ Vector.length $ getRoots phylo)
158 similarities simGraph
159 in phylo & phylo_seaLadder .~ (if length ladder > 0
161 -- if we don't find any local maxima with the evolving strategy
162 else constSeaLadder 0.1 0.1 Set.empty)
165 -- 2) extract the values of the kinship links
166 similarities :: Set Double
167 similarities = Set.fromList $ sort $ map snd simGraph
169 -- 1) we process an initial calculation of the kinship links
170 -- this initial calculation is used to estimate the real sea ladder
171 simGraph :: [((PhyloGroup,PhyloGroup),Double)]
172 simGraph = foldl' (\acc period ->
173 -- 1.1) process period by period
174 let sources = getGroupsFromScalePeriods 1 [period] phylo
175 next = getNextPeriods ToParents 3 period (keys $ phylo ^. phylo_periods)
176 targets = getGroupsFromScalePeriods 1 next phylo
177 docs = filterDocs (getDocsByDate phylo) ([period] ++ next)
178 diagos = filterDiago (getCoocByDate phylo) ([period] ++ next)
179 -- 1.2) compute the kinship similarities between pairs of source & target in parallel
180 pairs = map (\source ->
181 let candidates = filter (\target -> (> 2) $ length
182 $ intersect (getGroupNgrams source) (getGroupNgrams target)) targets
184 let nbDocs = (sum . elems)
185 $ filterDocs docs ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
187 $ filterDiago diagos ([idToPrd (getGroupId source), idToPrd (getGroupId target)])
188 in ((source,target),toSimilarity nbDocs diago (getSimilarity phylo) (getGroupNgrams source) (getGroupNgrams target) (getGroupNgrams target))
191 pairs' = pairs `using` parList rdeepseq
192 in acc ++ (concat pairs')
193 ) [] $ keys $ phylo ^. phylo_periods
195 appendGroups :: (a -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> Map Int Double -> PhyloGroup) -> Scale -> Map (Date,Date) [a] -> Phylo -> Phylo
196 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to scale " <> show (lvl) <> "\n")
197 $ over ( phylo_periods
201 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_scaleScale)
203 let pId = phyloLvl ^. phylo_scalePeriod
204 pId' = phyloLvl ^. phylo_scalePeriodStr
207 & phylo_scaleGroups .~ (fromList $ foldl (\groups obj ->
208 groups ++ [ (((pId,lvl),length groups)
209 , f obj pId pId' lvl (length groups)
210 -- select the cooc of the periods
211 (elems $ restrictKeys (getCoocByDate phylo) $ periodsToYears [pId])
212 -- select and merge the roots count of the periods
213 (foldl (\acc count -> unionWith (+) acc count) empty
214 $ elems $ restrictKeys (getRootsCountByDate phylo) $ periodsToYears [pId]))
221 clusterToGroup :: Clustering -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> Map Int Double -> PhyloGroup
222 clusterToGroup fis pId pId' lvl idx coocs rootsCount = PhyloGroup pId pId' lvl idx ""
223 (fis ^. clustering_support )
224 (fis ^. clustering_visWeighting)
225 (fis ^. clustering_visFiltering)
226 (fis ^. clustering_roots)
227 (ngramsToCooc (fis ^. clustering_roots) coocs)
228 (ngramsToDensity (fis ^. clustering_roots) coocs rootsCount)
229 (1,[0]) -- branchid (lvl,[path in the branching tree])
230 (fromList [("breaks",[0]),("seaLevels",[0])])
231 rootsCount [] [] [] [] [] [] []
234 -----------------------
235 -- | To Phylo Step | --
236 -----------------------
239 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
240 indexDates' m = map (\docs ->
241 let ds = map (\d -> date' d) docs
251 -- create a map of roots and group ids
252 joinRoots :: Phylo -> Phylo
253 joinRoots phylo = set (phylo_foundations . foundations_rootsInGroups) rootsMap phylo
255 --------------------------------------
256 rootsMap :: Map Int [PhyloGroupId]
257 rootsMap = fromListWith (++)
260 map (\n -> (n,[getGroupId g])) $ _phylo_groupNgrams g)
261 $ getGroupsFromScale 1 phylo
264 maybeDefaultParams :: Phylo -> Phylo
265 maybeDefaultParams phylo = if (defaultMode (getConfig phylo))
266 then findDefaultLevel phylo
270 -- To build the first phylo step from docs and terms
271 -- QL: backend entre phyloBase et Clustering
272 -- tophylowithoutLink
273 toPhyloWithoutLink :: [Document] -> PhyloConfig -> Phylo
274 toPhyloWithoutLink docs conf = joinRoots
277 $ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
279 --------------------------------------
280 seriesOfClustering :: Map (Date,Date) [Clustering]
281 seriesOfClustering = toSeriesOfClustering phyloBase docs'
282 --------------------------------------
283 docs' :: Map (Date,Date) [Document]
284 -- QL: Time Consuming here
285 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
286 --------------------------------------
288 phyloBase = initPhylo docs conf
289 --------------------------------------
291 ---------------------------
292 -- | Frequent Item Set | --
293 ---------------------------
296 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
297 filterClique :: Bool -> Int -> (Int -> [Clustering] -> [Clustering]) -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
298 filterClique keep thr f m = case keep of
299 False -> map (\l -> f thr l) m
300 True -> map (\l -> keepFilled (f) thr l) m
303 -- To filter Fis with small Support
304 filterCliqueBySupport :: Int -> [Clustering] -> [Clustering]
305 filterCliqueBySupport thr l = filter (\clq -> (clq ^. clustering_support ) >= thr) l
308 -- To filter Fis with small Clique size
309 filterCliqueBySize :: Int -> [Clustering] -> [Clustering]
310 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. clustering_roots) >= thr) l
313 -- To filter nested Fis
314 filterCliqueByNested :: Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
315 filterCliqueByNested m =
317 foldl (\mem f -> if (any (\f' -> isNested (f' ^. clustering_roots) (f ^. clustering_roots)) mem)
320 let fMax = filter (\f' -> not $ isNested (f ^. clustering_roots) (f' ^. clustering_roots)) mem
321 in fMax ++ [f] ) [] l)
323 clq' = clq `using` parList rdeepseq
324 in fromList $ zip (keys m) clq'
327 -- | To transform a time map of docs into a time map of Fis with some filters
328 toSeriesOfClustering :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [Clustering]
329 toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
330 Fis s s' -> -- traceFis "Filtered Fis"
332 {- \$ traceFis "Filtered by clique size" -}
333 $ filterClique True s' (filterCliqueBySize)
334 {- \$ traceFis "Filtered by support" -}
335 $ filterClique True s (filterCliqueBySupport)
336 {- \$ traceFis "Unfiltered Fis" -}
338 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
341 --------------------------------------
342 seriesOfClustering :: Map (Date,Date) [Clustering]
343 seriesOfClustering = case (clique $ getConfig phylo) of
345 let fis = map (\(prd,docs) ->
346 case (corpusParser $ getConfig phylo) of
347 Csv' _ -> let lst = toList
348 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
349 in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
350 _ -> let lst = toList
351 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
352 in (prd, map (\f -> Clustering (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
355 fis' = fis `using` parList rdeepseq
357 MaxClique _ thr filterType ->
358 let mcl = map (\(prd,docs) ->
360 $ foldl sumCooc empty
362 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
363 in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
365 mcl' = mcl `using` parList rdeepseq
367 --------------------------------------
369 -- dev viz graph maxClique getMaxClique
377 -- To transform the docs into a time map of coocurency matrix
378 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
379 docsToTimeScaleCooc docs fdt =
380 let mCooc = fromListWith sumCooc
381 $ map (\(_d,l) -> (_d, listToMatrix l))
382 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
384 $ map (\t -> (t,empty))
385 $ toTimeScale (map date docs) 1
386 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
387 $ unionWith sumCooc mCooc mCooc'
390 -----------------------
391 -- | to Phylo Base | --
392 -----------------------
395 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
396 groupDocsByPeriodRec f prds docs acc =
397 if ((null prds) || (null docs))
400 let prd = head' "groupBy" prds
401 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
402 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
405 -- To group a list of Documents by fixed periods
406 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
407 groupDocsByPeriod' f pds docs =
408 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
409 periods = map (inPeriode f docs') pds
410 periods' = periods `using` parList rdeepseq
411 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
412 $ fromList $ zip pds periods'
414 --------------------------------------
415 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
416 inPeriode f' h (start,end) =
417 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
421 -- To group a list of Documents by fixed periods
422 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
423 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
424 groupDocsByPeriod f pds es =
425 let periods = map (inPeriode f es) pds
426 periods' = periods `using` parList rdeepseq
428 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
429 $ fromList $ zip pds periods'
431 --------------------------------------
432 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
433 inPeriode f' h (start,end) =
434 fst $ partition (\d -> f' d >= start && f' d <= end) h
435 --------------------------------------
438 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
439 docsToTermFreq docs fdt =
440 let nbDocs = fromIntegral $ length docs
441 freqs = map (/(nbDocs))
443 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
444 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
445 sumFreqs = sum $ elems freqs
446 in map (/sumFreqs) freqs
449 docsToTermCount :: [Document] -> Vector Ngrams -> Map Int Double
450 docsToTermCount docs roots = fromList
451 $ map (\lst -> (head' "docsToTermCount" lst, fromIntegral $ length lst))
452 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) roots) docs
456 docsToTimeTermCount :: [Document] -> Vector Ngrams -> (Map Date (Map Int Double))
457 docsToTimeTermCount docs roots =
458 let docs' = Map.map (\l -> fromList $ map (\lst -> (head' "docsToTimeTermCount" lst, fromIntegral $ length lst))
461 $ map (\d -> (date d, nub $ ngramsToIdx (text d) roots)) docs
462 time = fromList $ map (\t -> (t,Map.empty)) $ toTimeScale (keys docs') 1
463 in unionWith (Map.union) time docs'
466 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
467 docsToLastTermFreq n docs fdt =
468 let last = take n $ reverse $ sort $ map date docs
469 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
470 freqs = map (/(nbDocs))
472 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
473 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
474 sumFreqs = sum $ elems freqs
475 in map (/sumFreqs) freqs
478 -- To count the number of docs by unit of time
479 docsToTimeScaleNb :: [Document] -> Map Date Double
480 docsToTimeScaleNb docs =
481 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
482 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
483 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
484 $ unionWith (+) time docs'
487 initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
488 initPhyloScales lvlMax pId =
489 fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
493 setDefault :: PhyloConfig -> TimeUnit -> Int -> PhyloConfig
494 setDefault conf timeScale nbDocs = defaultConfig
495 { corpusPath = (corpusPath conf)
496 , listPath = (listPath conf)
497 , outputPath = (outputPath conf)
498 , corpusParser = (corpusParser conf)
499 , listParser = (listParser conf)
500 , phyloName = (phyloName conf)
502 , timeUnit = timeScale
503 , clique = Fis (toSupport nbDocs) 3}
505 --------------------------------------
506 toSupport :: Int -> Support
514 --------------------------------------
517 -- Init the basic elements of a Phylo
519 initPhylo :: [Document] -> PhyloConfig -> Phylo
520 initPhylo docs conf =
521 let roots = Vector.fromList $ nub $ concat $ map text docs
522 timeScale = head' "initPhylo" $ map docTime docs
523 foundations = PhyloFoundations roots empty
524 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
525 docsCounts = PhyloCounts (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
526 (docsToTimeScaleNb docs)
527 (docsToTimeTermCount docs (foundations ^. foundations_roots))
528 (docsToTermCount docs (foundations ^. foundations_roots))
529 (docsToTermFreq docs (foundations ^. foundations_roots))
530 (docsToLastTermFreq (getTimePeriod timeScale) docs (foundations ^. foundations_roots))
531 params = if (defaultMode conf)
532 then defaultPhyloParam { _phyloParam_config = setDefault conf timeScale (length docs) }
533 else defaultPhyloParam { _phyloParam_config = conf }
534 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod timeScale) (getTimeStep timeScale)
535 in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
536 $ trace ("\n" <> "-- | lambda " <> show(_qua_granularity $ phyloQuality $ _phyloParam_config params))
542 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)
544 (_qua_granularity $ phyloQuality $ _phyloParam_config params)