2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13 import Control.DeepSeq (NFData)
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
17 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
19 import Data.Text (Text)
20 import Data.Vector (Vector)
21 import Debug.Trace (trace)
22 import Prelude (floor)
24 import Gargantext.Core.Methods.Distances (Distance(Conditional))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Text.Context (TermList)
27 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
28 import Gargantext.Core.Viz.Phylo
29 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
30 import Gargantext.Core.Viz.Phylo.PhyloTools
31 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
32 import Gargantext.Core.Viz.Phylo.TemporalMatching (temporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
33 import Gargantext.Prelude
35 import qualified Data.Set as Set
36 import qualified Data.Vector as Vector
44 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
45 | PhyloN { _phylo'_flatPhylo :: Phylo}
48 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
49 toPhylo' (PhyloN phylo) = toPhylo'
50 toPhylo' (PhyloBase phylo) = toPhylo
53 -- TODO an adaptative synchronic clustering with a slider
55 toPhylo :: Phylo -> Phylo
56 toPhylo phylowithoutLink = trace ("# flatPhylo groups " <> show(length $ getGroupsFromScale 1 flatPhylo))
57 $ traceToPhylo (phyloScale $ getConfig phylowithoutLink) $
58 if (phyloScale $ getConfig phylowithoutLink) > 1
59 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloScale $ getConfig phylowithoutLink)]
62 --------------------------------------
63 phyloAncestors :: Phylo
65 if (findAncestors $ getConfig phylowithoutLink)
66 then toHorizon flatPhylo
68 --------------------------------------
70 flatPhylo = addTemporalLinksToPhylo phylowithoutLink
71 --------------------------------------
74 -----------------------------
75 -- | Create a flat Phylo | --
76 -----------------------------
79 -- create an adaptative diachronic 'sea elevation' ladder
81 adaptDiachronicLadder :: Double -> Set Double -> Set Double -> [Double]
82 adaptDiachronicLadder curr similarities ladder =
83 if curr <= 0 || Set.null similarities
84 then Set.toList ladder
86 let idx = ((Set.size similarities) `div` (floor curr)) - 1
87 thr = Set.elemAt idx similarities
88 -- we use a sliding methods 1/10, then 1/9, then ... 1/2
89 in adaptDiachronicLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
93 -- create a constante diachronic 'sea elevation' ladder
95 constDiachronicLadder :: Double -> Double -> Set Double -> [Double]
96 constDiachronicLadder curr step ladder =
98 then Set.toList ladder
99 else constDiachronicLadder (curr + step) step (Set.insert curr ladder)
103 -- process an initial scanning of the kinship links
105 scanSimilarity :: Scale -> Phylo -> Phylo
106 scanSimilarity lvl phylo =
107 let proximity = phyloProximity $ getConfig phylo
108 scanning = foldlWithKey (\acc pId pds ->
109 -- 1) process period by period
110 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
112 $ view ( phylo_periodScales
113 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
114 . phylo_scaleGroups ) pds
115 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
116 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromScalePeriods lvl next phylo
117 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
118 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
119 -- 2) compute the pairs in parallel
120 pairs = map (\(id,ngrams) ->
121 map (\(id',ngrams') ->
122 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
123 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
124 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
125 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
127 pairs' = pairs `using` parList rdeepseq
128 in acc ++ (concat pairs')
129 ) [] $ phylo ^. phylo_periods
130 in phylo & phylo_diaSimScan .~ Set.fromList (traceGroupsProxi $ map snd scanning)
134 appendGroups :: (a -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup) -> Scale -> Map (Date,Date) [a] -> Phylo -> Phylo
135 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
136 $ over ( phylo_periods
140 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_scaleScale)
142 let pId = phyloLvl ^. phylo_scalePeriod
143 pId' = phyloLvl ^. phylo_scalePeriodStr
146 & phylo_scaleGroups .~ (fromList $ foldl (\groups obj ->
147 groups ++ [ (((pId,lvl),length groups)
148 , f obj pId pId' lvl (length groups)
149 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
156 clusterToGroup :: Clustering -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup
157 clusterToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
158 (fis ^. clustering_support )
159 (fis ^. clustering_visWeighting)
160 (fis ^. clustering_visFiltering)
161 (fis ^. clustering_roots)
162 (ngramsToCooc (fis ^. clustering_roots) coocs)
163 (1,[0]) -- branchid (lvl,[path in the branching tree])
164 (fromList [("breaks",[0]),("seaLevels",[0])])
168 -- enhance the phylo with temporal links
170 addTemporalLinksToPhylo :: Phylo -> Phylo
171 addTemporalLinksToPhylo phylowithoutLink = case strategy of
172 Constante start gap -> temporalMatching (constDiachronicLadder start gap Set.empty) phylowithoutLink
173 Adaptative steps -> temporalMatching (adaptDiachronicLadder steps (phylowithoutLink ^. phylo_diaSimScan) Set.empty) phylowithoutLink
175 strategy :: SeaElevation
176 strategy = getSeaElevation phylowithoutLink
178 -----------------------
179 -- | To Phylo Step | --
180 -----------------------
183 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
184 indexDates' m = map (\docs ->
185 let ds = map (\d -> date' d) docs
195 -- To build the first phylo step from docs and terms
196 -- QL: backend entre phyloBase et Clustering
197 -- tophylowithoutLink
198 toPhyloWithoutLink :: [Document] -> TermList -> PhyloConfig -> Phylo
199 toPhyloWithoutLink docs lst conf = case (getSeaElevation phyloBase) of
200 Constante _ _ -> appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
201 Adaptative _ -> scanSimilarity 1
202 $ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
204 --------------------------------------
205 seriesOfClustering :: Map (Date,Date) [Clustering]
206 seriesOfClustering = toSeriesOfClustering phyloBase docs'
207 --------------------------------------
208 docs' :: Map (Date,Date) [Document]
209 -- QL: Time Consuming here
210 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
211 --------------------------------------
213 phyloBase = initPhylo docs lst conf
214 --------------------------------------
216 ---------------------------
217 -- | Frequent Item Set | --
218 ---------------------------
221 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
222 filterClique :: Bool -> Int -> (Int -> [Clustering] -> [Clustering]) -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
223 filterClique keep thr f m = case keep of
224 False -> map (\l -> f thr l) m
225 True -> map (\l -> keepFilled (f) thr l) m
228 -- To filter Fis with small Support
229 filterCliqueBySupport :: Int -> [Clustering] -> [Clustering]
230 filterCliqueBySupport thr l = filter (\clq -> (clq ^. clustering_support ) >= thr) l
233 -- To filter Fis with small Clique size
234 filterCliqueBySize :: Int -> [Clustering] -> [Clustering]
235 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. clustering_roots) >= thr) l
238 -- To filter nested Fis
239 filterCliqueByNested :: Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
240 filterCliqueByNested m =
242 foldl (\mem f -> if (any (\f' -> isNested (f' ^. clustering_roots) (f ^. clustering_roots)) mem)
245 let fMax = filter (\f' -> not $ isNested (f ^. clustering_roots) (f' ^. clustering_roots)) mem
246 in fMax ++ [f] ) [] l)
248 clq' = clq `using` parList rdeepseq
249 in fromList $ zip (keys m) clq'
252 -- | To transform a time map of docs into a time map of Fis with some filters
253 toSeriesOfClustering :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [Clustering]
254 toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
255 Fis s s' -> -- traceFis "Filtered Fis"
257 {- \$ traceFis "Filtered by clique size" -}
258 $ filterClique True s' (filterCliqueBySize)
259 {- \$ traceFis "Filtered by support" -}
260 $ filterClique True s (filterCliqueBySupport)
261 {- \$ traceFis "Unfiltered Fis" -}
263 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
266 --------------------------------------
267 seriesOfClustering :: Map (Date,Date) [Clustering]
268 seriesOfClustering = case (clique $ getConfig phylo) of
270 let fis = map (\(prd,docs) ->
271 case (corpusParser $ getConfig phylo) of
272 Csv' _ -> let lst = toList
273 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
274 in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
275 _ -> let lst = toList
276 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
277 in (prd, map (\f -> Clustering (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
280 fis' = fis `using` parList rdeepseq
282 MaxClique _ thr filterType ->
283 let mcl = map (\(prd,docs) ->
285 $ foldl sumCooc empty
287 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
288 in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
290 mcl' = mcl `using` parList rdeepseq
292 --------------------------------------
294 -- dev viz graph maxClique getMaxClique
302 -- To transform the docs into a time map of coocurency matrix
303 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
304 docsToTimeScaleCooc docs fdt =
305 let mCooc = fromListWith sumCooc
306 $ map (\(_d,l) -> (_d, listToMatrix l))
307 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
309 $ map (\t -> (t,empty))
310 $ toTimeScale (map date docs) 1
311 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
312 $ unionWith sumCooc mCooc mCooc'
315 -----------------------
316 -- | to Phylo Base | --
317 -----------------------
319 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
320 groupDocsByPeriodRec f prds docs acc =
321 if ((null prds) || (null docs))
324 let prd = head' "groupBy" prds
325 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
326 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
329 -- To group a list of Documents by fixed periods
330 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
331 groupDocsByPeriod' f pds docs =
332 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
333 periods = map (inPeriode f docs') pds
334 periods' = periods `using` parList rdeepseq
335 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
336 $ fromList $ zip pds periods'
338 --------------------------------------
339 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
340 inPeriode f' h (start,end) =
341 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
345 -- To group a list of Documents by fixed periods
346 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
347 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
348 groupDocsByPeriod f pds es =
349 let periods = map (inPeriode f es) pds
350 periods' = periods `using` parList rdeepseq
352 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
353 $ fromList $ zip pds periods'
355 --------------------------------------
356 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
357 inPeriode f' h (start,end) =
358 fst $ partition (\d -> f' d >= start && f' d <= end) h
359 --------------------------------------
362 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
363 docsToTermFreq docs fdt =
364 let nbDocs = fromIntegral $ length docs
365 freqs = map (/(nbDocs))
367 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
368 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
369 sumFreqs = sum $ elems freqs
370 in map (/sumFreqs) freqs
372 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
373 docsToLastTermFreq n docs fdt =
374 let last = take n $ reverse $ sort $ map date docs
375 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
376 freqs = map (/(nbDocs))
378 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
379 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
380 sumFreqs = sum $ elems freqs
381 in map (/sumFreqs) freqs
384 -- To count the number of docs by unit of time
385 docsToTimeScaleNb :: [Document] -> Map Date Double
386 docsToTimeScaleNb docs =
387 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
388 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
389 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
390 $ unionWith (+) time docs'
393 initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
394 initPhyloScales lvlMax pId =
395 fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
399 -- Init the basic elements of a Phylo
401 initPhylo :: [Document] -> TermList -> PhyloConfig -> Phylo
402 initPhylo docs lst conf =
403 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
404 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
405 params = defaultPhyloParam { _phyloParam_config = conf }
406 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
407 in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
410 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
411 (docsToTimeScaleNb docs)
412 (docsToTermFreq docs (foundations ^. foundations_roots))
413 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
416 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)