2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
21 import Data.Text (Text)
22 import Data.Vector (Vector)
23 import Debug.Trace (trace)
24 import Prelude (floor)
26 import Gargantext.Core.Methods.Distances (Distance(Conditional))
27 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
28 import Gargantext.Core.Text.Context (TermList)
29 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
30 import Gargantext.Core.Viz.Phylo
31 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
32 import Gargantext.Core.Viz.Phylo.PhyloTools
33 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
34 import Gargantext.Core.Viz.Phylo.TemporalMatching (temporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toSimilarity)
35 import Gargantext.Prelude
37 import qualified Data.Set as Set
38 import qualified Data.List as List
39 import qualified Data.Vector as Vector
47 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
48 | PhyloN { _phylo'_flatPhylo :: Phylo}
51 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
52 toPhylo' (PhyloN phylo) = toPhylo'
53 toPhylo' (PhyloBase phylo) = toPhylo
56 -- TODO an adaptative synchronic clustering with a slider
58 toPhylo :: Phylo -> Phylo
59 toPhylo phylowithoutLink = trace ("# flatPhylo groups " <> show(length $ getGroupsFromScale 1 flatPhylo))
60 $ traceToPhylo (phyloScale $ getConfig phylowithoutLink) $
61 if (phyloScale $ getConfig phylowithoutLink) > 1
62 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloScale $ getConfig phylowithoutLink)]
65 --------------------------------------
66 phyloAncestors :: Phylo
68 if (findAncestors $ getConfig phylowithoutLink)
69 then toHorizon flatPhylo
71 --------------------------------------
73 flatPhylo = addTemporalLinksToPhylo phylowithoutLink
74 --------------------------------------
77 -----------------------------
78 -- | Create a flat Phylo | --
79 -----------------------------
83 -- create a square ladder
85 squareLadder :: [Double] -> [Double]
86 squareLadder ladder = List.map (\x -> x * x) ladder
90 -- create an adaptative diachronic 'sea elevation' ladder
92 adaptDiachronicLadder :: Double -> Set Double -> Set Double -> [Double]
93 adaptDiachronicLadder curr similarities ladder =
94 if curr <= 0 || Set.null similarities
95 then Set.toList ladder
97 let idx = ((Set.size similarities) `div` (floor curr)) - 1
98 thr = Set.elemAt idx similarities
99 -- we use a sliding methods 1/10, then 1/9, then ... 1/2
100 in adaptDiachronicLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
104 -- create a constante diachronic 'sea elevation' ladder
106 constDiachronicLadder :: Double -> Double -> Set Double -> [Double]
107 constDiachronicLadder curr step ladder =
109 then Set.toList ladder
110 else constDiachronicLadder (curr + step) step (Set.insert curr ladder)
114 -- process an initial scanning of the kinship links
116 scanSimilarity :: Scale -> Phylo -> Phylo
117 scanSimilarity lvl phylo =
118 let proximity = similarity $ getConfig phylo
119 scanning = foldlWithKey (\acc pId pds ->
120 -- 1) process period by period
121 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
123 $ view ( phylo_periodScales
124 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
125 . phylo_scaleGroups ) pds
126 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
127 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromScalePeriods lvl next phylo
128 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
129 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
130 -- 2) compute the pairs in parallel
131 pairs = map (\(id,ngrams) ->
132 map (\(id',ngrams') ->
133 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
134 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
135 in ((id,id'),toSimilarity nbDocs diago proximity ngrams ngrams' ngrams')
136 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
138 pairs' = pairs `using` parList rdeepseq
139 in acc ++ (concat pairs')
140 ) [] $ phylo ^. phylo_periods
141 in phylo & phylo_diaSimScan .~ Set.fromList (traceGroupsProxi $ map snd scanning)
145 appendGroups :: (a -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup) -> Scale -> Map (Date,Date) [a] -> Phylo -> Phylo
146 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
147 $ over ( phylo_periods
151 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_scaleScale)
153 let pId = phyloLvl ^. phylo_scalePeriod
154 pId' = phyloLvl ^. phylo_scalePeriodStr
157 & phylo_scaleGroups .~ (fromList $ foldl (\groups obj ->
158 groups ++ [ (((pId,lvl),length groups)
159 , f obj pId pId' lvl (length groups)
160 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
167 clusterToGroup :: Clustering -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup
168 clusterToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
169 (fis ^. clustering_support )
170 (fis ^. clustering_visWeighting)
171 (fis ^. clustering_visFiltering)
172 (fis ^. clustering_roots)
173 (ngramsToCooc (fis ^. clustering_roots) coocs)
174 (1,[0]) -- branchid (lvl,[path in the branching tree])
175 (fromList [("breaks",[0]),("seaLevels",[0])])
179 -- enhance the phylo with temporal links
181 addTemporalLinksToPhylo :: Phylo -> Phylo
182 addTemporalLinksToPhylo phylowithoutLink = case strategy of
183 Constante start gap -> temporalMatching (constDiachronicLadder start gap Set.empty) phylowithoutLink
184 Adaptative steps -> temporalMatching (squareLadder $ adaptDiachronicLadder steps (phylowithoutLink ^. phylo_diaSimScan) Set.empty) phylowithoutLink
186 strategy :: SeaElevation
187 strategy = getSeaElevation phylowithoutLink
189 -----------------------
190 -- | To Phylo Step | --
191 -----------------------
194 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
195 indexDates' m = map (\docs ->
196 let ds = map (\d -> date' d) docs
206 -- To build the first phylo step from docs and terms
207 -- QL: backend entre phyloBase et Clustering
208 -- tophylowithoutLink
209 toPhyloWithoutLink :: [Document] -> TermList -> PhyloConfig -> Phylo
210 toPhyloWithoutLink docs lst conf = case (getSeaElevation phyloBase) of
211 Constante _ _ -> appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
212 Adaptative _ -> scanSimilarity 1
213 $ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
215 --------------------------------------
216 seriesOfClustering :: Map (Date,Date) [Clustering]
217 seriesOfClustering = toSeriesOfClustering phyloBase docs'
218 --------------------------------------
219 docs' :: Map (Date,Date) [Document]
220 -- QL: Time Consuming here
221 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
222 --------------------------------------
224 phyloBase = initPhylo docs lst conf
225 --------------------------------------
227 ---------------------------
228 -- | Frequent Item Set | --
229 ---------------------------
232 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
233 filterClique :: Bool -> Int -> (Int -> [Clustering] -> [Clustering]) -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
234 filterClique keep thr f m = case keep of
235 False -> map (\l -> f thr l) m
236 True -> map (\l -> keepFilled (f) thr l) m
239 -- To filter Fis with small Support
240 filterCliqueBySupport :: Int -> [Clustering] -> [Clustering]
241 filterCliqueBySupport thr l = filter (\clq -> (clq ^. clustering_support ) >= thr) l
244 -- To filter Fis with small Clique size
245 filterCliqueBySize :: Int -> [Clustering] -> [Clustering]
246 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. clustering_roots) >= thr) l
249 -- To filter nested Fis
250 filterCliqueByNested :: Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
251 filterCliqueByNested m =
253 foldl (\mem f -> if (any (\f' -> isNested (f' ^. clustering_roots) (f ^. clustering_roots)) mem)
256 let fMax = filter (\f' -> not $ isNested (f ^. clustering_roots) (f' ^. clustering_roots)) mem
257 in fMax ++ [f] ) [] l)
259 clq' = clq `using` parList rdeepseq
260 in fromList $ zip (keys m) clq'
263 -- | To transform a time map of docs into a time map of Fis with some filters
264 toSeriesOfClustering :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [Clustering]
265 toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
266 Fis s s' -> -- traceFis "Filtered Fis"
268 {- \$ traceFis "Filtered by clique size" -}
269 $ filterClique True s' (filterCliqueBySize)
270 {- \$ traceFis "Filtered by support" -}
271 $ filterClique True s (filterCliqueBySupport)
272 {- \$ traceFis "Unfiltered Fis" -}
274 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
277 --------------------------------------
278 seriesOfClustering :: Map (Date,Date) [Clustering]
279 seriesOfClustering = case (clique $ getConfig phylo) of
281 let fis = map (\(prd,docs) ->
282 case (corpusParser $ getConfig phylo) of
283 Csv' _ -> let lst = toList
284 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
285 in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
286 _ -> let lst = toList
287 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
288 in (prd, map (\f -> Clustering (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
291 fis' = fis `using` parList rdeepseq
293 MaxClique _ thr filterType ->
294 let mcl = map (\(prd,docs) ->
296 $ foldl sumCooc empty
298 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
299 in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
301 mcl' = mcl `using` parList rdeepseq
303 --------------------------------------
305 -- dev viz graph maxClique getMaxClique
313 -- To transform the docs into a time map of coocurency matrix
314 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
315 docsToTimeScaleCooc docs fdt =
316 let mCooc = fromListWith sumCooc
317 $ map (\(_d,l) -> (_d, listToMatrix l))
318 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
320 $ map (\t -> (t,empty))
321 $ toTimeScale (map date docs) 1
322 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
323 $ unionWith sumCooc mCooc mCooc'
326 -----------------------
327 -- | to Phylo Base | --
328 -----------------------
330 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
331 groupDocsByPeriodRec f prds docs acc =
332 if ((null prds) || (null docs))
335 let prd = head' "groupBy" prds
336 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
337 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
340 -- To group a list of Documents by fixed periods
341 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
342 groupDocsByPeriod' f pds docs =
343 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
344 periods = map (inPeriode f docs') pds
345 periods' = periods `using` parList rdeepseq
346 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
347 $ fromList $ zip pds periods'
349 --------------------------------------
350 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
351 inPeriode f' h (start,end) =
352 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
356 -- To group a list of Documents by fixed periods
357 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
358 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
359 groupDocsByPeriod f pds es =
360 let periods = map (inPeriode f es) pds
361 periods' = periods `using` parList rdeepseq
363 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
364 $ fromList $ zip pds periods'
366 --------------------------------------
367 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
368 inPeriode f' h (start,end) =
369 fst $ partition (\d -> f' d >= start && f' d <= end) h
370 --------------------------------------
373 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
374 docsToTermFreq docs fdt =
375 let nbDocs = fromIntegral $ length docs
376 freqs = map (/(nbDocs))
378 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
379 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
380 sumFreqs = sum $ elems freqs
381 in map (/sumFreqs) freqs
383 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
384 docsToLastTermFreq n docs fdt =
385 let last = take n $ reverse $ sort $ map date docs
386 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
387 freqs = map (/(nbDocs))
389 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
390 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
391 sumFreqs = sum $ elems freqs
392 in map (/sumFreqs) freqs
395 -- To count the number of docs by unit of time
396 docsToTimeScaleNb :: [Document] -> Map Date Double
397 docsToTimeScaleNb docs =
398 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
399 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
400 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
401 $ unionWith (+) time docs'
404 initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
405 initPhyloScales lvlMax pId =
406 fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
410 -- Init the basic elements of a Phylo
412 initPhylo :: [Document] -> TermList -> PhyloConfig -> Phylo
413 initPhylo docs lst conf =
414 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
415 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
416 params = defaultPhyloParam { _phyloParam_config = conf }
417 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
418 in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
421 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
422 (docsToTimeScaleNb docs)
423 (docsToTermFreq docs (foundations ^. foundations_roots))
424 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
427 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)