2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
18 import Gargantext.Prelude
19 import Gargantext.Core.Viz.AdaptativePhylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Core.Text.Context (TermList)
24 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Methods.Distances (Distance(Conditional))
27 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
30 import Control.DeepSeq (NFData)
31 import Control.Parallel.Strategies (parList, rdeepseq, using)
32 import Debug.Trace (trace)
33 import Control.Lens hiding (Level)
35 import qualified Data.Vector as Vector
36 import qualified Data.Set as Set
44 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
45 | PhyloN { _phylo'_phylo1 :: Phylo}
48 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
49 toPhylo' (PhyloN phylo) = toPhylo'
50 toPhylo' (PhyloBase phylo) = toPhylo
54 toPhylo :: Phylo -> Phylo
55 toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
56 $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
57 if (phyloLevel $ getConfig phyloStep) > 1
58 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
61 --------------------------------------
62 phyloAncestors :: Phylo
64 if (findAncestors $ getConfig phyloStep)
67 --------------------------------------
69 phylo1 = toPhylo1 phyloStep
71 --------------------------------------
79 toGroupsProxi :: Level -> Phylo -> Phylo
80 toGroupsProxi lvl phylo =
81 let proximity = phyloProximity $ getConfig phylo
82 groupsProxi = foldlWithKey (\acc pId pds ->
83 -- 1) process period by period
84 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
86 $ view ( phylo_periodLevels
87 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
88 . phylo_levelGroups ) pds
89 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
90 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
91 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
92 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
93 -- 2) compute the pairs in parallel
94 pairs = map (\(id,ngrams) ->
95 map (\(id',ngrams') ->
96 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
97 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
98 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
99 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
101 pairs' = pairs `using` parList rdeepseq
102 in acc ++ (concat pairs')
103 ) [] $ phylo ^. phylo_periods
104 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
107 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
108 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
109 $ over ( phylo_periods
113 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
115 let pId = phyloLvl ^. phylo_levelPeriod
118 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
119 groups ++ [ (((pId,lvl),length groups)
120 , f obj pId lvl (length groups)
121 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
128 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
129 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
130 (fis ^. phyloClique_support)
131 (fis ^. phyloClique_nodes)
132 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
133 (1,[0]) -- branchid (lvl,[path in the branching tree])
134 (fromList [("breaks",[0]),("seaLevels",[0])])
138 toPhylo1 :: Phylo -> Phylo
139 toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
140 Constante start gap -> constanteTemporalMatching start gap phyloStep
141 Adaptative steps -> adaptativeTemporalMatching steps phyloStep
143 -----------------------
144 -- | To Phylo Step | --
145 -----------------------
148 -- To build the first phylo step from docs and terms
149 toPhyloStep :: [Document] -> TermList -> Config -> Phylo
150 toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
151 Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique phyloBase
152 Adaptative _ -> toGroupsProxi 1 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
154 --------------------------------------
155 phyloClique :: Map (Date,Date) [PhyloClique]
156 phyloClique = toPhyloClique phyloBase docs'
157 --------------------------------------
158 docs' :: Map (Date,Date) [Document]
159 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
160 --------------------------------------
162 phyloBase = toPhyloBase docs lst conf
163 --------------------------------------
165 ---------------------------
166 -- | Frequent Item Set | --
167 ---------------------------
170 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
171 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
172 filterClique keep thr f m = case keep of
173 False -> map (\l -> f thr l) m
174 True -> map (\l -> keepFilled (f) thr l) m
177 -- To filter Fis with small Support
178 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
179 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
182 -- To filter Fis with small Clique size
183 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
184 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
187 -- To filter nested Fis
188 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
189 filterCliqueByNested m =
191 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
194 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
195 in fMax ++ [f] ) [] l)
197 clq' = clq `using` parList rdeepseq
198 in fromList $ zip (keys m) clq'
201 -- | To transform a time map of docs into a time map of Fis with some filters
202 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
203 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
204 Fis s s' -> -- traceFis "Filtered Fis"
206 {- \$ traceFis "Filtered by clique size" -}
207 $ filterClique True s' (filterCliqueBySize)
208 {- \$ traceFis "Filtered by support" -}
209 $ filterClique True s (filterCliqueBySupport)
210 {- \$ traceFis "Unfiltered Fis" -}
212 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
215 --------------------------------------
216 phyloClique :: Map (Date,Date) [PhyloClique]
217 phyloClique = case (clique $ getConfig phylo) of
219 let fis = map (\(prd,docs) ->
220 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
221 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
223 fis' = fis `using` parList rdeepseq
225 MaxClique _ thr filterType ->
226 let mcl = map (\(prd,docs) ->
228 $ foldl sumCooc empty
230 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
231 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques filterType Conditional thr cooc))
233 mcl' = mcl `using` parList rdeepseq
235 --------------------------------------
237 -- dev viz graph maxClique getMaxClique
245 -- To transform the docs into a time map of coocurency matrix
246 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
247 docsToTimeScaleCooc docs fdt =
248 let mCooc = fromListWith sumCooc
249 $ map (\(_d,l) -> (_d, listToMatrix l))
250 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
252 $ map (\t -> (t,empty))
253 $ toTimeScale (map date docs) 1
254 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
255 $ unionWith sumCooc mCooc mCooc'
258 -----------------------
259 -- | to Phylo Base | --
260 -----------------------
262 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
263 groupDocsByPeriodRec f prds docs acc =
264 if ((null prds) || (null docs))
267 let prd = head' "groupBy" prds
268 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
269 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
272 -- To group a list of Documents by fixed periods
273 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
274 groupDocsByPeriod' f pds docs =
275 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
276 periods = map (inPeriode f docs') pds
277 periods' = periods `using` parList rdeepseq
278 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
279 $ fromList $ zip pds periods'
281 --------------------------------------
282 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
283 inPeriode f' h (start,end) =
284 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
288 -- To group a list of Documents by fixed periods
289 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
290 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
291 groupDocsByPeriod f pds es =
292 let periods = map (inPeriode f es) pds
293 periods' = periods `using` parList rdeepseq
295 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
296 $ fromList $ zip pds periods'
298 --------------------------------------
299 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
300 inPeriode f' h (start,end) =
301 fst $ partition (\d -> f' d >= start && f' d <= end) h
302 --------------------------------------
305 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
306 docsToTermFreq docs fdt =
307 let nbDocs = fromIntegral $ length docs
308 freqs = map (/(nbDocs))
310 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
311 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
312 sumFreqs = sum $ elems freqs
313 in map (/sumFreqs) freqs
315 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
316 docsToLastTermFreq n docs fdt =
317 let last = take n $ reverse $ sort $ map date docs
318 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
319 freqs = map (/(nbDocs))
321 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
322 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
323 sumFreqs = sum $ elems freqs
324 in map (/sumFreqs) freqs
327 -- To count the number of docs by unit of time
328 docsToTimeScaleNb :: [Document] -> Map Date Double
329 docsToTimeScaleNb docs =
330 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
331 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
332 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
333 $ unionWith (+) time docs'
336 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
337 initPhyloLevels lvlMax pId =
338 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
341 -- To init the basic elements of a Phylo
342 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
343 toPhyloBase docs lst conf =
344 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
345 params = defaultPhyloParam { _phyloParam_config = conf }
346 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
347 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
349 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
350 (docsToTimeScaleNb docs)
351 (docsToTermFreq docs (foundations ^. foundations_roots))
352 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
356 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)