2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13 import Control.DeepSeq (NFData)
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
17 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
18 import Data.Text (Text)
19 import Data.Vector (Vector)
20 import Debug.Trace (trace)
22 import Gargantext.Core.Methods.Distances (Distance(Conditional))
23 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
24 import Gargantext.Core.Text.Context (TermList)
25 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
26 import Gargantext.Core.Viz.Phylo
27 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
28 import Gargantext.Core.Viz.Phylo.PhyloTools
29 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
30 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
31 import Gargantext.Prelude
33 import qualified Data.Set as Set
34 import qualified Data.Vector as Vector
42 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
43 | PhyloN { _phylo'_phylo1 :: Phylo}
46 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
47 toPhylo' (PhyloN phylo) = toPhylo'
48 toPhylo' (PhyloBase phylo) = toPhylo
52 toPhylo :: Phylo -> Phylo
53 toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
54 $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
55 if (phyloLevel $ getConfig phyloStep) > 1
56 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
59 --------------------------------------
60 phyloAncestors :: Phylo
62 if (findAncestors $ getConfig phyloStep)
65 --------------------------------------
67 phylo1 = toPhylo1 phyloStep
68 --------------------------------------
75 toGroupsProxi :: Level -> Phylo -> Phylo
76 toGroupsProxi lvl phylo =
77 let proximity = phyloProximity $ getConfig phylo
78 groupsProxi = foldlWithKey (\acc pId pds ->
79 -- 1) process period by period
80 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
82 $ view ( phylo_periodLevels
83 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
84 . phylo_levelGroups ) pds
85 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
86 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
87 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
88 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
89 -- 2) compute the pairs in parallel
90 pairs = map (\(id,ngrams) ->
91 map (\(id',ngrams') ->
92 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
93 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
94 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
95 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
97 pairs' = pairs `using` parList rdeepseq
98 in acc ++ (concat pairs')
99 ) [] $ phylo ^. phylo_periods
100 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
103 appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
104 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
105 $ over ( phylo_periods
109 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
111 let pId = phyloLvl ^. phylo_levelPeriod
112 pId' = phyloLvl ^. phylo_levelPeriod'
115 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
116 groups ++ [ (((pId,lvl),length groups)
117 , f obj pId pId' lvl (length groups)
118 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
125 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup
126 cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
127 (fis ^. phyloClique_support)
128 (fis ^. phyloClique_weight)
129 (fis ^. phyloClique_sources)
130 (fis ^. phyloClique_nodes)
131 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
132 (1,[0]) -- branchid (lvl,[path in the branching tree])
133 (fromList [("breaks",[0]),("seaLevels",[0])])
137 toPhylo1 :: Phylo -> Phylo
138 toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
139 Constante start gap -> constanteTemporalMatching start gap phyloStep
140 Adaptative steps -> adaptativeTemporalMatching steps phyloStep
142 -----------------------
143 -- | To Phylo Step | --
144 -----------------------
147 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
148 indexDates' m = map (\docs ->
149 let ds = map (\d -> date' d) docs
159 -- To build the first phylo step from docs and terms
160 -- QL: backend entre phyloBase et phyloClique
161 toPhyloStep :: [Document] -> TermList -> PhyloConfig -> Phylo
162 toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
163 Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
164 Adaptative _ -> toGroupsProxi 1
165 $ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
167 --------------------------------------
168 phyloClique :: Map (Date,Date) [PhyloClique]
169 phyloClique = toPhyloClique phyloBase docs'
170 --------------------------------------
171 docs' :: Map (Date,Date) [Document]
172 -- QL: Time Consuming here
173 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
174 --------------------------------------
176 phyloBase = toPhyloBase docs lst conf
177 --------------------------------------
179 ---------------------------
180 -- | Frequent Item Set | --
181 ---------------------------
184 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
185 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
186 filterClique keep thr f m = case keep of
187 False -> map (\l -> f thr l) m
188 True -> map (\l -> keepFilled (f) thr l) m
191 -- To filter Fis with small Support
192 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
193 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
196 -- To filter Fis with small Clique size
197 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
198 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
201 -- To filter nested Fis
202 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
203 filterCliqueByNested m =
205 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
208 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
209 in fMax ++ [f] ) [] l)
211 clq' = clq `using` parList rdeepseq
212 in fromList $ zip (keys m) clq'
215 -- | To transform a time map of docs into a time map of Fis with some filters
216 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
217 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
218 Fis s s' -> -- traceFis "Filtered Fis"
220 {- \$ traceFis "Filtered by clique size" -}
221 $ filterClique True s' (filterCliqueBySize)
222 {- \$ traceFis "Filtered by support" -}
223 $ filterClique True s (filterCliqueBySupport)
224 {- \$ traceFis "Unfiltered Fis" -}
226 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
229 --------------------------------------
230 phyloClique :: Map (Date,Date) [PhyloClique]
231 phyloClique = case (clique $ getConfig phylo) of
233 let fis = map (\(prd,docs) ->
234 case (corpusParser $ getConfig phylo) of
235 Csv' _ -> let lst = toList
236 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
237 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
238 _ -> let lst = toList
239 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
240 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
243 fis' = fis `using` parList rdeepseq
245 MaxClique _ thr filterType ->
246 let mcl = map (\(prd,docs) ->
248 $ foldl sumCooc empty
250 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
251 in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
253 mcl' = mcl `using` parList rdeepseq
255 --------------------------------------
257 -- dev viz graph maxClique getMaxClique
265 -- To transform the docs into a time map of coocurency matrix
266 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
267 docsToTimeScaleCooc docs fdt =
268 let mCooc = fromListWith sumCooc
269 $ map (\(_d,l) -> (_d, listToMatrix l))
270 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
272 $ map (\t -> (t,empty))
273 $ toTimeScale (map date docs) 1
274 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
275 $ unionWith sumCooc mCooc mCooc'
278 -----------------------
279 -- | to Phylo Base | --
280 -----------------------
282 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
283 groupDocsByPeriodRec f prds docs acc =
284 if ((null prds) || (null docs))
287 let prd = head' "groupBy" prds
288 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
289 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
292 -- To group a list of Documents by fixed periods
293 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
294 groupDocsByPeriod' f pds docs =
295 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
296 periods = map (inPeriode f docs') pds
297 periods' = periods `using` parList rdeepseq
298 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
299 $ fromList $ zip pds periods'
301 --------------------------------------
302 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
303 inPeriode f' h (start,end) =
304 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
308 -- To group a list of Documents by fixed periods
309 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
310 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
311 groupDocsByPeriod f pds es =
312 let periods = map (inPeriode f es) pds
313 periods' = periods `using` parList rdeepseq
315 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
316 $ fromList $ zip pds periods'
318 --------------------------------------
319 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
320 inPeriode f' h (start,end) =
321 fst $ partition (\d -> f' d >= start && f' d <= end) h
322 --------------------------------------
325 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
326 docsToTermFreq docs fdt =
327 let nbDocs = fromIntegral $ length docs
328 freqs = map (/(nbDocs))
330 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
331 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
332 sumFreqs = sum $ elems freqs
333 in map (/sumFreqs) freqs
335 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
336 docsToLastTermFreq n docs fdt =
337 let last = take n $ reverse $ sort $ map date docs
338 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
339 freqs = map (/(nbDocs))
341 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
342 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
343 sumFreqs = sum $ elems freqs
344 in map (/sumFreqs) freqs
347 -- To count the number of docs by unit of time
348 docsToTimeScaleNb :: [Document] -> Map Date Double
349 docsToTimeScaleNb docs =
350 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
351 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
352 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
353 $ unionWith (+) time docs'
356 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
357 initPhyloLevels lvlMax pId =
358 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]
362 -- To init the basic elements of a Phylo
363 toPhyloBase :: [Document] -> TermList -> PhyloConfig -> Phylo
364 toPhyloBase docs lst conf =
365 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
366 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
367 params = defaultPhyloParam { _phyloParam_config = conf }
368 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
369 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
372 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
373 (docsToTimeScaleNb docs)
374 (docsToTermFreq docs (foundations ^. foundations_roots))
375 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
379 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)