2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
20 import Data.Text (Text)
21 import Data.Vector (Vector)
22 import Debug.Trace (trace)
24 import Gargantext.Core.Methods.Distances (Distance(Conditional))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Text.Context (TermList)
27 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
28 import Gargantext.Core.Viz.Phylo
29 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
30 import Gargantext.Core.Viz.Phylo.PhyloTools
31 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
32 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
33 import Gargantext.Prelude
35 import qualified Data.Set as Set
36 import qualified Data.Vector as Vector
44 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
45 | PhyloN { _phylo'_phylo1 :: Phylo}
48 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
49 toPhylo' (PhyloN phylo) = toPhylo'
50 toPhylo' (PhyloBase phylo) = toPhylo
54 toPhylo :: Phylo -> Phylo
55 toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
56 $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
57 if (phyloLevel $ getConfig phyloStep) > 1
58 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
61 --------------------------------------
62 phyloAncestors :: Phylo
64 if (findAncestors $ getConfig phyloStep)
67 --------------------------------------
69 phylo1 = toPhylo1 phyloStep
71 --------------------------------------
79 toGroupsProxi :: Level -> Phylo -> Phylo
80 toGroupsProxi lvl phylo =
81 let proximity = phyloProximity $ getConfig phylo
82 groupsProxi = foldlWithKey (\acc pId pds ->
83 -- 1) process period by period
84 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
86 $ view ( phylo_periodLevels
87 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
88 . phylo_levelGroups ) pds
89 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
90 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
91 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
92 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
93 -- 2) compute the pairs in parallel
94 pairs = map (\(id,ngrams) ->
95 map (\(id',ngrams') ->
96 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
97 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
98 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
99 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
101 pairs' = pairs `using` parList rdeepseq
102 in acc ++ (concat pairs')
103 ) [] $ phylo ^. phylo_periods
104 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
107 appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
108 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
109 $ over ( phylo_periods
113 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
115 let pId = phyloLvl ^. phylo_levelPeriod
116 pId' = phyloLvl ^. phylo_levelPeriod'
119 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
120 groups ++ [ (((pId,lvl),length groups)
121 , f obj pId pId' lvl (length groups)
122 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
129 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup
130 cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
131 (fis ^. phyloClique_support)
132 (fis ^. phyloClique_weight)
133 (fis ^. phyloClique_sources)
134 (fis ^. phyloClique_nodes)
135 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
136 (1,[0]) -- branchid (lvl,[path in the branching tree])
137 (fromList [("breaks",[0]),("seaLevels",[0])])
141 toPhylo1 :: Phylo -> Phylo
142 toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
143 Constante start gap -> constanteTemporalMatching start gap phyloStep
144 Adaptative steps -> adaptativeTemporalMatching steps phyloStep
146 -----------------------
147 -- | To Phylo Step | --
148 -----------------------
151 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
152 indexDates' m = map (\docs ->
153 let ds = map (\d -> date' d) docs
163 -- To build the first phylo step from docs and terms
164 -- QL: backend entre phyloBase et phyloClique
165 toPhyloStep :: [Document] -> TermList -> Config -> Phylo
166 toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
167 Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
168 Adaptative _ -> toGroupsProxi 1
169 $ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
171 --------------------------------------
172 phyloClique :: Map (Date,Date) [PhyloClique]
173 phyloClique = toPhyloClique phyloBase docs'
174 --------------------------------------
175 docs' :: Map (Date,Date) [Document]
176 -- QL: Time Consuming here
177 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
178 --------------------------------------
180 phyloBase = toPhyloBase docs lst conf
181 --------------------------------------
183 ---------------------------
184 -- | Frequent Item Set | --
185 ---------------------------
188 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
189 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
190 filterClique keep thr f m = case keep of
191 False -> map (\l -> f thr l) m
192 True -> map (\l -> keepFilled (f) thr l) m
195 -- To filter Fis with small Support
196 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
197 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
200 -- To filter Fis with small Clique size
201 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
202 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
205 -- To filter nested Fis
206 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
207 filterCliqueByNested m =
209 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
212 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
213 in fMax ++ [f] ) [] l)
215 clq' = clq `using` parList rdeepseq
216 in fromList $ zip (keys m) clq'
219 -- | To transform a time map of docs into a time map of Fis with some filters
220 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
221 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
222 Fis s s' -> -- traceFis "Filtered Fis"
224 {- \$ traceFis "Filtered by clique size" -}
225 $ filterClique True s' (filterCliqueBySize)
226 {- \$ traceFis "Filtered by support" -}
227 $ filterClique True s (filterCliqueBySupport)
228 {- \$ traceFis "Unfiltered Fis" -}
230 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
233 --------------------------------------
234 phyloClique :: Map (Date,Date) [PhyloClique]
235 phyloClique = case (clique $ getConfig phylo) of
237 let fis = map (\(prd,docs) ->
238 case (corpusParser $ getConfig phylo) of
239 Csv' _ -> let lst = toList
240 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
241 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
242 _ -> let lst = toList
243 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
244 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
247 fis' = fis `using` parList rdeepseq
249 MaxClique _ thr filterType ->
250 let mcl = map (\(prd,docs) ->
252 $ foldl sumCooc empty
254 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
255 in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
257 mcl' = mcl `using` parList rdeepseq
259 --------------------------------------
261 -- dev viz graph maxClique getMaxClique
269 -- To transform the docs into a time map of coocurency matrix
270 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
271 docsToTimeScaleCooc docs fdt =
272 let mCooc = fromListWith sumCooc
273 $ map (\(_d,l) -> (_d, listToMatrix l))
274 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
276 $ map (\t -> (t,empty))
277 $ toTimeScale (map date docs) 1
278 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
279 $ unionWith sumCooc mCooc mCooc'
282 -----------------------
283 -- | to Phylo Base | --
284 -----------------------
286 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
287 groupDocsByPeriodRec f prds docs acc =
288 if ((null prds) || (null docs))
291 let prd = head' "groupBy" prds
292 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
293 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
296 -- To group a list of Documents by fixed periods
297 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
298 groupDocsByPeriod' f pds docs =
299 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
300 periods = map (inPeriode f docs') pds
301 periods' = periods `using` parList rdeepseq
302 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
303 $ fromList $ zip pds periods'
305 --------------------------------------
306 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
307 inPeriode f' h (start,end) =
308 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
312 -- To group a list of Documents by fixed periods
313 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
314 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
315 groupDocsByPeriod f pds es =
316 let periods = map (inPeriode f es) pds
317 periods' = periods `using` parList rdeepseq
319 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
320 $ fromList $ zip pds periods'
322 --------------------------------------
323 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
324 inPeriode f' h (start,end) =
325 fst $ partition (\d -> f' d >= start && f' d <= end) h
326 --------------------------------------
329 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
330 docsToTermFreq docs fdt =
331 let nbDocs = fromIntegral $ length docs
332 freqs = map (/(nbDocs))
334 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
335 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
336 sumFreqs = sum $ elems freqs
337 in map (/sumFreqs) freqs
339 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
340 docsToLastTermFreq n docs fdt =
341 let last = take n $ reverse $ sort $ map date docs
342 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
343 freqs = map (/(nbDocs))
345 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
346 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
347 sumFreqs = sum $ elems freqs
348 in map (/sumFreqs) freqs
351 -- To count the number of docs by unit of time
352 docsToTimeScaleNb :: [Document] -> Map Date Double
353 docsToTimeScaleNb docs =
354 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
355 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
356 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
357 $ unionWith (+) time docs'
360 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
361 initPhyloLevels lvlMax pId =
362 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]
366 -- To init the basic elements of a Phylo
367 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
368 toPhyloBase docs lst conf =
369 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
370 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
371 params = defaultPhyloParam { _phyloParam_config = conf }
372 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
373 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
376 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
377 (docsToTimeScaleNb docs)
378 (docsToTermFreq docs (foundations ^. foundations_roots))
379 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
383 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)