2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
20 import Data.Text (Text)
21 import Data.Vector (Vector)
22 import Debug.Trace (trace)
24 import Gargantext.Core.Methods.Distances (Distance(Conditional))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Text.Context (TermList)
27 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
28 import Gargantext.Core.Viz.Phylo
29 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
30 import Gargantext.Core.Viz.Phylo.PhyloTools
31 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
32 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
33 import Gargantext.Prelude
35 import qualified Data.Set as Set
36 import qualified Data.Vector as Vector
44 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
45 | PhyloN { _phylo'_phylo1 :: Phylo}
48 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
49 toPhylo' (PhyloN phylo) = toPhylo'
50 toPhylo' (PhyloBase phylo) = toPhylo
54 toPhylo :: Phylo -> Phylo
55 toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
56 $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
57 if (phyloLevel $ getConfig phyloStep) > 1
58 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
61 --------------------------------------
62 phyloAncestors :: Phylo
64 if (findAncestors $ getConfig phyloStep)
67 --------------------------------------
69 phylo1 = toPhylo1 phyloStep
70 --------------------------------------
77 toGroupsProxi :: Level -> Phylo -> Phylo
78 toGroupsProxi lvl phylo =
79 let proximity = phyloProximity $ getConfig phylo
80 groupsProxi = foldlWithKey (\acc pId pds ->
81 -- 1) process period by period
82 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
84 $ view ( phylo_periodLevels
85 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
86 . phylo_levelGroups ) pds
87 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
88 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
89 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
90 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
91 -- 2) compute the pairs in parallel
92 pairs = map (\(id,ngrams) ->
93 map (\(id',ngrams') ->
94 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
95 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
96 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
97 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
99 pairs' = pairs `using` parList rdeepseq
100 in acc ++ (concat pairs')
101 ) [] $ phylo ^. phylo_periods
102 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
105 appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
106 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
107 $ over ( phylo_periods
111 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
113 let pId = phyloLvl ^. phylo_levelPeriod
114 pId' = phyloLvl ^. phylo_levelPeriod'
117 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
118 groups ++ [ (((pId,lvl),length groups)
119 , f obj pId pId' lvl (length groups)
120 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
127 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup
128 cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
129 (fis ^. phyloClique_support)
130 (fis ^. phyloClique_weight)
131 (fis ^. phyloClique_sources)
132 (fis ^. phyloClique_nodes)
133 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
134 (1,[0]) -- branchid (lvl,[path in the branching tree])
135 (fromList [("breaks",[0]),("seaLevels",[0])])
139 toPhylo1 :: Phylo -> Phylo
140 toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
141 Constante start gap -> constanteTemporalMatching start gap phyloStep
142 Adaptative steps -> adaptativeTemporalMatching steps phyloStep
144 -----------------------
145 -- | To Phylo Step | --
146 -----------------------
149 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
150 indexDates' m = map (\docs ->
151 let ds = map (\d -> date' d) docs
161 -- To build the first phylo step from docs and terms
162 -- QL: backend entre phyloBase et phyloClique
163 toPhyloStep :: [Document] -> TermList -> Config -> Phylo
164 toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
165 Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
166 Adaptative _ -> toGroupsProxi 1
167 $ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
169 --------------------------------------
170 phyloClique :: Map (Date,Date) [PhyloClique]
171 phyloClique = toPhyloClique phyloBase docs'
172 --------------------------------------
173 docs' :: Map (Date,Date) [Document]
174 -- QL: Time Consuming here
175 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
176 --------------------------------------
178 phyloBase = toPhyloBase docs lst conf
179 --------------------------------------
181 ---------------------------
182 -- | Frequent Item Set | --
183 ---------------------------
186 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
187 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
188 filterClique keep thr f m = case keep of
189 False -> map (\l -> f thr l) m
190 True -> map (\l -> keepFilled (f) thr l) m
193 -- To filter Fis with small Support
194 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
195 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
198 -- To filter Fis with small Clique size
199 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
200 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
203 -- To filter nested Fis
204 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
205 filterCliqueByNested m =
207 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
210 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
211 in fMax ++ [f] ) [] l)
213 clq' = clq `using` parList rdeepseq
214 in fromList $ zip (keys m) clq'
217 -- | To transform a time map of docs into a time map of Fis with some filters
218 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
219 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
220 Fis s s' -> -- traceFis "Filtered Fis"
222 {- \$ traceFis "Filtered by clique size" -}
223 $ filterClique True s' (filterCliqueBySize)
224 {- \$ traceFis "Filtered by support" -}
225 $ filterClique True s (filterCliqueBySupport)
226 {- \$ traceFis "Unfiltered Fis" -}
228 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
231 --------------------------------------
232 phyloClique :: Map (Date,Date) [PhyloClique]
233 phyloClique = case (clique $ getConfig phylo) of
235 let fis = map (\(prd,docs) ->
236 case (corpusParser $ getConfig phylo) of
237 Csv' _ -> let lst = toList
238 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
239 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
240 _ -> let lst = toList
241 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
242 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
245 fis' = fis `using` parList rdeepseq
247 MaxClique _ thr filterType ->
248 let mcl = map (\(prd,docs) ->
250 $ foldl sumCooc empty
252 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
253 in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
255 mcl' = mcl `using` parList rdeepseq
257 --------------------------------------
259 -- dev viz graph maxClique getMaxClique
267 -- To transform the docs into a time map of coocurency matrix
268 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
269 docsToTimeScaleCooc docs fdt =
270 let mCooc = fromListWith sumCooc
271 $ map (\(_d,l) -> (_d, listToMatrix l))
272 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
274 $ map (\t -> (t,empty))
275 $ toTimeScale (map date docs) 1
276 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
277 $ unionWith sumCooc mCooc mCooc'
280 -----------------------
281 -- | to Phylo Base | --
282 -----------------------
284 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
285 groupDocsByPeriodRec f prds docs acc =
286 if ((null prds) || (null docs))
289 let prd = head' "groupBy" prds
290 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
291 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
294 -- To group a list of Documents by fixed periods
295 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
296 groupDocsByPeriod' f pds docs =
297 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
298 periods = map (inPeriode f docs') pds
299 periods' = periods `using` parList rdeepseq
300 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
301 $ fromList $ zip pds periods'
303 --------------------------------------
304 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
305 inPeriode f' h (start,end) =
306 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
310 -- To group a list of Documents by fixed periods
311 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
312 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
313 groupDocsByPeriod f pds es =
314 let periods = map (inPeriode f es) pds
315 periods' = periods `using` parList rdeepseq
317 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
318 $ fromList $ zip pds periods'
320 --------------------------------------
321 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
322 inPeriode f' h (start,end) =
323 fst $ partition (\d -> f' d >= start && f' d <= end) h
324 --------------------------------------
327 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
328 docsToTermFreq docs fdt =
329 let nbDocs = fromIntegral $ length docs
330 freqs = map (/(nbDocs))
332 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
333 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
334 sumFreqs = sum $ elems freqs
335 in map (/sumFreqs) freqs
337 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
338 docsToLastTermFreq n docs fdt =
339 let last = take n $ reverse $ sort $ map date docs
340 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
341 freqs = map (/(nbDocs))
343 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
344 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
345 sumFreqs = sum $ elems freqs
346 in map (/sumFreqs) freqs
349 -- To count the number of docs by unit of time
350 docsToTimeScaleNb :: [Document] -> Map Date Double
351 docsToTimeScaleNb docs =
352 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
353 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
354 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
355 $ unionWith (+) time docs'
358 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
359 initPhyloLevels lvlMax pId =
360 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]
364 -- To init the basic elements of a Phylo
365 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
366 toPhyloBase docs lst conf =
367 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
368 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
369 params = defaultPhyloParam { _phyloParam_config = conf }
370 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
371 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
374 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
375 (docsToTimeScaleNb docs)
376 (docsToTermFreq docs (foundations ^. foundations_roots))
377 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
381 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)