2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
18 import Gargantext.Prelude
19 import Gargantext.Core.Viz.AdaptativePhylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Core.Text.Context (TermList)
24 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Core.Viz.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Viz.Graph.Distances (Distance(Conditional))
27 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
29 import Control.DeepSeq (NFData)
30 import Control.Parallel.Strategies (parList, rdeepseq, using)
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.Set as Set
43 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
44 | PhyloN { _phylo'_phylo1 :: Phylo}
47 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
48 toPhylo' (PhyloN phylo) = toPhylo'
49 toPhylo' (PhyloBase phylo) = toPhylo
53 toPhylo :: [Document] -> TermList -> Config -> Phylo
54 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
55 $ traceToPhylo (phyloLevel conf) $
56 if (phyloLevel conf) > 1
57 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel conf)]
60 --------------------------------------
61 phyloAncestors :: Phylo
63 if (findAncestors conf)
66 --------------------------------------
68 phylo1 = toPhylo1 docs phyloBase
70 --------------------------------------
72 phyloBase = toPhyloBase docs lst conf
74 --------------------------------------
82 toGroupsProxi :: Level -> Phylo -> Phylo
83 toGroupsProxi lvl phylo =
84 let proximity = phyloProximity $ getConfig phylo
85 groupsProxi = foldlWithKey (\acc pId pds ->
86 -- 1) process period by period
87 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
89 $ view ( phylo_periodLevels
90 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
91 . phylo_levelGroups ) pds
92 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
93 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
94 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
95 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
96 -- 2) compute the pairs in parallel
97 pairs = map (\(id,ngrams) ->
98 map (\(id',ngrams') ->
99 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
100 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
101 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
102 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
104 pairs' = pairs `using` parList rdeepseq
105 in acc ++ (concat pairs')
106 ) [] $ phylo ^. phylo_periods
107 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
110 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
111 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
112 $ over ( phylo_periods
116 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
118 let pId = phyloLvl ^. phylo_levelPeriod
121 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
122 groups ++ [ (((pId,lvl),length groups)
123 , f obj pId lvl (length groups)
124 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
131 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
132 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
133 (fis ^. phyloClique_support)
134 (fis ^. phyloClique_nodes)
135 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
136 (1,[0]) -- branchid (lvl,[path in the branching tree])
137 (fromList [("breaks",[0]),("seaLevels",[0])])
141 toPhylo1 :: [Document] -> Phylo -> Phylo
142 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
143 Constante start gap -> constanteTemporalMatching start gap
144 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
145 Adaptative steps -> adaptativeTemporalMatching steps
147 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
149 --------------------------------------
150 phyloClique :: Map (Date,Date) [PhyloClique]
151 phyloClique = toPhyloClique phyloBase docs'
152 --------------------------------------
153 docs' :: Map (Date,Date) [Document]
154 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
155 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
156 --------------------------------------
159 ---------------------------
160 -- | Frequent Item Set | --
161 ---------------------------
164 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
165 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
166 filterClique keep thr f m = case keep of
167 False -> map (\l -> f thr l) m
168 True -> map (\l -> keepFilled (f) thr l) m
171 -- To filter Fis with small Support
172 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
173 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
176 -- To filter Fis with small Clique size
177 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
178 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
181 -- To filter nested Fis
182 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
183 filterCliqueByNested m =
185 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
188 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
189 in fMax ++ [f] ) [] l)
191 clq' = clq `using` parList rdeepseq
192 in fromList $ zip (keys m) clq'
195 -- | To transform a time map of docs into a time map of Fis with some filters
196 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
197 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
198 Fis s s' -> -- traceFis "Filtered Fis"
200 {- \$ traceFis "Filtered by clique size" -}
201 $ filterClique True s' (filterCliqueBySize)
202 {- \$ traceFis "Filtered by support" -}
203 $ filterClique True s (filterCliqueBySupport)
204 {- \$ traceFis "Unfiltered Fis" -}
206 MaxClique s -> filterClique True s (filterCliqueBySize)
209 --------------------------------------
210 phyloClique :: Map (Date,Date) [PhyloClique]
211 phyloClique = case (clique $ getConfig phylo) of
213 let fis = map (\(prd,docs) ->
214 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
215 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
217 fis' = fis `using` parList rdeepseq
220 let mcl = map (\(prd,docs) ->
222 $ foldl sumCooc empty
224 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
225 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.001 cooc))
227 mcl' = mcl `using` parList rdeepseq
229 --------------------------------------
231 -- dev viz graph maxClique getMaxClique
239 -- To transform the docs into a time map of coocurency matrix
240 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
241 docsToTimeScaleCooc docs fdt =
242 let mCooc = fromListWith sumCooc
243 $ map (\(_d,l) -> (_d, listToMatrix l))
244 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
246 $ map (\t -> (t,empty))
247 $ toTimeScale (map date docs) 1
248 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
249 $ unionWith sumCooc mCooc mCooc'
252 -----------------------
253 -- | to Phylo Base | --
254 -----------------------
256 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
257 groupDocsByPeriodRec f prds docs acc =
258 if ((null prds) || (null docs))
261 let prd = head' "groupBy" prds
262 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
263 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
266 -- To group a list of Documents by fixed periods
267 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
268 groupDocsByPeriod' f pds docs =
269 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
270 periods = map (inPeriode f docs') pds
271 periods' = periods `using` parList rdeepseq
272 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
273 $ fromList $ zip pds periods'
275 --------------------------------------
276 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
277 inPeriode f' h (start,end) =
278 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
282 -- To group a list of Documents by fixed periods
283 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
284 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
285 groupDocsByPeriod f pds es =
286 let periods = map (inPeriode f es) pds
287 periods' = periods `using` parList rdeepseq
289 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
290 $ fromList $ zip pds periods'
292 --------------------------------------
293 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
294 inPeriode f' h (start,end) =
295 fst $ partition (\d -> f' d >= start && f' d <= end) h
296 --------------------------------------
299 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
300 docsToTermFreq docs fdt =
301 let nbDocs = fromIntegral $ length docs
302 freqs = map (/(nbDocs))
304 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
305 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
306 sumFreqs = sum $ elems freqs
307 in map (/sumFreqs) freqs
309 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
310 docsToLastTermFreq n docs fdt =
311 let last = take n $ reverse $ sort $ map date docs
312 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
313 freqs = map (/(nbDocs))
315 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
316 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
317 sumFreqs = sum $ elems freqs
318 in map (/sumFreqs) freqs
321 -- To count the number of docs by unit of time
322 docsToTimeScaleNb :: [Document] -> Map Date Double
323 docsToTimeScaleNb docs =
324 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
325 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
326 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
327 $ unionWith (+) time docs'
330 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
331 initPhyloLevels lvlMax pId =
332 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
335 -- To init the basic elements of a Phylo
336 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
337 toPhyloBase docs lst conf =
338 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
339 params = defaultPhyloParam { _phyloParam_config = conf }
340 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
341 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
343 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
344 (docsToTimeScaleNb docs)
345 (docsToTermFreq docs (foundations ^. foundations_roots))
346 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
350 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)