2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.PhyloMaker where
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
20 import Data.Vector (Vector)
22 import Gargantext.Prelude
23 import Gargantext.Viz.AdaptativePhylo
24 import Gargantext.Viz.Phylo.PhyloTools
25 import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
26 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
27 import Gargantext.Text.Context (TermList)
28 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
29 import Gargantext.Viz.Graph.MaxClique (getMaxCliques)
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
45 toPhylo :: [Document] -> TermList -> Config -> Phylo
46 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
47 $ traceToPhylo (phyloLevel conf) $
48 if (phyloLevel conf) > 1
49 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
52 --------------------------------------
54 phylo1 = toPhylo1 docs phyloBase
55 --------------------------------------
57 phyloBase = toPhyloBase docs lst conf
58 --------------------------------------
66 toGroupsProxi :: Level -> Phylo -> Phylo
67 toGroupsProxi lvl phylo =
68 let proximity = phyloProximity $ getConfig phylo
69 groupsProxi = foldlWithKey (\acc pId pds ->
70 -- 1) process period by period
71 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
73 $ view ( phylo_periodLevels
74 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
75 . phylo_levelGroups ) pds
76 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
77 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
78 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
79 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
80 -- 2) compute the pairs in parallel
81 pairs = map (\(id,ngrams) ->
82 map (\(id',ngrams') ->
83 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
84 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
85 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
86 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
88 pairs' = pairs `using` parList rdeepseq
89 in acc ++ (concat pairs')
90 ) [] $ phylo ^. phylo_periods
91 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
94 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
95 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
96 $ over ( phylo_periods
100 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
102 let pId = phyloLvl ^. phylo_levelPeriod
105 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
106 groups ++ [ (((pId,lvl),length groups)
107 , f obj pId lvl (length groups)
108 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
115 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
116 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
117 (fis ^. phyloClique_support)
118 (fis ^. phyloClique_nodes)
119 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
120 (1,[0]) -- | branchid (lvl,[path in the branching tree])
121 (fromList [("breaks",[0]),("seaLevels",[0])])
125 toPhylo1 :: [Document] -> Phylo -> Phylo
126 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
127 Constante start gap -> constanteTemporalMatching start gap
128 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
129 Adaptative steps -> adaptativeTemporalMatching steps
131 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
133 --------------------------------------
134 phyloClique :: Map (Date,Date) [PhyloClique]
135 phyloClique = toPhyloClique phyloBase docs'
136 --------------------------------------
137 docs' :: Map (Date,Date) [Document]
138 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
139 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
140 --------------------------------------
143 ---------------------------
144 -- | Frequent Item Set | --
145 ---------------------------
148 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
149 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
150 filterClique keep thr f m = case keep of
151 False -> map (\l -> f thr l) m
152 True -> map (\l -> keepFilled (f) thr l) m
155 -- | To filter Fis with small Support
156 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
157 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
160 -- | To filter Fis with small Clique size
161 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
162 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
165 -- | To filter nested Fis
166 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
167 filterCliqueByNested m =
169 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
172 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
173 in fMax ++ [f] ) [] l)
175 clq' = clq `using` parList rdeepseq
176 in fromList $ zip (keys m) clq'
179 -- | To transform a time map of docs into a time map of Fis with some filters
180 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
181 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
182 Fis s s' -> -- traceFis "Filtered Fis"
184 -- $ traceFis "Filtered by clique size"
185 $ filterClique True s' (filterCliqueBySize)
186 -- $ traceFis "Filtered by support"
187 $ filterClique True s (filterCliqueBySupport)
188 -- $ traceFis "Unfiltered Fis"
190 MaxClique s -> filterClique True s (filterCliqueBySize)
193 --------------------------------------
194 phyloClique :: Map (Date,Date) [PhyloClique]
195 phyloClique = case (clique $ getConfig phylo) of
197 let fis = map (\(prd,docs) ->
198 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
199 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
201 fis' = fis `using` parList rdeepseq
204 let mcl = map (\(prd,docs) ->
206 $ foldl sumCooc empty
208 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
209 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques 0.001 cooc))
211 mcl' = mcl `using` parList rdeepseq
213 --------------------------------------
215 -- dev viz graph maxClique getMaxClique
223 -- | To transform the docs into a time map of coocurency matrix
224 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
225 docsToTimeScaleCooc docs fdt =
226 let mCooc = fromListWith sumCooc
227 $ map (\(_d,l) -> (_d, listToMatrix l))
228 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
230 $ map (\t -> (t,empty))
231 $ toTimeScale (map date docs) 1
232 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
233 $ unionWith sumCooc mCooc mCooc'
236 -----------------------
237 -- | to Phylo Base | --
238 -----------------------
240 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
241 groupDocsByPeriodRec f prds docs acc =
242 if ((null prds) || (null docs))
245 let prd = head' "groupBy" prds
246 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
247 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
250 -- | To group a list of Documents by fixed periods
251 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
252 groupDocsByPeriod' f pds docs =
253 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
254 periods = map (inPeriode f docs') pds
255 periods' = periods `using` parList rdeepseq
256 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
257 $ fromList $ zip pds periods'
259 --------------------------------------
260 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
261 inPeriode f' h (start,end) =
262 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
266 -- | To group a list of Documents by fixed periods
267 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
268 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
269 groupDocsByPeriod f pds es =
270 let periods = map (inPeriode f es) pds
271 periods' = periods `using` parList rdeepseq
273 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
274 $ fromList $ zip pds periods'
276 --------------------------------------
277 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
278 inPeriode f' h (start,end) =
279 fst $ partition (\d -> f' d >= start && f' d <= end) h
280 --------------------------------------
283 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
284 docsToTermFreq docs fdt =
285 let nbDocs = fromIntegral $ length docs
286 freqs = map (/(nbDocs))
288 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
289 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
290 sumFreqs = sum $ elems freqs
291 in map (/sumFreqs) freqs
294 -- | To count the number of docs by unit of time
295 docsToTimeScaleNb :: [Document] -> Map Date Double
296 docsToTimeScaleNb docs =
297 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
298 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
299 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
300 $ unionWith (+) time docs'
303 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
304 initPhyloLevels lvlMax pId =
305 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
308 -- | To init the basic elements of a Phylo
309 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
310 toPhyloBase docs lst conf =
311 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
312 params = defaultPhyloParam { _phyloParam_config = conf }
313 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
314 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
316 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
317 (docsToTimeScaleNb docs)
318 (docsToTermFreq docs (foundations ^. foundations_roots))
322 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)