2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
18 import Gargantext.Prelude
19 import Gargantext.Core.Viz.AdaptativePhylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Core.Text.Context (TermList)
24 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Methods.Distances (Distance(Conditional))
28 import Control.DeepSeq (NFData)
29 import Control.Parallel.Strategies (parList, rdeepseq, using)
30 import Debug.Trace (trace)
31 import Control.Lens hiding (Level)
33 import qualified Data.Vector as Vector
34 import qualified Data.Set as Set
42 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
43 | PhyloN { _phylo'_phylo1 :: Phylo}
46 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
47 toPhylo' (PhyloN phylo) = toPhylo'
48 toPhylo' (PhyloBase phylo) = toPhylo
52 toPhylo :: [Document] -> TermList -> Config -> Phylo
53 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
54 $ traceToPhylo (phyloLevel conf) $
55 if (phyloLevel conf) > 1
56 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
59 --------------------------------------
61 phylo1 = toPhylo1 docs phyloBase
63 --------------------------------------
65 phyloBase = toPhyloBase docs lst conf
67 --------------------------------------
75 toGroupsProxi :: Level -> Phylo -> Phylo
76 toGroupsProxi lvl phylo =
77 let proximity = phyloProximity $ getConfig phylo
78 groupsProxi = foldlWithKey (\acc pId pds ->
79 -- 1) process period by period
80 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
82 $ view ( phylo_periodLevels
83 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
84 . phylo_levelGroups ) pds
85 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
86 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
87 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
88 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
89 -- 2) compute the pairs in parallel
90 pairs = map (\(id,ngrams) ->
91 map (\(id',ngrams') ->
92 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
93 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
94 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
95 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
97 pairs' = pairs `using` parList rdeepseq
98 in acc ++ (concat pairs')
99 ) [] $ phylo ^. phylo_periods
100 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
103 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
104 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
105 $ over ( phylo_periods
109 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
111 let pId = phyloLvl ^. phylo_levelPeriod
114 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
115 groups ++ [ (((pId,lvl),length groups)
116 , f obj pId lvl (length groups)
117 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
124 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
125 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
126 (fis ^. phyloClique_support)
127 (fis ^. phyloClique_nodes)
128 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
129 (1,[0]) -- branchid (lvl,[path in the branching tree])
130 (fromList [("breaks",[0]),("seaLevels",[0])])
134 toPhylo1 :: [Document] -> Phylo -> Phylo
135 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
136 Constante start gap -> constanteTemporalMatching start gap
137 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
138 Adaptative steps -> adaptativeTemporalMatching steps
140 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
142 --------------------------------------
143 phyloClique :: Map (Date,Date) [PhyloClique]
144 phyloClique = toPhyloClique phyloBase docs'
145 --------------------------------------
146 docs' :: Map (Date,Date) [Document]
147 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
148 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
149 --------------------------------------
152 ---------------------------
153 -- | Frequent Item Set | --
154 ---------------------------
157 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
158 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
159 filterClique keep thr f m = case keep of
160 False -> map (\l -> f thr l) m
161 True -> map (\l -> keepFilled (f) thr l) m
164 -- To filter Fis with small Support
165 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
166 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
169 -- To filter Fis with small Clique size
170 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
171 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
174 -- To filter nested Fis
175 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
176 filterCliqueByNested m =
178 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
181 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
182 in fMax ++ [f] ) [] l)
184 clq' = clq `using` parList rdeepseq
185 in fromList $ zip (keys m) clq'
188 -- | To transform a time map of docs into a time map of Fis with some filters
189 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
190 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
191 Fis s s' -> -- traceFis "Filtered Fis"
193 {- \$ traceFis "Filtered by clique size" -}
194 $ filterClique True s' (filterCliqueBySize)
195 {- \$ traceFis "Filtered by support" -}
196 $ filterClique True s (filterCliqueBySupport)
197 {- \$ traceFis "Unfiltered Fis" -}
199 MaxClique s -> filterClique True s (filterCliqueBySize)
202 --------------------------------------
203 phyloClique :: Map (Date,Date) [PhyloClique]
204 phyloClique = case (clique $ getConfig phylo) of
206 let fis = map (\(prd,docs) ->
207 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
208 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
210 fis' = fis `using` parList rdeepseq
213 let mcl = map (\(prd,docs) ->
215 $ foldl sumCooc empty
217 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
218 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.001 cooc))
220 mcl' = mcl `using` parList rdeepseq
222 --------------------------------------
224 -- dev viz graph maxClique getMaxClique
232 -- To transform the docs into a time map of coocurency matrix
233 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
234 docsToTimeScaleCooc docs fdt =
235 let mCooc = fromListWith sumCooc
236 $ map (\(_d,l) -> (_d, listToMatrix l))
237 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
239 $ map (\t -> (t,empty))
240 $ toTimeScale (map date docs) 1
241 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
242 $ unionWith sumCooc mCooc mCooc'
245 -----------------------
246 -- | to Phylo Base | --
247 -----------------------
249 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
250 groupDocsByPeriodRec f prds docs acc =
251 if ((null prds) || (null docs))
254 let prd = head' "groupBy" prds
255 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
256 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
259 -- To group a list of Documents by fixed periods
260 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
261 groupDocsByPeriod' f pds docs =
262 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
263 periods = map (inPeriode f docs') pds
264 periods' = periods `using` parList rdeepseq
265 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
266 $ fromList $ zip pds periods'
268 --------------------------------------
269 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
270 inPeriode f' h (start,end) =
271 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
275 -- To group a list of Documents by fixed periods
276 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
277 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
278 groupDocsByPeriod f pds es =
279 let periods = map (inPeriode f es) pds
280 periods' = periods `using` parList rdeepseq
282 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
283 $ fromList $ zip pds periods'
285 --------------------------------------
286 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
287 inPeriode f' h (start,end) =
288 fst $ partition (\d -> f' d >= start && f' d <= end) h
289 --------------------------------------
292 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
293 docsToTermFreq docs fdt =
294 let nbDocs = fromIntegral $ length docs
295 freqs = map (/(nbDocs))
297 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
298 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
299 sumFreqs = sum $ elems freqs
300 in map (/sumFreqs) freqs
302 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
303 docsToLastTermFreq n docs fdt =
304 let last = take n $ reverse $ sort $ map date docs
305 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
306 freqs = map (/(nbDocs))
308 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
309 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
310 sumFreqs = sum $ elems freqs
311 in map (/sumFreqs) freqs
314 -- To count the number of docs by unit of time
315 docsToTimeScaleNb :: [Document] -> Map Date Double
316 docsToTimeScaleNb docs =
317 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
318 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
319 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
320 $ unionWith (+) time docs'
323 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
324 initPhyloLevels lvlMax pId =
325 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
328 -- To init the basic elements of a Phylo
329 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
330 toPhyloBase docs lst conf =
331 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
332 params = defaultPhyloParam { _phyloParam_config = conf }
333 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
334 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
336 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
337 (docsToTimeScaleNb docs)
338 (docsToTermFreq docs (foundations ^. foundations_roots))
339 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
343 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)