2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.PhyloMaker where
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey)
20 import Data.Set (size)
21 import Data.Vector (Vector)
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
27 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
28 import Gargantext.Text.Context (TermList)
29 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
45 toPhylo :: [Document] -> TermList -> Config -> Phylo
46 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
47 $ traceToPhylo (phyloLevel conf) $
48 if (phyloLevel conf) > 1
49 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
52 --------------------------------------
54 phylo1 = toPhylo1 docs phyloBase
55 --------------------------------------
57 phyloBase = toPhyloBase docs lst conf
58 --------------------------------------
66 toGroupsProxi :: Level -> Phylo -> Phylo
67 toGroupsProxi lvl phylo =
68 let proximity = phyloProximity $ getConfig phylo
69 groupsProxi = foldlWithKey (\acc pId pds ->
70 -- 1) process period by period
71 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
73 $ view ( phylo_periodLevels
74 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
75 . phylo_levelGroups ) pds
76 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
77 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
78 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
79 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
80 -- 2) compute the pairs in parallel
81 pairs = map (\(id,ngrams) ->
82 map (\(id',ngrams') ->
83 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
84 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
85 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
86 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
88 pairs' = pairs `using` parList rdeepseq
89 in acc ++ (concat pairs')
90 ) [] $ phylo ^. phylo_periods
91 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
94 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
95 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
96 $ over ( phylo_periods
100 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
102 let pId = phyloLvl ^. phylo_levelPeriod
105 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
106 groups ++ [ (((pId,lvl),length groups)
107 , f obj pId lvl (length groups) (getRoots phylo)
108 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
115 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
116 cliqueToGroup fis pId lvl idx fdt coocs =
117 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloClique_nodes) fdt
118 in PhyloGroup pId lvl idx ""
119 (fis ^. phyloClique_support)
121 (ngramsToCooc ngrams coocs)
122 (1,[0]) -- | branchid (lvl,[path in the branching tree])
123 (fromList [("breaks",[0]),("seaLevels",[0])])
127 toPhylo1 :: [Document] -> Phylo -> Phylo
128 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
129 Constante start gap -> constanteTemporalMatching start gap
130 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
131 Adaptative steps -> adaptativeTemporalMatching steps
133 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
135 --------------------------------------
136 phyloClique :: Map (Date,Date) [PhyloClique]
137 phyloClique = toPhyloClique phyloBase docs'
138 --------------------------------------
139 docs' :: Map (Date,Date) [Document]
140 docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
141 --------------------------------------
144 ---------------------------
145 -- | Frequent Item Set | --
146 ---------------------------
149 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
150 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
151 filterClique keep thr f m = case keep of
152 False -> map (\l -> f thr l) m
153 True -> map (\l -> keepFilled (f) thr l) m
156 -- | To filter Fis with small Support
157 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
158 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
161 -- | To filter Fis with small Clique size
162 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
163 filterCliqueBySize thr l = filter (\clq -> (size $ clq ^. phyloClique_nodes) >= thr) l
166 -- | To filter nested Fis
167 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
168 filterCliqueByNested m =
170 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloClique_nodes) (Set.toList $ f ^. phyloClique_nodes)) mem)
173 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloClique_nodes) (Set.toList $ f' ^. phyloClique_nodes)) mem
174 in fMax ++ [f] ) [] l)
176 clq' = clq `using` parList rdeepseq
177 in fromList $ zip (keys m) clq'
180 -- | To transform a time map of docs innto a time map of Fis with some filters
181 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
182 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
183 Fis s s' -> -- traceFis "Filtered Fis"
185 -- $ traceFis "Filtered by clique size"
186 $ filterClique True s' (filterCliqueBySize)
187 -- $ traceFis "Filtered by support"
188 $ filterClique True s (filterCliqueBySupport)
189 -- $ traceFis "Unfiltered Fis"
191 MaxClique _ -> undefined
193 --------------------------------------
194 phyloClique :: Map (Date,Date) [PhyloClique]
195 phyloClique = case (clique $ getConfig phylo) of
196 Fis _ _ -> let fis = map (\(prd,docs) ->
197 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
198 in (prd, map (\f -> PhyloClique (fst f) (snd f) prd) lst))
200 fis' = fis `using` parList rdeepseq
202 MaxClique _ -> undefined
203 --------------------------------------
211 -- | To transform the docs into a time map of coocurency matrix
212 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
213 docsToTimeScaleCooc docs fdt =
214 let mCooc = fromListWith sumCooc
215 $ map (\(_d,l) -> (_d, listToMatrix l))
216 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
218 $ map (\t -> (t,empty))
219 $ toTimeScale (map date docs) 1
220 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
221 $ unionWith sumCooc mCooc mCooc'
224 -----------------------
225 -- | to Phylo Base | --
226 -----------------------
228 -- | To group a list of Documents by fixed periods
229 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
230 groupDocsByPeriod' f pds docs =
231 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
232 periods = map (inPeriode f docs') pds
233 periods' = periods `using` parList rdeepseq
234 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
235 $ fromList $ zip pds periods'
237 --------------------------------------
238 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
239 inPeriode f' h (start,end) =
240 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
244 -- | To group a list of Documents by fixed periods
245 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
246 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
247 groupDocsByPeriod f pds es =
248 let periods = map (inPeriode f es) pds
249 periods' = periods `using` parList rdeepseq
251 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
252 $ fromList $ zip pds periods'
254 --------------------------------------
255 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
256 inPeriode f' h (start,end) =
257 fst $ partition (\d -> f' d >= start && f' d <= end) h
258 --------------------------------------
261 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
262 docsToTermFreq docs fdt =
263 let nbDocs = fromIntegral $ length docs
264 freqs = map (/(nbDocs))
266 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
267 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
268 sumFreqs = sum $ elems freqs
269 in map (/sumFreqs) freqs
272 -- | To count the number of docs by unit of time
273 docsToTimeScaleNb :: [Document] -> Map Date Double
274 docsToTimeScaleNb docs =
275 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
276 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
277 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
278 $ unionWith (+) time docs'
281 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
282 initPhyloLevels lvlMax pId =
283 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
286 -- | To init the basic elements of a Phylo
287 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
288 toPhyloBase docs lst conf =
289 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
290 params = defaultPhyloParam { _phyloParam_config = conf }
291 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
292 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
294 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
295 (docsToTimeScaleNb docs)
296 (docsToTermFreq docs (foundations ^. foundations_roots))
299 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)