2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Viz.Phylo.PhyloMaker where
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey)
16 import Data.Set (size)
17 import Data.Vector (Vector)
19 import Gargantext.Prelude
20 import Gargantext.Viz.AdaptativePhylo
21 import Gargantext.Viz.Phylo.PhyloTools
22 import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
23 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
24 import Gargantext.Text.Context (TermList)
25 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
27 import Control.DeepSeq (NFData)
28 import Control.Parallel.Strategies (parList, rdeepseq, using)
29 import Debug.Trace (trace)
30 import Control.Lens hiding (Level)
32 import qualified Data.Vector as Vector
33 import qualified Data.Set as Set
41 toPhylo :: [Document] -> TermList -> Config -> Phylo
42 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
43 $ traceToPhylo (phyloLevel conf) $
44 if (phyloLevel conf) > 1
45 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
48 --------------------------------------
50 phylo1 = toPhylo1 docs phyloBase
51 --------------------------------------
53 phyloBase = toPhyloBase docs lst conf
54 --------------------------------------
62 toGroupsProxi :: Level -> Phylo -> Phylo
63 toGroupsProxi lvl phylo =
64 let proximity = phyloProximity $ getConfig phylo
65 groupsProxi = foldlWithKey (\acc pId pds ->
66 -- 1) process period by period
67 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
69 $ view ( phylo_periodLevels
70 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
71 . phylo_levelGroups ) pds
72 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
73 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
74 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
75 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
76 -- 2) compute the pairs in parallel
77 pairs = map (\(id,ngrams) ->
78 map (\(id',ngrams') ->
79 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
80 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
81 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
82 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
84 pairs' = pairs `using` parList rdeepseq
85 in acc ++ (concat pairs')
86 ) [] $ phylo ^. phylo_periods
87 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
90 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
91 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
92 $ over ( phylo_periods
96 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
98 let pId = phyloLvl ^. phylo_levelPeriod
101 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
102 groups ++ [ (((pId,lvl),length groups)
103 , f obj pId lvl (length groups) (getRoots phylo)
104 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
111 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
112 cliqueToGroup fis pId lvl idx fdt coocs =
113 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloClique_nodes) fdt
114 in PhyloGroup pId lvl idx ""
115 (fis ^. phyloClique_support)
117 (ngramsToCooc ngrams coocs)
118 (1,[0]) -- branchid (lvl,[path in the branching tree])
119 (fromList [("breaks",[0]),("seaLevels",[0])])
123 toPhylo1 :: [Document] -> Phylo -> Phylo
124 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
125 Constante start gap -> constanteTemporalMatching start gap
126 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
127 Adaptative steps -> adaptativeTemporalMatching steps
129 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
131 --------------------------------------
132 phyloClique :: Map (Date,Date) [PhyloClique]
133 phyloClique = toPhyloClique phyloBase docs'
134 --------------------------------------
135 docs' :: Map (Date,Date) [Document]
136 docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
137 --------------------------------------
140 ---------------------------
141 -- | Frequent Item Set | --
142 ---------------------------
145 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
146 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
147 filterClique keep thr f m = case keep of
148 False -> map (\l -> f thr l) m
149 True -> map (\l -> keepFilled (f) thr l) m
152 -- To filter Fis with small Support
153 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
154 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
157 -- To filter Fis with small Clique size
158 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
159 filterCliqueBySize thr l = filter (\clq -> (size $ clq ^. phyloClique_nodes) >= thr) l
162 -- To filter nested Fis
163 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
164 filterCliqueByNested m =
166 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloClique_nodes) (Set.toList $ f ^. phyloClique_nodes)) mem)
169 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloClique_nodes) (Set.toList $ f' ^. phyloClique_nodes)) mem
170 in fMax ++ [f] ) [] l)
172 clq' = clq `using` parList rdeepseq
173 in fromList $ zip (keys m) clq'
176 -- To transform a time map of docs innto a time map of Fis with some filters
177 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
178 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
179 Fis s s' -> -- traceFis "Filtered Fis"
181 {- \$ traceFis "Filtered by clique size" -}
182 $ filterClique True s' (filterCliqueBySize)
183 {- \$ traceFis "Filtered by support" -}
184 $ filterClique True s (filterCliqueBySupport)
185 {- \$ traceFis "Unfiltered Fis" -}
187 MaxClique _ -> undefined
189 --------------------------------------
190 phyloClique :: Map (Date,Date) [PhyloClique]
191 phyloClique = case (clique $ getConfig phylo) of
192 Fis _ _ -> let fis = map (\(prd,docs) ->
193 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
194 in (prd, map (\f -> PhyloClique (fst f) (snd f) prd) lst))
196 fis' = fis `using` parList rdeepseq
198 MaxClique _ -> undefined
199 --------------------------------------
207 -- To transform the docs into a time map of coocurency matrix
208 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
209 docsToTimeScaleCooc docs fdt =
210 let mCooc = fromListWith sumCooc
211 $ map (\(_d,l) -> (_d, listToMatrix l))
212 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
214 $ map (\t -> (t,empty))
215 $ toTimeScale (map date docs) 1
216 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
217 $ unionWith sumCooc mCooc mCooc'
220 -----------------------
221 -- | to Phylo Base | --
222 -----------------------
224 -- To group a list of Documents by fixed periods
225 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
226 groupDocsByPeriod' f pds docs =
227 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
228 periods = map (inPeriode f docs') pds
229 periods' = periods `using` parList rdeepseq
230 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
231 $ fromList $ zip pds periods'
233 --------------------------------------
234 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
235 inPeriode f' h (start,end) =
236 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
240 -- To group a list of Documents by fixed periods
241 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
242 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
243 groupDocsByPeriod f pds es =
244 let periods = map (inPeriode f es) pds
245 periods' = periods `using` parList rdeepseq
247 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
248 $ fromList $ zip pds periods'
250 --------------------------------------
251 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
252 inPeriode f' h (start,end) =
253 fst $ partition (\d -> f' d >= start && f' d <= end) h
254 --------------------------------------
257 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
258 docsToTermFreq docs fdt =
259 let nbDocs = fromIntegral $ length docs
260 freqs = map (/(nbDocs))
262 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
263 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
264 sumFreqs = sum $ elems freqs
265 in map (/sumFreqs) freqs
268 -- To count the number of docs by unit of time
269 docsToTimeScaleNb :: [Document] -> Map Date Double
270 docsToTimeScaleNb docs =
271 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
272 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
273 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
274 $ unionWith (+) time docs'
277 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
278 initPhyloLevels lvlMax pId =
279 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
282 -- To init the basic elements of a Phylo
283 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
284 toPhyloBase docs lst conf =
285 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
286 params = defaultPhyloParam { _phyloParam_config = conf }
287 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
288 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
290 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
291 (docsToTimeScaleNb docs)
292 (docsToTermFreq docs (foundations ^. foundations_roots))
295 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)