2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
18 import Gargantext.Prelude
19 import Gargantext.Core.Viz.AdaptativePhylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Core.Text.Context (TermList)
24 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Core.Viz.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Viz.Graph.Distances (Distance(Conditional))
28 import Control.DeepSeq (NFData)
29 import Control.Parallel.Strategies (parList, rdeepseq, using)
30 import Debug.Trace (trace)
31 import Control.Lens hiding (Level)
33 import qualified Data.Vector as Vector
34 import qualified Data.Set as Set
41 toPhylo :: [Document] -> TermList -> Config -> Phylo
42 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
43 $ traceToPhylo (phyloLevel conf) $
44 if (phyloLevel conf) > 1
45 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
48 --------------------------------------
50 phylo1 = toPhylo1 docs phyloBase
51 --------------------------------------
53 phyloBase = toPhyloBase docs lst conf
54 --------------------------------------
62 toGroupsProxi :: Level -> Phylo -> Phylo
63 toGroupsProxi lvl phylo =
64 let proximity = phyloProximity $ getConfig phylo
65 groupsProxi = foldlWithKey (\acc pId pds ->
66 -- 1) process period by period
67 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
69 $ view ( phylo_periodLevels
70 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
71 . phylo_levelGroups ) pds
72 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
73 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
74 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
75 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
76 -- 2) compute the pairs in parallel
77 pairs = map (\(id,ngrams) ->
78 map (\(id',ngrams') ->
79 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
80 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
81 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
82 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
84 pairs' = pairs `using` parList rdeepseq
85 in acc ++ (concat pairs')
86 ) [] $ phylo ^. phylo_periods
87 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
90 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
91 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
92 $ over ( phylo_periods
96 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
98 let pId = phyloLvl ^. phylo_levelPeriod
101 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
102 groups ++ [ (((pId,lvl),length groups)
103 , f obj pId lvl (length groups)
104 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
111 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
112 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
113 (fis ^. phyloClique_support)
114 (fis ^. phyloClique_nodes)
115 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
116 (1,[0]) -- branchid (lvl,[path in the branching tree])
117 (fromList [("breaks",[0]),("seaLevels",[0])])
121 toPhylo1 :: [Document] -> Phylo -> Phylo
122 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
123 Constante start gap -> constanteTemporalMatching start gap
124 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
125 Adaptative steps -> adaptativeTemporalMatching steps
127 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
129 --------------------------------------
130 phyloClique :: Map (Date,Date) [PhyloClique]
131 phyloClique = toPhyloClique phyloBase docs'
132 --------------------------------------
133 docs' :: Map (Date,Date) [Document]
134 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
135 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
136 --------------------------------------
139 ---------------------------
140 -- | Frequent Item Set | --
141 ---------------------------
144 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
145 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
146 filterClique keep thr f m = case keep of
147 False -> map (\l -> f thr l) m
148 True -> map (\l -> keepFilled (f) thr l) m
151 -- To filter Fis with small Support
152 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
153 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
156 -- To filter Fis with small Clique size
157 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
158 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
161 -- To filter nested Fis
162 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
163 filterCliqueByNested m =
165 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
168 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
169 in fMax ++ [f] ) [] l)
171 clq' = clq `using` parList rdeepseq
172 in fromList $ zip (keys m) clq'
175 -- | To transform a time map of docs into a time map of Fis with some filters
176 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
177 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
178 Fis s s' -> -- traceFis "Filtered Fis"
180 {- \$ traceFis "Filtered by clique size" -}
181 $ filterClique True s' (filterCliqueBySize)
182 {- \$ traceFis "Filtered by support" -}
183 $ filterClique True s (filterCliqueBySupport)
184 {- \$ traceFis "Unfiltered Fis" -}
186 MaxClique s -> filterClique True s (filterCliqueBySize)
189 --------------------------------------
190 phyloClique :: Map (Date,Date) [PhyloClique]
191 phyloClique = case (clique $ getConfig phylo) of
193 let fis = map (\(prd,docs) ->
194 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
195 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
197 fis' = fis `using` parList rdeepseq
200 let mcl = map (\(prd,docs) ->
202 $ foldl sumCooc empty
204 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
205 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.01 cooc))
207 mcl' = mcl `using` parList rdeepseq
209 --------------------------------------
211 -- dev viz graph maxClique getMaxClique
219 -- To transform the docs into a time map of coocurency matrix
220 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
221 docsToTimeScaleCooc docs fdt =
222 let mCooc = fromListWith sumCooc
223 $ map (\(_d,l) -> (_d, listToMatrix l))
224 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
226 $ map (\t -> (t,empty))
227 $ toTimeScale (map date docs) 1
228 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
229 $ unionWith sumCooc mCooc mCooc'
232 -----------------------
233 -- | to Phylo Base | --
234 -----------------------
236 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
237 groupDocsByPeriodRec f prds docs acc =
238 if ((null prds) || (null docs))
241 let prd = head' "groupBy" prds
242 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
243 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
246 -- To group a list of Documents by fixed periods
247 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
248 groupDocsByPeriod' f pds docs =
249 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
250 periods = map (inPeriode f docs') pds
251 periods' = periods `using` parList rdeepseq
252 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
253 $ fromList $ zip pds periods'
255 --------------------------------------
256 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
257 inPeriode f' h (start,end) =
258 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
262 -- To group a list of Documents by fixed periods
263 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
264 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
265 groupDocsByPeriod f pds es =
266 let periods = map (inPeriode f es) pds
267 periods' = periods `using` parList rdeepseq
269 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
270 $ fromList $ zip pds periods'
272 --------------------------------------
273 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
274 inPeriode f' h (start,end) =
275 fst $ partition (\d -> f' d >= start && f' d <= end) h
276 --------------------------------------
279 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
280 docsToTermFreq docs fdt =
281 let nbDocs = fromIntegral $ length docs
282 freqs = map (/(nbDocs))
284 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
285 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
286 sumFreqs = sum $ elems freqs
287 in map (/sumFreqs) freqs
289 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
290 docsToLastTermFreq n docs fdt =
291 let last = take n $ reverse $ sort $ map date docs
292 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
293 freqs = map (/(nbDocs))
295 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
296 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
297 sumFreqs = sum $ elems freqs
298 in map (/sumFreqs) freqs
301 -- To count the number of docs by unit of time
302 docsToTimeScaleNb :: [Document] -> Map Date Double
303 docsToTimeScaleNb docs =
304 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
305 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
306 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
307 $ unionWith (+) time docs'
310 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
311 initPhyloLevels lvlMax pId =
312 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
315 -- To init the basic elements of a Phylo
316 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
317 toPhyloBase docs lst conf =
318 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
319 params = defaultPhyloParam { _phyloParam_config = conf }
320 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
321 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
323 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
324 (docsToTimeScaleNb docs)
325 (docsToTermFreq docs (foundations ^. foundations_roots))
326 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
330 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)