2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
12 module Gargantext.Viz.Phylo.PhyloMaker where
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
18 import Gargantext.Prelude
19 import Gargantext.Viz.AdaptativePhylo
20 import Gargantext.Viz.Phylo.PhyloTools
21 import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Text.Context (TermList)
24 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Viz.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Viz.Graph.Distances (Distance(Conditional))
28 import Control.DeepSeq (NFData)
29 import Control.Parallel.Strategies (parList, rdeepseq, using)
30 import Debug.Trace (trace)
31 import Control.Lens hiding (Level)
33 import qualified Data.Vector as Vector
34 import qualified Data.Set as Set
42 toPhylo :: [Document] -> TermList -> Config -> Phylo
43 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
44 $ traceToPhylo (phyloLevel conf) $
45 if (phyloLevel conf) > 1
46 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
49 --------------------------------------
51 phylo1 = toPhylo1 docs phyloBase
52 --------------------------------------
54 phyloBase = toPhyloBase docs lst conf
55 --------------------------------------
63 toGroupsProxi :: Level -> Phylo -> Phylo
64 toGroupsProxi lvl phylo =
65 let proximity = phyloProximity $ getConfig phylo
66 groupsProxi = foldlWithKey (\acc pId pds ->
67 -- 1) process period by period
68 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
70 $ view ( phylo_periodLevels
71 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
72 . phylo_levelGroups ) pds
73 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
74 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
75 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
76 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
77 -- 2) compute the pairs in parallel
78 pairs = map (\(id,ngrams) ->
79 map (\(id',ngrams') ->
80 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
81 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
82 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
83 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
85 pairs' = pairs `using` parList rdeepseq
86 in acc ++ (concat pairs')
87 ) [] $ phylo ^. phylo_periods
88 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
91 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
92 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
93 $ over ( phylo_periods
97 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
99 let pId = phyloLvl ^. phylo_levelPeriod
102 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
103 groups ++ [ (((pId,lvl),length groups)
104 , f obj pId lvl (length groups)
105 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
112 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
113 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
114 (fis ^. phyloClique_support)
115 (fis ^. phyloClique_nodes)
116 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
117 (1,[0]) -- branchid (lvl,[path in the branching tree])
118 (fromList [("breaks",[0]),("seaLevels",[0])])
122 toPhylo1 :: [Document] -> Phylo -> Phylo
123 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
124 Constante start gap -> constanteTemporalMatching start gap
125 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
126 Adaptative steps -> adaptativeTemporalMatching steps
128 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
130 --------------------------------------
131 phyloClique :: Map (Date,Date) [PhyloClique]
132 phyloClique = toPhyloClique phyloBase docs'
133 --------------------------------------
134 docs' :: Map (Date,Date) [Document]
135 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
136 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
137 --------------------------------------
140 ---------------------------
141 -- | Frequent Item Set | --
142 ---------------------------
145 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
146 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
147 filterClique keep thr f m = case keep of
148 False -> map (\l -> f thr l) m
149 True -> map (\l -> keepFilled (f) thr l) m
152 -- To filter Fis with small Support
153 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
154 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
157 -- To filter Fis with small Clique size
158 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
159 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
162 -- To filter nested Fis
163 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
164 filterCliqueByNested m =
166 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
169 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
170 in fMax ++ [f] ) [] l)
172 clq' = clq `using` parList rdeepseq
173 in fromList $ zip (keys m) clq'
176 -- | To transform a time map of docs into a time map of Fis with some filters
177 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
178 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
179 Fis s s' -> -- traceFis "Filtered Fis"
181 {- \$ traceFis "Filtered by clique size" -}
182 $ filterClique True s' (filterCliqueBySize)
183 {- \$ traceFis "Filtered by support" -}
184 $ filterClique True s (filterCliqueBySupport)
185 {- \$ traceFis "Unfiltered Fis" -}
187 MaxClique s -> filterClique True s (filterCliqueBySize)
190 --------------------------------------
191 phyloClique :: Map (Date,Date) [PhyloClique]
192 phyloClique = case (clique $ getConfig phylo) of
194 let fis = map (\(prd,docs) ->
195 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
196 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
198 fis' = fis `using` parList rdeepseq
201 let mcl = map (\(prd,docs) ->
203 $ foldl sumCooc empty
205 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
206 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.001 cooc))
208 mcl' = mcl `using` parList rdeepseq
210 --------------------------------------
212 -- dev viz graph maxClique getMaxClique
220 -- To transform the docs into a time map of coocurency matrix
221 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
222 docsToTimeScaleCooc docs fdt =
223 let mCooc = fromListWith sumCooc
224 $ map (\(_d,l) -> (_d, listToMatrix l))
225 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
227 $ map (\t -> (t,empty))
228 $ toTimeScale (map date docs) 1
229 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
230 $ unionWith sumCooc mCooc mCooc'
233 -----------------------
234 -- | to Phylo Base | --
235 -----------------------
237 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
238 groupDocsByPeriodRec f prds docs acc =
239 if ((null prds) || (null docs))
242 let prd = head' "groupBy" prds
243 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
244 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
247 -- To group a list of Documents by fixed periods
248 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
249 groupDocsByPeriod' f pds docs =
250 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
251 periods = map (inPeriode f docs') pds
252 periods' = periods `using` parList rdeepseq
253 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
254 $ fromList $ zip pds periods'
256 --------------------------------------
257 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
258 inPeriode f' h (start,end) =
259 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
263 -- To group a list of Documents by fixed periods
264 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
265 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
266 groupDocsByPeriod f pds es =
267 let periods = map (inPeriode f es) pds
268 periods' = periods `using` parList rdeepseq
270 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
271 $ fromList $ zip pds periods'
273 --------------------------------------
274 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
275 inPeriode f' h (start,end) =
276 fst $ partition (\d -> f' d >= start && f' d <= end) h
277 --------------------------------------
280 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
281 docsToTermFreq docs fdt =
282 let nbDocs = fromIntegral $ length docs
283 freqs = map (/(nbDocs))
285 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
286 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
287 sumFreqs = sum $ elems freqs
288 in map (/sumFreqs) freqs
291 -- To count the number of docs by unit of time
292 docsToTimeScaleNb :: [Document] -> Map Date Double
293 docsToTimeScaleNb docs =
294 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
295 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
296 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
297 $ unionWith (+) time docs'
300 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
301 initPhyloLevels lvlMax pId =
302 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
305 -- To init the basic elements of a Phylo
306 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
307 toPhyloBase docs lst conf =
308 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
309 params = defaultPhyloParam { _phyloParam_config = conf }
310 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
311 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
313 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
314 (docsToTimeScaleNb docs)
315 (docsToTermFreq docs (foundations ^. foundations_roots))
319 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)