2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.PhyloMaker where
18 import Data.List (concat, nub, partition, sort, (++), group)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, singleton)
20 import Data.Set (size)
21 import Data.Vector (Vector)
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
27 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
28 import Gargantext.Text.Context (TermList)
29 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
45 toPhylo :: [Document] -> TermList -> Config -> Phylo
46 toPhylo docs lst conf = traceToPhylo (phyloLevel conf) $
47 if (phyloLevel conf) > 1
48 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
51 --------------------------------------
53 phylo1 = toPhylo1 docs phyloBase
54 --------------------------------------
56 phyloBase = toPhyloBase docs lst conf
57 --------------------------------------
66 appendGroups :: (a -> Double -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
67 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
68 $ over ( phylo_periods
72 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
74 let pId = phyloLvl ^. phylo_levelPeriod
77 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
78 groups ++ [ (((pId,lvl),length groups)
79 , f obj(getPhyloThresholdInit phylo) pId lvl (length groups) (getRoots phylo)
80 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
87 cliqueToGroup :: PhyloClique -> Double -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
88 cliqueToGroup fis thr pId lvl idx fdt coocs =
89 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloClique_nodes) fdt
90 in PhyloGroup pId lvl idx ""
91 (fis ^. phyloClique_support)
93 (ngramsToCooc ngrams coocs)
94 (1,[0]) -- | branchid (lvl,[path in the branching tree])
95 (singleton "thr" [thr])
99 toPhylo1 :: [Document] -> Phylo -> Phylo
100 toPhylo1 docs phyloBase = temporalMatching
101 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
103 --------------------------------------
104 phyloClique :: Map (Date,Date) [PhyloClique]
105 phyloClique = toPhyloClique phyloBase docs'
106 --------------------------------------
107 docs' :: Map (Date,Date) [Document]
108 docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
109 --------------------------------------
112 ---------------------------
113 -- | Frequent Item Set | --
114 ---------------------------
117 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
118 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
119 filterClique keep thr f m = case keep of
120 False -> map (\l -> f thr l) m
121 True -> map (\l -> keepFilled (f) thr l) m
124 -- | To filter Fis with small Support
125 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
126 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
129 -- | To filter Fis with small Clique size
130 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
131 filterCliqueBySize thr l = filter (\clq -> (size $ clq ^. phyloClique_nodes) >= thr) l
134 -- | To filter nested Fis
135 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
136 filterCliqueByNested m =
138 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloClique_nodes) (Set.toList $ f ^. phyloClique_nodes)) mem)
141 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloClique_nodes) (Set.toList $ f' ^. phyloClique_nodes)) mem
142 in fMax ++ [f] ) [] l)
144 clq' = clq `using` parList rdeepseq
145 in fromList $ zip (keys m) clq'
148 -- | To transform a time map of docs innto a time map of Fis with some filters
149 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
150 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
151 Fis s s' -> -- traceFis "Filtered Fis"
153 -- $ traceFis "Filtered by clique size"
154 $ filterClique True s' (filterCliqueBySize)
155 -- $ traceFis "Filtered by support"
156 $ filterClique True s (filterCliqueBySupport)
157 -- $ traceFis "Unfiltered Fis"
159 MaxClique _ -> undefined
161 --------------------------------------
162 phyloClique :: Map (Date,Date) [PhyloClique]
163 phyloClique = case (clique $ getConfig phylo) of
164 Fis _ _ -> let fis = map (\(prd,docs) ->
165 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
166 in (prd, map (\f -> PhyloClique (fst f) (snd f) prd) lst))
168 fis' = fis `using` parList rdeepseq
170 MaxClique _ -> undefined
171 --------------------------------------
179 -- | To transform the docs into a time map of coocurency matrix
180 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
181 docsToTimeScaleCooc docs fdt =
182 let mCooc = fromListWith sumCooc
183 $ map (\(_d,l) -> (_d, listToMatrix l))
184 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
186 $ map (\t -> (t,empty))
187 $ toTimeScale (map date docs) 1
188 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
189 $ unionWith sumCooc mCooc mCooc'
192 -----------------------
193 -- | to Phylo Base | --
194 -----------------------
197 -- | To group a list of Documents by fixed periods
198 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
199 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
200 groupDocsByPeriod f pds es =
201 let periods = map (inPeriode f es) pds
202 periods' = periods `using` parList rdeepseq
204 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
205 $ fromList $ zip pds periods'
207 --------------------------------------
208 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
209 inPeriode f' h (start,end) =
210 fst $ partition (\d -> f' d >= start && f' d <= end) h
211 --------------------------------------
214 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
215 docsToTermFreq docs fdt =
216 let nbDocs = fromIntegral $ length docs
217 freqs = map (/(nbDocs))
219 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
220 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
221 sumFreqs = sum $ elems freqs
222 in map (/sumFreqs) freqs
225 -- | To count the number of docs by unit of time
226 docsToTimeScaleNb :: [Document] -> Map Date Double
227 docsToTimeScaleNb docs =
228 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
229 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
230 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
231 $ unionWith (+) time docs'
234 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
235 initPhyloLevels lvlMax pId =
236 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
239 -- | To init the basic elements of a Phylo
240 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
241 toPhyloBase docs lst conf =
242 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
243 params = defaultPhyloParam { _phyloParam_config = conf }
244 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
245 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
247 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
248 (docsToTimeScaleNb docs)
249 (docsToTermFreq docs (foundations ^. foundations_roots))
251 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)