2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.PhyloMaker where
18 import Data.List (concat, nub, partition, sort, (++))
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), filterWithKey, restrictKeys)
20 import Data.Set (size)
21 import Data.Vector (Vector)
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Text.Context (TermList)
27 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
29 import Control.DeepSeq (NFData)
30 import Control.Parallel.Strategies (parList, rdeepseq, using)
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.Set as Set
43 toPhylo :: [Document] -> TermList -> Config -> Phylo
44 toPhylo docs lst conf = phylo1
46 --------------------------------------
48 phylo1 = toPhylo1 docs phyloBase
49 --------------------------------------
51 phyloBase = toPhyloBase docs lst conf
52 --------------------------------------
61 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
62 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
63 $ over ( phylo_periods
67 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
69 let pId = phyloLvl ^. phylo_levelPeriod
72 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
73 groups ++ [ (((pId,lvl),length groups)
74 , f obj pId lvl (length groups) (getRoots phylo)
75 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
82 fisToGroup :: PhyloFis -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
83 fisToGroup fis pId lvl idx fdt coocs =
84 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloFis_clique) fdt
85 in PhyloGroup pId lvl idx
86 (fis ^. phyloFis_support)
88 (ngramsToCooc ngrams coocs)
94 toPhylo1 :: [Document] -> Phylo -> Phylo
95 toPhylo1 docs phyloBase = appendGroups fisToGroup 1 phyloFis phyloBase
97 --------------------------------------
98 phyloFis :: Map (Date,Date) [PhyloFis]
99 phyloFis = toPhyloFis docs' (fisSupport $ getConfig phyloBase) (fisSize $ getConfig phyloBase)
100 --------------------------------------
101 docs' :: Map (Date,Date) [Document]
102 docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
103 --------------------------------------
106 ---------------------------
107 -- | Frequent Item Set | --
108 ---------------------------
111 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
112 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
113 filterFis keep thr f m = case keep of
114 False -> map (\l -> f thr l) m
115 True -> map (\l -> keepFilled (f) thr l) m
118 -- | To filter Fis with small Support
119 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
120 filterFisBySupport thr l = filter (\fis -> (fis ^. phyloFis_support) >= thr) l
123 -- | To filter Fis with small Clique size
124 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
125 filterFisByClique thr l = filter (\fis -> (size $ fis ^. phyloFis_clique) >= thr) l
128 -- | To filter nested Fis
129 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
130 filterFisByNested m =
132 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloFis_clique) (Set.toList $ f ^. phyloFis_clique)) mem)
135 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloFis_clique) (Set.toList $ f' ^. phyloFis_clique)) mem
136 in fMax ++ [f] ) [] l)
138 fis' = fis `using` parList rdeepseq
139 in fromList $ zip (keys m) fis'
142 -- | To transform a time map of docs innto a time map of Fis with some filters
143 toPhyloFis :: Map (Date, Date) [Document] -> Int -> Int -> Map (Date,Date) [PhyloFis]
144 toPhyloFis phyloDocs support clique = traceFis "Filtered Fis"
146 $ traceFis "Filtered by clique size"
147 $ filterFis True clique (filterFisByClique)
148 $ traceFis "Filtered by support"
149 $ filterFis True support (filterFisBySupport)
150 $ traceFis "Unfiltered Fis" phyloFis
152 --------------------------------------
153 phyloFis :: Map (Date,Date) [PhyloFis]
155 let fis = map (\(prd,docs) -> let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
156 in (prd, map (\f -> PhyloFis (fst f) (snd f) prd) lst))
158 fis' = fis `using` parList rdeepseq
160 --------------------------------------
168 -- | To build the local cooc matrix of each phylogroup
169 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
170 ngramsToCooc ngrams coocs =
171 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
172 pairs = listToKeys ngrams
173 in filterWithKey (\k _ -> elem k pairs) cooc
176 -- | To transform the docs into a time map of coocurency matrix
177 docsToCoocByYear :: [Document] -> Vector Ngrams -> Config -> Map Date Cooc
178 docsToCoocByYear docs fdt conf =
179 let mCooc = fromListWith sumCooc
180 $ map (\(_d,l) -> (_d, listToMatrix l))
181 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
183 $ map (\t -> (t,empty))
184 $ toTimeScale (map date docs) (timeUnit conf)
185 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
186 $ unionWith sumCooc mCooc mCooc'
189 -----------------------
190 -- | to Phylo Base | --
191 -----------------------
194 -- | To group a list of Documents by fixed periods
195 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
196 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
197 groupDocsByPeriod f pds es =
198 let periods = map (inPeriode f es) pds
199 periods' = periods `using` parList rdeepseq
201 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
202 $ fromList $ zip pds periods'
204 --------------------------------------
205 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
206 inPeriode f' h (start,end) =
207 fst $ partition (\d -> f' d >= start && f' d <= end) h
208 --------------------------------------
211 -- | To count the number of docs by unit of time (like a year)
212 nbDocsByTime :: [Document] -> Int -> Map Date Double
213 nbDocsByTime docs step =
214 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
215 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') step
216 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
217 $ unionWith (+) time docs'
220 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
221 initPhyloLevels lvlMax pId =
222 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
225 -- | To init the basic elements of a Phylo
226 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
227 toPhyloBase docs lst conf =
228 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
229 params = defaultPhyloParam { _phyloParam_config = conf }
230 periods = toPeriods (sort $ nub $ map date docs) (timePeriod conf) (timeStep conf)
231 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
233 (docsToCoocByYear docs (foundations ^. foundations_roots) conf)
234 (nbDocsByTime docs $ timeUnit conf)
236 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels (phyloLevel conf) prd))) periods)