2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.PhyloMaker where
18 import Data.List (concat, nub, partition, sort, (++))
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), filterWithKey, restrictKeys)
20 import Data.Set (size)
21 import Data.Vector (Vector)
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Text.Context (TermList)
27 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
29 import Control.DeepSeq (NFData)
30 import Control.Parallel.Strategies (parList, rdeepseq, using)
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.Set as Set
43 toPhylo :: [Document] -> TermList -> Config -> Phylo
44 toPhylo docs lst conf = phylo1
46 --------------------------------------
48 phylo1 = toPhylo1 docs phyloBase
49 --------------------------------------
51 phyloBase = toPhyloBase docs lst conf
52 --------------------------------------
61 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
62 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
63 $ over ( phylo_periods
67 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
69 let pId = phyloLvl ^. phylo_levelPeriod
72 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
73 groups ++ [ (((pId,lvl),length groups)
74 , f obj pId lvl (length groups) (getRoots phylo)
75 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
82 fisToGroup :: PhyloFis -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
83 fisToGroup fis pId lvl idx fdt coocs =
84 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloFis_clique) fdt
85 in PhyloGroup pId lvl idx
86 (fis ^. phyloFis_support)
88 (ngramsToCooc ngrams coocs)
93 toPhylo1 :: [Document] -> Phylo -> Phylo
94 toPhylo1 docs phyloBase = appendGroups fisToGroup 1 phyloFis phyloBase
96 --------------------------------------
97 phyloFis :: Map (Date,Date) [PhyloFis]
98 phyloFis = toPhyloFis docs' (getFisSupport $ contextualUnit $ getConfig phyloBase) (getFisSize $ contextualUnit $ getConfig phyloBase)
99 --------------------------------------
100 docs' :: Map (Date,Date) [Document]
101 docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
102 --------------------------------------
105 ---------------------------
106 -- | Frequent Item Set | --
107 ---------------------------
110 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
111 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
112 filterFis keep thr f m = case keep of
113 False -> map (\l -> f thr l) m
114 True -> map (\l -> keepFilled (f) thr l) m
117 -- | To filter Fis with small Support
118 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
119 filterFisBySupport thr l = filter (\fis -> (fis ^. phyloFis_support) >= thr) l
122 -- | To filter Fis with small Clique size
123 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
124 filterFisByClique thr l = filter (\fis -> (size $ fis ^. phyloFis_clique) >= thr) l
127 -- | To filter nested Fis
128 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
129 filterFisByNested m =
131 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloFis_clique) (Set.toList $ f ^. phyloFis_clique)) mem)
134 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloFis_clique) (Set.toList $ f' ^. phyloFis_clique)) mem
135 in fMax ++ [f] ) [] l)
137 fis' = fis `using` parList rdeepseq
138 in fromList $ zip (keys m) fis'
141 -- | To transform a time map of docs innto a time map of Fis with some filters
142 toPhyloFis :: Map (Date, Date) [Document] -> Int -> Int -> Map (Date,Date) [PhyloFis]
143 toPhyloFis phyloDocs support clique = traceFis "Filtered Fis"
145 $ traceFis "Filtered by clique size"
146 $ filterFis True clique (filterFisByClique)
147 $ traceFis "Filtered by support"
148 $ filterFis True support (filterFisBySupport)
149 $ traceFis "Unfiltered Fis" phyloFis
151 --------------------------------------
152 phyloFis :: Map (Date,Date) [PhyloFis]
154 let fis = map (\(prd,docs) -> let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
155 in (prd, map (\f -> PhyloFis (fst f) (snd f) prd) lst))
157 fis' = fis `using` parList rdeepseq
159 --------------------------------------
167 -- | To build the local cooc matrix of each phylogroup
168 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
169 ngramsToCooc ngrams coocs =
170 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
171 pairs = listToKeys ngrams
172 in filterWithKey (\k _ -> elem k pairs) cooc
175 -- | To transform the docs into a time map of coocurency matrix
176 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
177 docsToTimeScaleCooc docs fdt =
178 let mCooc = fromListWith sumCooc
179 $ map (\(_d,l) -> (_d, listToMatrix l))
180 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
182 $ map (\t -> (t,empty))
183 $ toTimeScale (map date docs) 1
184 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
185 $ unionWith sumCooc mCooc mCooc'
188 -----------------------
189 -- | to Phylo Base | --
190 -----------------------
193 -- | To group a list of Documents by fixed periods
194 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
195 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
196 groupDocsByPeriod f pds es =
197 let periods = map (inPeriode f es) pds
198 periods' = periods `using` parList rdeepseq
200 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
201 $ fromList $ zip pds periods'
203 --------------------------------------
204 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
205 inPeriode f' h (start,end) =
206 fst $ partition (\d -> f' d >= start && f' d <= end) h
207 --------------------------------------
210 -- | To count the number of docs by unit of time
211 docsToTimeScaleNb :: [Document] -> Map Date Double
212 docsToTimeScaleNb docs =
213 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
214 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
215 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
216 $ unionWith (+) time docs'
219 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
220 initPhyloLevels lvlMax pId =
221 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
224 -- | To init the basic elements of a Phylo
225 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
226 toPhyloBase docs lst conf =
227 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
228 params = defaultPhyloParam { _phyloParam_config = conf }
229 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
230 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
232 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
233 (docsToTimeScaleNb docs)
235 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels (phyloLevel conf) prd))) periods)