2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
16 module Gargantext.Viz.Phylo.PhyloMaker where
18 import Data.List (concat, nub, partition, sort, (++))
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, mapWithKey, toList, elems)
20 import Data.Set (size)
21 import Data.Vector (Vector)
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Text.Context (TermList)
27 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
29 import Control.DeepSeq (NFData)
30 import Control.Parallel.Strategies (parList, rdeepseq, using)
31 import Debug.Trace (trace)
34 import qualified Data.Vector as Vector
35 import qualified Data.Set as Set
43 toPhylo :: [Document] -> TermList -> Config -> Phylo
44 toPhylo docs lst conf = phyloBase
46 --------------------------------------
48 _phylo1 = toPhylo1 docs phyloBase
49 --------------------------------------
51 phyloBase = toPhyloBase docs lst conf
52 --------------------------------------
61 toPhylo1 :: [Document] -> Phylo -> Phylo
62 toPhylo1 docs phyloBase = undefined
64 --------------------------------------
65 _mFis :: Map (Date,Date) [PhyloFis]
66 _mFis = toPhyloFis _docs' (fisSupport $ getConfig phyloBase) (fisSize $ getConfig phyloBase)
67 --------------------------------------
68 _docs' :: Map (Date,Date) [Document]
69 _docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
70 --------------------------------------
73 ---------------------------
74 -- | Frequent Item Set | --
75 ---------------------------
78 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
79 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
80 filterFis keep thr f m = case keep of
81 False -> map (\l -> f thr l) m
82 True -> map (\l -> keepFilled (f) thr l) m
85 -- | To filter Fis with small Support
86 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
87 filterFisBySupport thr l = filter (\fis -> (fis ^. phyloFis_support) >= thr) l
90 -- | To filter Fis with small Clique size
91 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
92 filterFisByClique thr l = filter (\fis -> (size $ fis ^. phyloFis_clique) >= thr) l
95 -- | To filter nested Fis
96 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
99 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloFis_clique) (Set.toList $ f ^. phyloFis_clique)) mem)
102 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloFis_clique) (Set.toList $ f' ^. phyloFis_clique)) mem
103 in fMax ++ [f] ) [] l)
105 fis' = fis `using` parList rdeepseq
106 in fromList $ zip (keys m) fis'
109 -- | To transform a time map of docs innto a time map of Fis with some filters
110 toPhyloFis :: Map (Date, Date) [Document] -> Int -> Int -> Map (Date,Date) [PhyloFis]
111 toPhyloFis mDocs support clique = traceFis "Filtered Fis"
113 $ traceFis "Filtered by clique size"
114 $ filterFis True clique (filterFisByClique)
115 $ traceFis "Filtered by support"
116 $ filterFis True support (filterFisBySupport)
117 $ traceFis "Unfiltered Fis" mFis
119 --------------------------------------
120 -- | create the fis from the docs for each period
121 mFis :: Map (Date,Date) [PhyloFis]
122 mFis = mapWithKey (\prd docs -> let fis = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
123 in map (\f -> PhyloFis (fst f) (snd f) prd) fis ) mDocs
124 --------------------------------------
132 -- | To transform the docs into a time map of coocurency matrix
133 docsToCoocByYear :: [Document] -> Vector Ngrams -> Config -> Map Date Cooc
134 docsToCoocByYear docs fdt conf =
135 let mCooc = fromListWith sumCooc
136 $ map (\(_d,l) -> (_d, listToMatrix l))
137 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
139 $ map (\t -> (t,empty))
140 $ toTimeScale (map date docs) (timeUnit conf)
141 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
142 $ unionWith sumCooc mCooc mCooc'
145 -----------------------
146 -- | to Phylo Base | --
147 -----------------------
150 -- | To group a list of Documents by fixed periods
151 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
152 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
153 groupDocsByPeriod f pds es =
154 let periods = map (inPeriode f es) pds
155 periods' = periods `using` parList rdeepseq
157 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
158 $ fromList $ zip pds periods'
160 --------------------------------------
161 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
162 inPeriode f' h (start,end) =
163 fst $ partition (\d -> f' d >= start && f' d <= end) h
164 --------------------------------------
167 -- | To count the number of docs by unit of time (like a year)
168 nbDocsByTime :: [Document] -> Int -> Map Date Double
169 nbDocsByTime docs step =
170 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
171 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') step
172 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
173 $ unionWith (+) time docs'
176 -- | To init the basic elements of a Phylo
177 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
178 toPhyloBase docs lst conf =
179 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
180 params = defaultPhyloParam { _phyloParam_config = conf }
181 periods = toPeriods (sort $ nub $ map date docs) (timePeriod conf) (timeStep conf)
182 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
184 (docsToCoocByYear docs (foundations ^. foundations_roots) conf)
185 (nbDocsByTime docs $ timeUnit conf)
187 (map (\prd -> PhyloPeriod prd []) periods)