]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
add a new group by docs
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.PhyloMaker where
17
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
20 import Data.Set (size)
21 import Data.Vector (Vector)
22
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
27 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
28 import Gargantext.Text.Context (TermList)
29 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
30
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
35
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
38
39
40 ------------------
41 -- | To Phylo | --
42 ------------------
43
44
45 toPhylo :: [Document] -> TermList -> Config -> Phylo
46 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
47 $ traceToPhylo (phyloLevel conf) $
48 if (phyloLevel conf) > 1
49 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
50 else phylo1
51 where
52 --------------------------------------
53 phylo1 :: Phylo
54 phylo1 = toPhylo1 docs phyloBase
55 --------------------------------------
56 phyloBase :: Phylo
57 phyloBase = toPhyloBase docs lst conf
58 --------------------------------------
59
60
61
62 --------------------
63 -- | To Phylo 1 | --
64 --------------------
65
66 toGroupsProxi :: Level -> Phylo -> Phylo
67 toGroupsProxi lvl phylo =
68 let proximity = phyloProximity $ getConfig phylo
69 groupsProxi = foldlWithKey (\acc pId pds ->
70 -- 1) process period by period
71 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
72 $ elems
73 $ view ( phylo_periodLevels
74 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
75 . phylo_levelGroups ) pds
76 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
77 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
78 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
79 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
80 -- 2) compute the pairs in parallel
81 pairs = map (\(id,ngrams) ->
82 map (\(id',ngrams') ->
83 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
84 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
85 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
86 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
87 ) egos
88 pairs' = pairs `using` parList rdeepseq
89 in acc ++ (concat pairs')
90 ) [] $ phylo ^. phylo_periods
91 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
92
93
94 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
95 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
96 $ over ( phylo_periods
97 . traverse
98 . phylo_periodLevels
99 . traverse)
100 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
101 then
102 let pId = phyloLvl ^. phylo_levelPeriod
103 phyloCUnit = m ! pId
104 in phyloLvl
105 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
106 groups ++ [ (((pId,lvl),length groups)
107 , f obj pId lvl (length groups) (getRoots phylo)
108 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
109 ] ) [] phyloCUnit)
110 else
111 phyloLvl )
112 phylo
113
114
115 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
116 cliqueToGroup fis pId lvl idx fdt coocs =
117 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloClique_nodes) fdt
118 in PhyloGroup pId lvl idx ""
119 (fis ^. phyloClique_support)
120 ngrams
121 (ngramsToCooc ngrams coocs)
122 (1,[0]) -- | branchid (lvl,[path in the branching tree])
123 (fromList [("breaks",[0]),("seaLevels",[0])])
124 [] [] [] []
125
126
127 toPhylo1 :: [Document] -> Phylo -> Phylo
128 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
129 Constante start gap -> constanteTemporalMatching start gap
130 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
131 Adaptative steps -> adaptativeTemporalMatching steps
132 $ toGroupsProxi 1
133 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
134 where
135 --------------------------------------
136 phyloClique :: Map (Date,Date) [PhyloClique]
137 phyloClique = toPhyloClique phyloBase docs'
138 --------------------------------------
139 docs' :: Map (Date,Date) [Document]
140 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
141 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
142 --------------------------------------
143
144
145 ---------------------------
146 -- | Frequent Item Set | --
147 ---------------------------
148
149
150 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
151 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
152 filterClique keep thr f m = case keep of
153 False -> map (\l -> f thr l) m
154 True -> map (\l -> keepFilled (f) thr l) m
155
156
157 -- | To filter Fis with small Support
158 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
159 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
160
161
162 -- | To filter Fis with small Clique size
163 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
164 filterCliqueBySize thr l = filter (\clq -> (size $ clq ^. phyloClique_nodes) >= thr) l
165
166
167 -- | To filter nested Fis
168 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
169 filterCliqueByNested m =
170 let clq = map (\l ->
171 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloClique_nodes) (Set.toList $ f ^. phyloClique_nodes)) mem)
172 then mem
173 else
174 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloClique_nodes) (Set.toList $ f' ^. phyloClique_nodes)) mem
175 in fMax ++ [f] ) [] l)
176 $ elems m
177 clq' = clq `using` parList rdeepseq
178 in fromList $ zip (keys m) clq'
179
180
181 -- | To transform a time map of docs innto a time map of Fis with some filters
182 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
183 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
184 Fis s s' -> -- traceFis "Filtered Fis"
185 filterCliqueByNested
186 -- $ traceFis "Filtered by clique size"
187 $ filterClique True s' (filterCliqueBySize)
188 -- $ traceFis "Filtered by support"
189 $ filterClique True s (filterCliqueBySupport)
190 -- $ traceFis "Unfiltered Fis"
191 phyloClique
192 MaxClique _ -> undefined
193 where
194 --------------------------------------
195 phyloClique :: Map (Date,Date) [PhyloClique]
196 phyloClique = case (clique $ getConfig phylo) of
197 Fis _ _ -> let fis = map (\(prd,docs) ->
198 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
199 in (prd, map (\f -> PhyloClique (fst f) (snd f) prd) lst))
200 $ toList phyloDocs
201 fis' = fis `using` parList rdeepseq
202 in fromList fis'
203 MaxClique _ -> undefined
204 --------------------------------------
205
206 -- dev viz graph maxClique getMaxClique
207
208
209 --------------------
210 -- | Coocurency | --
211 --------------------
212
213
214 -- | To transform the docs into a time map of coocurency matrix
215 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
216 docsToTimeScaleCooc docs fdt =
217 let mCooc = fromListWith sumCooc
218 $ map (\(_d,l) -> (_d, listToMatrix l))
219 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
220 mCooc' = fromList
221 $ map (\t -> (t,empty))
222 $ toTimeScale (map date docs) 1
223 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
224 $ unionWith sumCooc mCooc mCooc'
225
226
227 -----------------------
228 -- | to Phylo Base | --
229 -----------------------
230
231 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
232 groupDocsByPeriodRec f prds docs acc =
233 if ((null prds) || (null docs))
234 then acc
235 else
236 let prd = head' "groupBy" prds
237 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
238 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
239
240
241 -- | To group a list of Documents by fixed periods
242 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
243 groupDocsByPeriod' f pds docs =
244 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
245 periods = map (inPeriode f docs') pds
246 periods' = periods `using` parList rdeepseq
247 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
248 $ fromList $ zip pds periods'
249 where
250 --------------------------------------
251 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
252 inPeriode f' h (start,end) =
253 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
254
255
256
257 -- | To group a list of Documents by fixed periods
258 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
259 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
260 groupDocsByPeriod f pds es =
261 let periods = map (inPeriode f es) pds
262 periods' = periods `using` parList rdeepseq
263
264 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
265 $ fromList $ zip pds periods'
266 where
267 --------------------------------------
268 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
269 inPeriode f' h (start,end) =
270 fst $ partition (\d -> f' d >= start && f' d <= end) h
271 --------------------------------------
272
273
274 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
275 docsToTermFreq docs fdt =
276 let nbDocs = fromIntegral $ length docs
277 freqs = map (/(nbDocs))
278 $ fromList
279 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
280 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
281 sumFreqs = sum $ elems freqs
282 in map (/sumFreqs) freqs
283
284
285 -- | To count the number of docs by unit of time
286 docsToTimeScaleNb :: [Document] -> Map Date Double
287 docsToTimeScaleNb docs =
288 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
289 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
290 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
291 $ unionWith (+) time docs'
292
293
294 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
295 initPhyloLevels lvlMax pId =
296 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
297
298
299 -- | To init the basic elements of a Phylo
300 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
301 toPhyloBase docs lst conf =
302 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
303 params = defaultPhyloParam { _phyloParam_config = conf }
304 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
305 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
306 $ Phylo foundations
307 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
308 (docsToTimeScaleNb docs)
309 (docsToTermFreq docs (foundations ^. foundations_roots))
310 empty
311 empty
312 params
313 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)