]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
Merge remote-tracking branch 'origin/flexible-job-queue' into dev
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 module Gargantext.Core.Viz.Phylo.PhyloMaker where
12
13 import Control.DeepSeq (NFData)
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
17 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
18 import Data.Set (Set)
19 import Data.Text (Text)
20 import Data.Vector (Vector)
21 import Debug.Trace (trace)
22 import Prelude (floor)
23
24 import Gargantext.Core.Methods.Distances (Distance(Conditional))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Text.Context (TermList)
27 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
28 import Gargantext.Core.Viz.Phylo
29 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
30 import Gargantext.Core.Viz.Phylo.PhyloTools
31 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
32 import Gargantext.Core.Viz.Phylo.TemporalMatching (temporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
33 import Gargantext.Prelude
34
35 import qualified Data.Set as Set
36 import qualified Data.Vector as Vector
37
38 ------------------
39 -- | To Phylo | --
40 ------------------
41
42 {-
43 -- TODO AD
44 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
45 | PhyloN { _phylo'_flatPhylo :: Phylo}
46
47
48 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
49 toPhylo' (PhyloN phylo) = toPhylo'
50 toPhylo' (PhyloBase phylo) = toPhylo
51 -}
52
53 -- TODO an adaptative synchronic clustering with a slider
54
55 toPhylo :: Phylo -> Phylo
56 toPhylo phylowithoutLink = trace ("# flatPhylo groups " <> show(length $ getGroupsFromScale 1 flatPhylo))
57 $ traceToPhylo (phyloScale $ getConfig phylowithoutLink) $
58 if (phyloScale $ getConfig phylowithoutLink) > 1
59 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloScale $ getConfig phylowithoutLink)]
60 else phyloAncestors
61 where
62 --------------------------------------
63 phyloAncestors :: Phylo
64 phyloAncestors =
65 if (findAncestors $ getConfig phylowithoutLink)
66 then toHorizon flatPhylo
67 else flatPhylo
68 --------------------------------------
69 flatPhylo :: Phylo
70 flatPhylo = addTemporalLinksToPhylo phylowithoutLink
71 --------------------------------------
72
73
74 -----------------------------
75 -- | Create a flat Phylo | --
76 -----------------------------
77
78 {-
79 -- create an adaptative diachronic 'sea elevation' ladder
80 -}
81 adaptDiachronicLadder :: Double -> Set Double -> Set Double -> [Double]
82 adaptDiachronicLadder curr similarities ladder =
83 if curr <= 0 || Set.null similarities
84 then Set.toList ladder
85 else
86 let idx = ((Set.size similarities) `div` (floor curr)) - 1
87 thr = Set.elemAt idx similarities
88 -- we use a sliding methods 1/10, then 1/9, then ... 1/2
89 in adaptDiachronicLadder (curr -1) (Set.filter (> thr) similarities) (Set.insert thr ladder)
90
91
92 {-
93 -- create a constante diachronic 'sea elevation' ladder
94 -}
95 constDiachronicLadder :: Double -> Double -> Set Double -> [Double]
96 constDiachronicLadder curr step ladder =
97 if curr > 1
98 then Set.toList ladder
99 else constDiachronicLadder (curr + step) step (Set.insert curr ladder)
100
101
102 {-
103 -- process an initial scanning of the kinship links
104 -}
105 scanSimilarity :: Scale -> Phylo -> Phylo
106 scanSimilarity lvl phylo =
107 let proximity = phyloProximity $ getConfig phylo
108 scanning = foldlWithKey (\acc pId pds ->
109 -- 1) process period by period
110 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
111 $ elems
112 $ view ( phylo_periodScales
113 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
114 . phylo_scaleGroups ) pds
115 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
116 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromScalePeriods lvl next phylo
117 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
118 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
119 -- 2) compute the pairs in parallel
120 pairs = map (\(id,ngrams) ->
121 map (\(id',ngrams') ->
122 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
123 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
124 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
125 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
126 ) egos
127 pairs' = pairs `using` parList rdeepseq
128 in acc ++ (concat pairs')
129 ) [] $ phylo ^. phylo_periods
130 in phylo & phylo_diaSimScan .~ Set.fromList (traceGroupsProxi $ map snd scanning)
131
132
133
134 appendGroups :: (a -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup) -> Scale -> Map (Date,Date) [a] -> Phylo -> Phylo
135 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
136 $ over ( phylo_periods
137 . traverse
138 . phylo_periodScales
139 . traverse)
140 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_scaleScale)
141 then
142 let pId = phyloLvl ^. phylo_scalePeriod
143 pId' = phyloLvl ^. phylo_scalePeriodStr
144 phyloCUnit = m ! pId
145 in phyloLvl
146 & phylo_scaleGroups .~ (fromList $ foldl (\groups obj ->
147 groups ++ [ (((pId,lvl),length groups)
148 , f obj pId pId' lvl (length groups)
149 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
150 ] ) [] phyloCUnit)
151 else
152 phyloLvl )
153 phylo
154
155
156 clusterToGroup :: Clustering -> Period -> (Text,Text) -> Scale -> Int -> [Cooc] -> PhyloGroup
157 clusterToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
158 (fis ^. clustering_support )
159 (fis ^. clustering_visWeighting)
160 (fis ^. clustering_visFiltering)
161 (fis ^. clustering_roots)
162 (ngramsToCooc (fis ^. clustering_roots) coocs)
163 (1,[0]) -- branchid (lvl,[path in the branching tree])
164 (fromList [("breaks",[0]),("seaLevels",[0])])
165 [] [] [] [] [] [] []
166
167 {-
168 -- enhance the phylo with temporal links
169 -}
170 addTemporalLinksToPhylo :: Phylo -> Phylo
171 addTemporalLinksToPhylo phylowithoutLink = case strategy of
172 Constante start gap -> temporalMatching (constDiachronicLadder start gap Set.empty) phylowithoutLink
173 Adaptative steps -> temporalMatching (adaptDiachronicLadder steps (phylowithoutLink ^. phylo_diaSimScan) Set.empty) phylowithoutLink
174 where
175 strategy :: SeaElevation
176 strategy = getSeaElevation phylowithoutLink
177
178 -----------------------
179 -- | To Phylo Step | --
180 -----------------------
181
182
183 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
184 indexDates' m = map (\docs ->
185 let ds = map (\d -> date' d) docs
186 f = if (null ds)
187 then ""
188 else toFstDate ds
189 l = if (null ds)
190 then ""
191 else toLstDate ds
192 in (f,l)) m
193
194
195 -- To build the first phylo step from docs and terms
196 -- QL: backend entre phyloBase et Clustering
197 -- tophylowithoutLink
198 toPhyloWithoutLink :: [Document] -> TermList -> PhyloConfig -> Phylo
199 toPhyloWithoutLink docs lst conf = case (getSeaElevation phyloBase) of
200 Constante _ _ -> appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
201 Adaptative _ -> scanSimilarity 1
202 $ appendGroups clusterToGroup 1 seriesOfClustering (updatePeriods (indexDates' docs') phyloBase)
203 where
204 --------------------------------------
205 seriesOfClustering :: Map (Date,Date) [Clustering]
206 seriesOfClustering = toSeriesOfClustering phyloBase docs'
207 --------------------------------------
208 docs' :: Map (Date,Date) [Document]
209 -- QL: Time Consuming here
210 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
211 --------------------------------------
212 phyloBase :: Phylo
213 phyloBase = initPhylo docs lst conf
214 --------------------------------------
215
216 ---------------------------
217 -- | Frequent Item Set | --
218 ---------------------------
219
220
221 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
222 filterClique :: Bool -> Int -> (Int -> [Clustering] -> [Clustering]) -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
223 filterClique keep thr f m = case keep of
224 False -> map (\l -> f thr l) m
225 True -> map (\l -> keepFilled (f) thr l) m
226
227
228 -- To filter Fis with small Support
229 filterCliqueBySupport :: Int -> [Clustering] -> [Clustering]
230 filterCliqueBySupport thr l = filter (\clq -> (clq ^. clustering_support ) >= thr) l
231
232
233 -- To filter Fis with small Clique size
234 filterCliqueBySize :: Int -> [Clustering] -> [Clustering]
235 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. clustering_roots) >= thr) l
236
237
238 -- To filter nested Fis
239 filterCliqueByNested :: Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
240 filterCliqueByNested m =
241 let clq = map (\l ->
242 foldl (\mem f -> if (any (\f' -> isNested (f' ^. clustering_roots) (f ^. clustering_roots)) mem)
243 then mem
244 else
245 let fMax = filter (\f' -> not $ isNested (f ^. clustering_roots) (f' ^. clustering_roots)) mem
246 in fMax ++ [f] ) [] l)
247 $ elems m
248 clq' = clq `using` parList rdeepseq
249 in fromList $ zip (keys m) clq'
250
251
252 -- | To transform a time map of docs into a time map of Fis with some filters
253 toSeriesOfClustering :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [Clustering]
254 toSeriesOfClustering phylo phyloDocs = case (clique $ getConfig phylo) of
255 Fis s s' -> -- traceFis "Filtered Fis"
256 filterCliqueByNested
257 {- \$ traceFis "Filtered by clique size" -}
258 $ filterClique True s' (filterCliqueBySize)
259 {- \$ traceFis "Filtered by support" -}
260 $ filterClique True s (filterCliqueBySupport)
261 {- \$ traceFis "Unfiltered Fis" -}
262 seriesOfClustering
263 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
264 seriesOfClustering
265 where
266 --------------------------------------
267 seriesOfClustering :: Map (Date,Date) [Clustering]
268 seriesOfClustering = case (clique $ getConfig phylo) of
269 Fis _ _ ->
270 let fis = map (\(prd,docs) ->
271 case (corpusParser $ getConfig phylo) of
272 Csv' _ -> let lst = toList
273 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
274 in (prd, map (\f -> Clustering (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
275 _ -> let lst = toList
276 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
277 in (prd, map (\f -> Clustering (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
278 )
279 $ toList phyloDocs
280 fis' = fis `using` parList rdeepseq
281 in fromList fis'
282 MaxClique _ thr filterType ->
283 let mcl = map (\(prd,docs) ->
284 let cooc = map round
285 $ foldl sumCooc empty
286 $ map listToMatrix
287 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
288 in (prd, map (\cl -> Clustering cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
289 $ toList phyloDocs
290 mcl' = mcl `using` parList rdeepseq
291 in fromList mcl'
292 --------------------------------------
293
294 -- dev viz graph maxClique getMaxClique
295
296
297 --------------------
298 -- | Coocurency | --
299 --------------------
300
301
302 -- To transform the docs into a time map of coocurency matrix
303 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
304 docsToTimeScaleCooc docs fdt =
305 let mCooc = fromListWith sumCooc
306 $ map (\(_d,l) -> (_d, listToMatrix l))
307 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
308 mCooc' = fromList
309 $ map (\t -> (t,empty))
310 $ toTimeScale (map date docs) 1
311 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
312 $ unionWith sumCooc mCooc mCooc'
313
314
315 -----------------------
316 -- | to Phylo Base | --
317 -----------------------
318 -- TODO anoe
319 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
320 groupDocsByPeriodRec f prds docs acc =
321 if ((null prds) || (null docs))
322 then acc
323 else
324 let prd = head' "groupBy" prds
325 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
326 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
327
328
329 -- To group a list of Documents by fixed periods
330 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
331 groupDocsByPeriod' f pds docs =
332 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
333 periods = map (inPeriode f docs') pds
334 periods' = periods `using` parList rdeepseq
335 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
336 $ fromList $ zip pds periods'
337 where
338 --------------------------------------
339 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
340 inPeriode f' h (start,end) =
341 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
342
343
344
345 -- To group a list of Documents by fixed periods
346 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
347 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
348 groupDocsByPeriod f pds es =
349 let periods = map (inPeriode f es) pds
350 periods' = periods `using` parList rdeepseq
351
352 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
353 $ fromList $ zip pds periods'
354 where
355 --------------------------------------
356 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
357 inPeriode f' h (start,end) =
358 fst $ partition (\d -> f' d >= start && f' d <= end) h
359 --------------------------------------
360
361
362 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
363 docsToTermFreq docs fdt =
364 let nbDocs = fromIntegral $ length docs
365 freqs = map (/(nbDocs))
366 $ fromList
367 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
368 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
369 sumFreqs = sum $ elems freqs
370 in map (/sumFreqs) freqs
371
372 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
373 docsToLastTermFreq n docs fdt =
374 let last = take n $ reverse $ sort $ map date docs
375 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
376 freqs = map (/(nbDocs))
377 $ fromList
378 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
379 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
380 sumFreqs = sum $ elems freqs
381 in map (/sumFreqs) freqs
382
383
384 -- To count the number of docs by unit of time
385 docsToTimeScaleNb :: [Document] -> Map Date Double
386 docsToTimeScaleNb docs =
387 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
388 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
389 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
390 $ unionWith (+) time docs'
391
392
393 initPhyloScales :: Int -> Period -> Map PhyloScaleId PhyloScale
394 initPhyloScales lvlMax pId =
395 fromList $ map (\lvl -> ((pId,lvl),PhyloScale pId ("","") lvl empty)) [1..lvlMax]
396
397
398
399 -- Init the basic elements of a Phylo
400 --
401 initPhylo :: [Document] -> TermList -> PhyloConfig -> Phylo
402 initPhylo docs lst conf =
403 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
404 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
405 params = defaultPhyloParam { _phyloParam_config = conf }
406 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
407 in trace ("\n" <> "-- | Init a phylo out of " <> show(length docs) <> " docs \n")
408 $ Phylo foundations
409 docsSources
410 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
411 (docsToTimeScaleNb docs)
412 (docsToTermFreq docs (foundations ^. foundations_roots))
413 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
414 Set.empty
415 params
416 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloScales 1 prd))) periods)
417 0