]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
[zip] implement zip for parsers
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13
14
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
20 import Data.Text (Text)
21 import Data.Vector (Vector)
22 import Debug.Trace (trace)
23
24 import Gargantext.Core.Methods.Distances (Distance(Conditional))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Text.Context (TermList)
27 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
28 import Gargantext.Core.Viz.Phylo
29 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
30 import Gargantext.Core.Viz.Phylo.PhyloTools
31 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
32 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
33 import Gargantext.Prelude
34
35 import qualified Data.Set as Set
36 import qualified Data.Vector as Vector
37
38 ------------------
39 -- | To Phylo | --
40 ------------------
41
42 {-
43 -- TODO AD
44 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
45 | PhyloN { _phylo'_phylo1 :: Phylo}
46
47
48 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
49 toPhylo' (PhyloN phylo) = toPhylo'
50 toPhylo' (PhyloBase phylo) = toPhylo
51 -}
52
53
54 toPhylo :: Phylo -> Phylo
55 toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
56 $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
57 if (phyloLevel $ getConfig phyloStep) > 1
58 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
59 else phylo1
60 where
61 --------------------------------------
62 phyloAncestors :: Phylo
63 phyloAncestors =
64 if (findAncestors $ getConfig phyloStep)
65 then toHorizon phylo1
66 else phylo1
67 --------------------------------------
68 phylo1 :: Phylo
69 phylo1 = toPhylo1 phyloStep
70 --------------------------------------
71
72
73 --------------------
74 -- | To Phylo 1 | --
75 --------------------
76
77 toGroupsProxi :: Level -> Phylo -> Phylo
78 toGroupsProxi lvl phylo =
79 let proximity = phyloProximity $ getConfig phylo
80 groupsProxi = foldlWithKey (\acc pId pds ->
81 -- 1) process period by period
82 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
83 $ elems
84 $ view ( phylo_periodLevels
85 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
86 . phylo_levelGroups ) pds
87 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
88 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
89 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
90 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
91 -- 2) compute the pairs in parallel
92 pairs = map (\(id,ngrams) ->
93 map (\(id',ngrams') ->
94 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
95 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
96 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
97 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
98 ) egos
99 pairs' = pairs `using` parList rdeepseq
100 in acc ++ (concat pairs')
101 ) [] $ phylo ^. phylo_periods
102 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
103
104
105 appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
106 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
107 $ over ( phylo_periods
108 . traverse
109 . phylo_periodLevels
110 . traverse)
111 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
112 then
113 let pId = phyloLvl ^. phylo_levelPeriod
114 pId' = phyloLvl ^. phylo_levelPeriod'
115 phyloCUnit = m ! pId
116 in phyloLvl
117 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
118 groups ++ [ (((pId,lvl),length groups)
119 , f obj pId pId' lvl (length groups)
120 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
121 ] ) [] phyloCUnit)
122 else
123 phyloLvl )
124 phylo
125
126
127 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup
128 cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
129 (fis ^. phyloClique_support)
130 (fis ^. phyloClique_weight)
131 (fis ^. phyloClique_sources)
132 (fis ^. phyloClique_nodes)
133 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
134 (1,[0]) -- branchid (lvl,[path in the branching tree])
135 (fromList [("breaks",[0]),("seaLevels",[0])])
136 [] [] [] [] [] [] []
137
138
139 toPhylo1 :: Phylo -> Phylo
140 toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
141 Constante start gap -> constanteTemporalMatching start gap phyloStep
142 Adaptative steps -> adaptativeTemporalMatching steps phyloStep
143
144 -----------------------
145 -- | To Phylo Step | --
146 -----------------------
147
148
149 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
150 indexDates' m = map (\docs ->
151 let ds = map (\d -> date' d) docs
152 f = if (null ds)
153 then ""
154 else toFstDate ds
155 l = if (null ds)
156 then ""
157 else toLstDate ds
158 in (f,l)) m
159
160
161 -- To build the first phylo step from docs and terms
162 -- QL: backend entre phyloBase et phyloClique
163 toPhyloStep :: [Document] -> TermList -> PhyloConfig -> Phylo
164 toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
165 Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
166 Adaptative _ -> toGroupsProxi 1
167 $ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
168 where
169 --------------------------------------
170 phyloClique :: Map (Date,Date) [PhyloClique]
171 phyloClique = toPhyloClique phyloBase docs'
172 --------------------------------------
173 docs' :: Map (Date,Date) [Document]
174 -- QL: Time Consuming here
175 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
176 --------------------------------------
177 phyloBase :: Phylo
178 phyloBase = toPhyloBase docs lst conf
179 --------------------------------------
180
181 ---------------------------
182 -- | Frequent Item Set | --
183 ---------------------------
184
185
186 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
187 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
188 filterClique keep thr f m = case keep of
189 False -> map (\l -> f thr l) m
190 True -> map (\l -> keepFilled (f) thr l) m
191
192
193 -- To filter Fis with small Support
194 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
195 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
196
197
198 -- To filter Fis with small Clique size
199 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
200 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
201
202
203 -- To filter nested Fis
204 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
205 filterCliqueByNested m =
206 let clq = map (\l ->
207 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
208 then mem
209 else
210 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
211 in fMax ++ [f] ) [] l)
212 $ elems m
213 clq' = clq `using` parList rdeepseq
214 in fromList $ zip (keys m) clq'
215
216
217 -- | To transform a time map of docs into a time map of Fis with some filters
218 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
219 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
220 Fis s s' -> -- traceFis "Filtered Fis"
221 filterCliqueByNested
222 {- \$ traceFis "Filtered by clique size" -}
223 $ filterClique True s' (filterCliqueBySize)
224 {- \$ traceFis "Filtered by support" -}
225 $ filterClique True s (filterCliqueBySupport)
226 {- \$ traceFis "Unfiltered Fis" -}
227 phyloClique
228 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
229 phyloClique
230 where
231 --------------------------------------
232 phyloClique :: Map (Date,Date) [PhyloClique]
233 phyloClique = case (clique $ getConfig phylo) of
234 Fis _ _ ->
235 let fis = map (\(prd,docs) ->
236 case (corpusParser $ getConfig phylo) of
237 Csv' _ -> let lst = toList
238 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
239 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
240 _ -> let lst = toList
241 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
242 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
243 )
244 $ toList phyloDocs
245 fis' = fis `using` parList rdeepseq
246 in fromList fis'
247 MaxClique _ thr filterType ->
248 let mcl = map (\(prd,docs) ->
249 let cooc = map round
250 $ foldl sumCooc empty
251 $ map listToMatrix
252 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
253 in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
254 $ toList phyloDocs
255 mcl' = mcl `using` parList rdeepseq
256 in fromList mcl'
257 --------------------------------------
258
259 -- dev viz graph maxClique getMaxClique
260
261
262 --------------------
263 -- | Coocurency | --
264 --------------------
265
266
267 -- To transform the docs into a time map of coocurency matrix
268 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
269 docsToTimeScaleCooc docs fdt =
270 let mCooc = fromListWith sumCooc
271 $ map (\(_d,l) -> (_d, listToMatrix l))
272 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
273 mCooc' = fromList
274 $ map (\t -> (t,empty))
275 $ toTimeScale (map date docs) 1
276 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
277 $ unionWith sumCooc mCooc mCooc'
278
279
280 -----------------------
281 -- | to Phylo Base | --
282 -----------------------
283 -- TODO anoe
284 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
285 groupDocsByPeriodRec f prds docs acc =
286 if ((null prds) || (null docs))
287 then acc
288 else
289 let prd = head' "groupBy" prds
290 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
291 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
292
293
294 -- To group a list of Documents by fixed periods
295 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
296 groupDocsByPeriod' f pds docs =
297 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
298 periods = map (inPeriode f docs') pds
299 periods' = periods `using` parList rdeepseq
300 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
301 $ fromList $ zip pds periods'
302 where
303 --------------------------------------
304 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
305 inPeriode f' h (start,end) =
306 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
307
308
309
310 -- To group a list of Documents by fixed periods
311 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
312 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
313 groupDocsByPeriod f pds es =
314 let periods = map (inPeriode f es) pds
315 periods' = periods `using` parList rdeepseq
316
317 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
318 $ fromList $ zip pds periods'
319 where
320 --------------------------------------
321 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
322 inPeriode f' h (start,end) =
323 fst $ partition (\d -> f' d >= start && f' d <= end) h
324 --------------------------------------
325
326
327 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
328 docsToTermFreq docs fdt =
329 let nbDocs = fromIntegral $ length docs
330 freqs = map (/(nbDocs))
331 $ fromList
332 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
333 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
334 sumFreqs = sum $ elems freqs
335 in map (/sumFreqs) freqs
336
337 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
338 docsToLastTermFreq n docs fdt =
339 let last = take n $ reverse $ sort $ map date docs
340 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
341 freqs = map (/(nbDocs))
342 $ fromList
343 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
344 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
345 sumFreqs = sum $ elems freqs
346 in map (/sumFreqs) freqs
347
348
349 -- To count the number of docs by unit of time
350 docsToTimeScaleNb :: [Document] -> Map Date Double
351 docsToTimeScaleNb docs =
352 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
353 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
354 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
355 $ unionWith (+) time docs'
356
357
358 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
359 initPhyloLevels lvlMax pId =
360 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]
361
362
363
364 -- To init the basic elements of a Phylo
365 toPhyloBase :: [Document] -> TermList -> PhyloConfig -> Phylo
366 toPhyloBase docs lst conf =
367 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
368 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
369 params = defaultPhyloParam { _phyloParam_config = conf }
370 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
371 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
372 $ Phylo foundations
373 docsSources
374 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
375 (docsToTimeScaleNb docs)
376 (docsToTermFreq docs (foundations ^. foundations_roots))
377 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
378 empty
379 empty
380 params
381 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)