]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
process the ancestors before the synchronic clustering
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
17
18 import Gargantext.Prelude
19 import Gargantext.Core.Viz.AdaptativePhylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Core.Text.Context (TermList)
24 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Core.Viz.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Viz.Graph.Distances (Distance(Conditional))
27 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
28
29 import Control.DeepSeq (NFData)
30 import Control.Parallel.Strategies (parList, rdeepseq, using)
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.Set as Set
36
37 ------------------
38 -- | To Phylo | --
39 ------------------
40
41 {-
42 -- TODO AD
43 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
44 | PhyloN { _phylo'_phylo1 :: Phylo}
45
46
47 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
48 toPhylo' (PhyloN phylo) = toPhylo'
49 toPhylo' (PhyloBase phylo) = toPhylo
50 -}
51
52
53 toPhylo :: [Document] -> TermList -> Config -> Phylo
54 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
55 $ traceToPhylo (phyloLevel conf) $
56 if (phyloLevel conf) > 1
57 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel conf)]
58 else phylo1
59 where
60 --------------------------------------
61 phyloAncestors :: Phylo
62 phyloAncestors = toHorizon phylo1
63 --------------------------------------
64 phylo1 :: Phylo
65 phylo1 = toPhylo1 docs phyloBase
66 -- > AD to db here
67 --------------------------------------
68 phyloBase :: Phylo
69 phyloBase = toPhyloBase docs lst conf
70 -- > AD to db here
71 --------------------------------------
72
73
74
75 --------------------
76 -- | To Phylo 1 | --
77 --------------------
78
79 toGroupsProxi :: Level -> Phylo -> Phylo
80 toGroupsProxi lvl phylo =
81 let proximity = phyloProximity $ getConfig phylo
82 groupsProxi = foldlWithKey (\acc pId pds ->
83 -- 1) process period by period
84 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
85 $ elems
86 $ view ( phylo_periodLevels
87 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
88 . phylo_levelGroups ) pds
89 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
90 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
91 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
92 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
93 -- 2) compute the pairs in parallel
94 pairs = map (\(id,ngrams) ->
95 map (\(id',ngrams') ->
96 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
97 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
98 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
99 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
100 ) egos
101 pairs' = pairs `using` parList rdeepseq
102 in acc ++ (concat pairs')
103 ) [] $ phylo ^. phylo_periods
104 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
105
106
107 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
108 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
109 $ over ( phylo_periods
110 . traverse
111 . phylo_periodLevels
112 . traverse)
113 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
114 then
115 let pId = phyloLvl ^. phylo_levelPeriod
116 phyloCUnit = m ! pId
117 in phyloLvl
118 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
119 groups ++ [ (((pId,lvl),length groups)
120 , f obj pId lvl (length groups)
121 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
122 ] ) [] phyloCUnit)
123 else
124 phyloLvl )
125 phylo
126
127
128 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
129 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
130 (fis ^. phyloClique_support)
131 (fis ^. phyloClique_nodes)
132 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
133 (1,[0]) -- branchid (lvl,[path in the branching tree])
134 (fromList [("breaks",[0]),("seaLevels",[0])])
135 [] [] [] [] []
136
137
138 toPhylo1 :: [Document] -> Phylo -> Phylo
139 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
140 Constante start gap -> constanteTemporalMatching start gap
141 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
142 Adaptative steps -> adaptativeTemporalMatching steps
143 $ toGroupsProxi 1
144 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
145 where
146 --------------------------------------
147 phyloClique :: Map (Date,Date) [PhyloClique]
148 phyloClique = toPhyloClique phyloBase docs'
149 --------------------------------------
150 docs' :: Map (Date,Date) [Document]
151 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
152 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
153 --------------------------------------
154
155
156 ---------------------------
157 -- | Frequent Item Set | --
158 ---------------------------
159
160
161 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
162 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
163 filterClique keep thr f m = case keep of
164 False -> map (\l -> f thr l) m
165 True -> map (\l -> keepFilled (f) thr l) m
166
167
168 -- To filter Fis with small Support
169 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
170 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
171
172
173 -- To filter Fis with small Clique size
174 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
175 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
176
177
178 -- To filter nested Fis
179 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
180 filterCliqueByNested m =
181 let clq = map (\l ->
182 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
183 then mem
184 else
185 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
186 in fMax ++ [f] ) [] l)
187 $ elems m
188 clq' = clq `using` parList rdeepseq
189 in fromList $ zip (keys m) clq'
190
191
192 -- | To transform a time map of docs into a time map of Fis with some filters
193 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
194 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
195 Fis s s' -> -- traceFis "Filtered Fis"
196 filterCliqueByNested
197 {- \$ traceFis "Filtered by clique size" -}
198 $ filterClique True s' (filterCliqueBySize)
199 {- \$ traceFis "Filtered by support" -}
200 $ filterClique True s (filterCliqueBySupport)
201 {- \$ traceFis "Unfiltered Fis" -}
202 phyloClique
203 MaxClique s -> filterClique True s (filterCliqueBySize)
204 phyloClique
205 where
206 --------------------------------------
207 phyloClique :: Map (Date,Date) [PhyloClique]
208 phyloClique = case (clique $ getConfig phylo) of
209 Fis _ _ ->
210 let fis = map (\(prd,docs) ->
211 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
212 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
213 $ toList phyloDocs
214 fis' = fis `using` parList rdeepseq
215 in fromList fis'
216 MaxClique _ ->
217 let mcl = map (\(prd,docs) ->
218 let cooc = map round
219 $ foldl sumCooc empty
220 $ map listToMatrix
221 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
222 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.001 cooc))
223 $ toList phyloDocs
224 mcl' = mcl `using` parList rdeepseq
225 in fromList mcl'
226 --------------------------------------
227
228 -- dev viz graph maxClique getMaxClique
229
230
231 --------------------
232 -- | Coocurency | --
233 --------------------
234
235
236 -- To transform the docs into a time map of coocurency matrix
237 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
238 docsToTimeScaleCooc docs fdt =
239 let mCooc = fromListWith sumCooc
240 $ map (\(_d,l) -> (_d, listToMatrix l))
241 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
242 mCooc' = fromList
243 $ map (\t -> (t,empty))
244 $ toTimeScale (map date docs) 1
245 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
246 $ unionWith sumCooc mCooc mCooc'
247
248
249 -----------------------
250 -- | to Phylo Base | --
251 -----------------------
252 -- TODO anoe
253 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
254 groupDocsByPeriodRec f prds docs acc =
255 if ((null prds) || (null docs))
256 then acc
257 else
258 let prd = head' "groupBy" prds
259 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
260 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
261
262
263 -- To group a list of Documents by fixed periods
264 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
265 groupDocsByPeriod' f pds docs =
266 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
267 periods = map (inPeriode f docs') pds
268 periods' = periods `using` parList rdeepseq
269 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
270 $ fromList $ zip pds periods'
271 where
272 --------------------------------------
273 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
274 inPeriode f' h (start,end) =
275 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
276
277
278
279 -- To group a list of Documents by fixed periods
280 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
281 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
282 groupDocsByPeriod f pds es =
283 let periods = map (inPeriode f es) pds
284 periods' = periods `using` parList rdeepseq
285
286 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
287 $ fromList $ zip pds periods'
288 where
289 --------------------------------------
290 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
291 inPeriode f' h (start,end) =
292 fst $ partition (\d -> f' d >= start && f' d <= end) h
293 --------------------------------------
294
295
296 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
297 docsToTermFreq docs fdt =
298 let nbDocs = fromIntegral $ length docs
299 freqs = map (/(nbDocs))
300 $ fromList
301 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
302 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
303 sumFreqs = sum $ elems freqs
304 in map (/sumFreqs) freqs
305
306 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
307 docsToLastTermFreq n docs fdt =
308 let last = take n $ reverse $ sort $ map date docs
309 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
310 freqs = map (/(nbDocs))
311 $ fromList
312 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
313 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
314 sumFreqs = sum $ elems freqs
315 in map (/sumFreqs) freqs
316
317
318 -- To count the number of docs by unit of time
319 docsToTimeScaleNb :: [Document] -> Map Date Double
320 docsToTimeScaleNb docs =
321 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
322 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
323 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
324 $ unionWith (+) time docs'
325
326
327 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
328 initPhyloLevels lvlMax pId =
329 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
330
331
332 -- To init the basic elements of a Phylo
333 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
334 toPhyloBase docs lst conf =
335 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
336 params = defaultPhyloParam { _phyloParam_config = conf }
337 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
338 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
339 $ Phylo foundations
340 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
341 (docsToTimeScaleNb docs)
342 (docsToTermFreq docs (foundations ^. foundations_roots))
343 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
344 empty
345 empty
346 params
347 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)