]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
[FIX] map list size filtered (others terms do not appear any more)
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
17
18 import Gargantext.Prelude
19 import Gargantext.Core.Viz.AdaptativePhylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Core.Text.Context (TermList)
24 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Methods.Distances (Distance(Conditional))
27 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
28
29
30 import Control.DeepSeq (NFData)
31 import Control.Parallel.Strategies (parList, rdeepseq, using)
32 import Debug.Trace (trace)
33 import Control.Lens hiding (Level)
34
35 import qualified Data.Vector as Vector
36 import qualified Data.Set as Set
37
38 ------------------
39 -- | To Phylo | --
40 ------------------
41
42 {-
43 -- TODO AD
44 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
45 | PhyloN { _phylo'_phylo1 :: Phylo}
46
47
48 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
49 toPhylo' (PhyloN phylo) = toPhylo'
50 toPhylo' (PhyloBase phylo) = toPhylo
51 -}
52
53
54 toPhylo :: Phylo -> Phylo
55 toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
56 $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
57 if (phyloLevel $ getConfig phyloStep) > 1
58 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
59 else phylo1
60 where
61 --------------------------------------
62 phyloAncestors :: Phylo
63 phyloAncestors =
64 if (findAncestors $ getConfig phyloStep)
65 then toHorizon phylo1
66 else phylo1
67 --------------------------------------
68 phylo1 :: Phylo
69 phylo1 = toPhylo1 phyloStep
70 -- > AD to db here
71 --------------------------------------
72
73
74
75 --------------------
76 -- | To Phylo 1 | --
77 --------------------
78
79 toGroupsProxi :: Level -> Phylo -> Phylo
80 toGroupsProxi lvl phylo =
81 let proximity = phyloProximity $ getConfig phylo
82 groupsProxi = foldlWithKey (\acc pId pds ->
83 -- 1) process period by period
84 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
85 $ elems
86 $ view ( phylo_periodLevels
87 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
88 . phylo_levelGroups ) pds
89 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
90 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
91 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
92 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
93 -- 2) compute the pairs in parallel
94 pairs = map (\(id,ngrams) ->
95 map (\(id',ngrams') ->
96 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
97 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
98 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
99 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
100 ) egos
101 pairs' = pairs `using` parList rdeepseq
102 in acc ++ (concat pairs')
103 ) [] $ phylo ^. phylo_periods
104 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
105
106
107 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
108 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
109 $ over ( phylo_periods
110 . traverse
111 . phylo_periodLevels
112 . traverse)
113 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
114 then
115 let pId = phyloLvl ^. phylo_levelPeriod
116 phyloCUnit = m ! pId
117 in phyloLvl
118 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
119 groups ++ [ (((pId,lvl),length groups)
120 , f obj pId lvl (length groups)
121 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
122 ] ) [] phyloCUnit)
123 else
124 phyloLvl )
125 phylo
126
127
128 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
129 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
130 (fis ^. phyloClique_support)
131 (fis ^. phyloClique_nodes)
132 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
133 (1,[0]) -- branchid (lvl,[path in the branching tree])
134 (fromList [("breaks",[0]),("seaLevels",[0])])
135 [] [] [] [] []
136
137
138 toPhylo1 :: Phylo -> Phylo
139 toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
140 Constante start gap -> constanteTemporalMatching start gap phyloStep
141 Adaptative steps -> adaptativeTemporalMatching steps phyloStep
142
143 -----------------------
144 -- | To Phylo Step | --
145 -----------------------
146
147
148 -- To build the first phylo step from docs and terms
149 toPhyloStep :: [Document] -> TermList -> Config -> Phylo
150 toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
151 Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique phyloBase
152 Adaptative _ -> toGroupsProxi 1 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
153 where
154 --------------------------------------
155 phyloClique :: Map (Date,Date) [PhyloClique]
156 phyloClique = toPhyloClique phyloBase docs'
157 --------------------------------------
158 docs' :: Map (Date,Date) [Document]
159 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
160 --------------------------------------
161 phyloBase :: Phylo
162 phyloBase = toPhyloBase docs lst conf
163 --------------------------------------
164
165 ---------------------------
166 -- | Frequent Item Set | --
167 ---------------------------
168
169
170 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
171 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
172 filterClique keep thr f m = case keep of
173 False -> map (\l -> f thr l) m
174 True -> map (\l -> keepFilled (f) thr l) m
175
176
177 -- To filter Fis with small Support
178 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
179 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
180
181
182 -- To filter Fis with small Clique size
183 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
184 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
185
186
187 -- To filter nested Fis
188 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
189 filterCliqueByNested m =
190 let clq = map (\l ->
191 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
192 then mem
193 else
194 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
195 in fMax ++ [f] ) [] l)
196 $ elems m
197 clq' = clq `using` parList rdeepseq
198 in fromList $ zip (keys m) clq'
199
200
201 -- | To transform a time map of docs into a time map of Fis with some filters
202 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
203 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
204 Fis s s' -> -- traceFis "Filtered Fis"
205 filterCliqueByNested
206 {- \$ traceFis "Filtered by clique size" -}
207 $ filterClique True s' (filterCliqueBySize)
208 {- \$ traceFis "Filtered by support" -}
209 $ filterClique True s (filterCliqueBySupport)
210 {- \$ traceFis "Unfiltered Fis" -}
211 phyloClique
212 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
213 phyloClique
214 where
215 --------------------------------------
216 phyloClique :: Map (Date,Date) [PhyloClique]
217 phyloClique = case (clique $ getConfig phylo) of
218 Fis _ _ ->
219 let fis = map (\(prd,docs) ->
220 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
221 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
222 $ toList phyloDocs
223 fis' = fis `using` parList rdeepseq
224 in fromList fis'
225 MaxClique _ thr filterType ->
226 let mcl = map (\(prd,docs) ->
227 let cooc = map round
228 $ foldl sumCooc empty
229 $ map listToMatrix
230 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
231 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques filterType Conditional thr cooc))
232 $ toList phyloDocs
233 mcl' = mcl `using` parList rdeepseq
234 in fromList mcl'
235 --------------------------------------
236
237 -- dev viz graph maxClique getMaxClique
238
239
240 --------------------
241 -- | Coocurency | --
242 --------------------
243
244
245 -- To transform the docs into a time map of coocurency matrix
246 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
247 docsToTimeScaleCooc docs fdt =
248 let mCooc = fromListWith sumCooc
249 $ map (\(_d,l) -> (_d, listToMatrix l))
250 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
251 mCooc' = fromList
252 $ map (\t -> (t,empty))
253 $ toTimeScale (map date docs) 1
254 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
255 $ unionWith sumCooc mCooc mCooc'
256
257
258 -----------------------
259 -- | to Phylo Base | --
260 -----------------------
261 -- TODO anoe
262 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
263 groupDocsByPeriodRec f prds docs acc =
264 if ((null prds) || (null docs))
265 then acc
266 else
267 let prd = head' "groupBy" prds
268 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
269 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
270
271
272 -- To group a list of Documents by fixed periods
273 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
274 groupDocsByPeriod' f pds docs =
275 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
276 periods = map (inPeriode f docs') pds
277 periods' = periods `using` parList rdeepseq
278 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
279 $ fromList $ zip pds periods'
280 where
281 --------------------------------------
282 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
283 inPeriode f' h (start,end) =
284 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
285
286
287
288 -- To group a list of Documents by fixed periods
289 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
290 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
291 groupDocsByPeriod f pds es =
292 let periods = map (inPeriode f es) pds
293 periods' = periods `using` parList rdeepseq
294
295 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
296 $ fromList $ zip pds periods'
297 where
298 --------------------------------------
299 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
300 inPeriode f' h (start,end) =
301 fst $ partition (\d -> f' d >= start && f' d <= end) h
302 --------------------------------------
303
304
305 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
306 docsToTermFreq docs fdt =
307 let nbDocs = fromIntegral $ length docs
308 freqs = map (/(nbDocs))
309 $ fromList
310 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
311 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
312 sumFreqs = sum $ elems freqs
313 in map (/sumFreqs) freqs
314
315 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
316 docsToLastTermFreq n docs fdt =
317 let last = take n $ reverse $ sort $ map date docs
318 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
319 freqs = map (/(nbDocs))
320 $ fromList
321 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
322 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
323 sumFreqs = sum $ elems freqs
324 in map (/sumFreqs) freqs
325
326
327 -- To count the number of docs by unit of time
328 docsToTimeScaleNb :: [Document] -> Map Date Double
329 docsToTimeScaleNb docs =
330 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
331 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
332 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
333 $ unionWith (+) time docs'
334
335
336 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
337 initPhyloLevels lvlMax pId =
338 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
339
340
341 -- To init the basic elements of a Phylo
342 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
343 toPhyloBase docs lst conf =
344 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
345 params = defaultPhyloParam { _phyloParam_config = conf }
346 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
347 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
348 $ Phylo foundations
349 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
350 (docsToTimeScaleNb docs)
351 (docsToTermFreq docs (foundations ^. foundations_roots))
352 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
353 empty
354 empty
355 params
356 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)