]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
[VERSION] +1 to 0.0.4.9.6
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
17 import Data.Text (Text)
18
19 import Gargantext.Prelude
20 import Gargantext.Core.Viz.AdaptativePhylo
21 import Gargantext.Core.Viz.Phylo.PhyloTools
22 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
23 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
24 import Gargantext.Core.Text.Context (TermList)
25 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
26 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
27 import Gargantext.Core.Methods.Distances (Distance(Conditional))
28 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
29
30
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
35
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
38
39 ------------------
40 -- | To Phylo | --
41 ------------------
42
43 {-
44 -- TODO AD
45 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
46 | PhyloN { _phylo'_phylo1 :: Phylo}
47
48
49 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
50 toPhylo' (PhyloN phylo) = toPhylo'
51 toPhylo' (PhyloBase phylo) = toPhylo
52 -}
53
54
55 toPhylo :: Phylo -> Phylo
56 toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
57 $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
58 if (phyloLevel $ getConfig phyloStep) > 1
59 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
60 else phylo1
61 where
62 --------------------------------------
63 phyloAncestors :: Phylo
64 phyloAncestors =
65 if (findAncestors $ getConfig phyloStep)
66 then toHorizon phylo1
67 else phylo1
68 --------------------------------------
69 phylo1 :: Phylo
70 phylo1 = toPhylo1 phyloStep
71 -- > AD to db here
72 --------------------------------------
73
74
75
76 --------------------
77 -- | To Phylo 1 | --
78 --------------------
79
80 toGroupsProxi :: Level -> Phylo -> Phylo
81 toGroupsProxi lvl phylo =
82 let proximity = phyloProximity $ getConfig phylo
83 groupsProxi = foldlWithKey (\acc pId pds ->
84 -- 1) process period by period
85 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
86 $ elems
87 $ view ( phylo_periodLevels
88 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
89 . phylo_levelGroups ) pds
90 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
91 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
92 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
93 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
94 -- 2) compute the pairs in parallel
95 pairs = map (\(id,ngrams) ->
96 map (\(id',ngrams') ->
97 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
98 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
99 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
100 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
101 ) egos
102 pairs' = pairs `using` parList rdeepseq
103 in acc ++ (concat pairs')
104 ) [] $ phylo ^. phylo_periods
105 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
106
107
108 appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
109 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
110 $ over ( phylo_periods
111 . traverse
112 . phylo_periodLevels
113 . traverse)
114 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
115 then
116 let pId = phyloLvl ^. phylo_levelPeriod
117 pId' = phyloLvl ^. phylo_levelPeriod'
118 phyloCUnit = m ! pId
119 in phyloLvl
120 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
121 groups ++ [ (((pId,lvl),length groups)
122 , f obj pId pId' lvl (length groups)
123 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
124 ] ) [] phyloCUnit)
125 else
126 phyloLvl )
127 phylo
128
129
130 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup
131 cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
132 (fis ^. phyloClique_support)
133 (fis ^. phyloClique_weight)
134 (fis ^. phyloClique_sources)
135 (fis ^. phyloClique_nodes)
136 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
137 (1,[0]) -- branchid (lvl,[path in the branching tree])
138 (fromList [("breaks",[0]),("seaLevels",[0])])
139 [] [] [] [] []
140
141
142 toPhylo1 :: Phylo -> Phylo
143 toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
144 Constante start gap -> constanteTemporalMatching start gap phyloStep
145 Adaptative steps -> adaptativeTemporalMatching steps phyloStep
146
147 -----------------------
148 -- | To Phylo Step | --
149 -----------------------
150
151
152 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
153 indexDates' m = map (\docs ->
154 let ds = map (\d -> date' d) docs
155 f = if (null ds)
156 then ""
157 else toFstDate ds
158 l = if (null ds)
159 then ""
160 else toLstDate ds
161 in (f,l)) m
162
163
164 -- To build the first phylo step from docs and terms
165 toPhyloStep :: [Document] -> TermList -> Config -> Phylo
166 toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
167 Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
168 Adaptative _ -> toGroupsProxi 1
169 $ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
170 where
171 --------------------------------------
172 phyloClique :: Map (Date,Date) [PhyloClique]
173 phyloClique = toPhyloClique phyloBase docs'
174 --------------------------------------
175 docs' :: Map (Date,Date) [Document]
176 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
177 --------------------------------------
178 phyloBase :: Phylo
179 phyloBase = toPhyloBase docs lst conf
180 --------------------------------------
181
182 ---------------------------
183 -- | Frequent Item Set | --
184 ---------------------------
185
186
187 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
188 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
189 filterClique keep thr f m = case keep of
190 False -> map (\l -> f thr l) m
191 True -> map (\l -> keepFilled (f) thr l) m
192
193
194 -- To filter Fis with small Support
195 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
196 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
197
198
199 -- To filter Fis with small Clique size
200 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
201 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
202
203
204 -- To filter nested Fis
205 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
206 filterCliqueByNested m =
207 let clq = map (\l ->
208 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
209 then mem
210 else
211 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
212 in fMax ++ [f] ) [] l)
213 $ elems m
214 clq' = clq `using` parList rdeepseq
215 in fromList $ zip (keys m) clq'
216
217
218 -- | To transform a time map of docs into a time map of Fis with some filters
219 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
220 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
221 Fis s s' -> -- traceFis "Filtered Fis"
222 filterCliqueByNested
223 {- \$ traceFis "Filtered by clique size" -}
224 $ filterClique True s' (filterCliqueBySize)
225 {- \$ traceFis "Filtered by support" -}
226 $ filterClique True s (filterCliqueBySupport)
227 {- \$ traceFis "Unfiltered Fis" -}
228 phyloClique
229 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
230 phyloClique
231 where
232 --------------------------------------
233 phyloClique :: Map (Date,Date) [PhyloClique]
234 phyloClique = case (clique $ getConfig phylo) of
235 Fis _ _ ->
236 let fis = map (\(prd,docs) ->
237 case (corpusParser $ getConfig phylo) of
238 Csv' _ -> let lst = toList
239 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
240 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
241 _ -> let lst = toList
242 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
243 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd Nothing []) lst)
244 )
245 $ toList phyloDocs
246 fis' = fis `using` parList rdeepseq
247 in fromList fis'
248 MaxClique _ thr filterType ->
249 let mcl = map (\(prd,docs) ->
250 let cooc = map round
251 $ foldl sumCooc empty
252 $ map listToMatrix
253 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
254 in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
255 $ toList phyloDocs
256 mcl' = mcl `using` parList rdeepseq
257 in fromList mcl'
258 --------------------------------------
259
260 -- dev viz graph maxClique getMaxClique
261
262
263 --------------------
264 -- | Coocurency | --
265 --------------------
266
267
268 -- To transform the docs into a time map of coocurency matrix
269 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
270 docsToTimeScaleCooc docs fdt =
271 let mCooc = fromListWith sumCooc
272 $ map (\(_d,l) -> (_d, listToMatrix l))
273 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
274 mCooc' = fromList
275 $ map (\t -> (t,empty))
276 $ toTimeScale (map date docs) 1
277 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
278 $ unionWith sumCooc mCooc mCooc'
279
280
281 -----------------------
282 -- | to Phylo Base | --
283 -----------------------
284 -- TODO anoe
285 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
286 groupDocsByPeriodRec f prds docs acc =
287 if ((null prds) || (null docs))
288 then acc
289 else
290 let prd = head' "groupBy" prds
291 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
292 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
293
294
295 -- To group a list of Documents by fixed periods
296 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
297 groupDocsByPeriod' f pds docs =
298 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
299 periods = map (inPeriode f docs') pds
300 periods' = periods `using` parList rdeepseq
301 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
302 $ fromList $ zip pds periods'
303 where
304 --------------------------------------
305 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
306 inPeriode f' h (start,end) =
307 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
308
309
310
311 -- To group a list of Documents by fixed periods
312 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
313 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
314 groupDocsByPeriod f pds es =
315 let periods = map (inPeriode f es) pds
316 periods' = periods `using` parList rdeepseq
317
318 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
319 $ fromList $ zip pds periods'
320 where
321 --------------------------------------
322 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
323 inPeriode f' h (start,end) =
324 fst $ partition (\d -> f' d >= start && f' d <= end) h
325 --------------------------------------
326
327
328 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
329 docsToTermFreq docs fdt =
330 let nbDocs = fromIntegral $ length docs
331 freqs = map (/(nbDocs))
332 $ fromList
333 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
334 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
335 sumFreqs = sum $ elems freqs
336 in map (/sumFreqs) freqs
337
338 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
339 docsToLastTermFreq n docs fdt =
340 let last = take n $ reverse $ sort $ map date docs
341 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
342 freqs = map (/(nbDocs))
343 $ fromList
344 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
345 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
346 sumFreqs = sum $ elems freqs
347 in map (/sumFreqs) freqs
348
349
350 -- To count the number of docs by unit of time
351 docsToTimeScaleNb :: [Document] -> Map Date Double
352 docsToTimeScaleNb docs =
353 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
354 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
355 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
356 $ unionWith (+) time docs'
357
358
359 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
360 initPhyloLevels lvlMax pId =
361 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]
362
363
364
365 -- To init the basic elements of a Phylo
366 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
367 toPhyloBase docs lst conf =
368 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
369 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
370 params = defaultPhyloParam { _phyloParam_config = conf }
371 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
372 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
373 $ Phylo foundations
374 docsSources
375 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
376 (docsToTimeScaleNb docs)
377 (docsToTermFreq docs (foundations ^. foundations_roots))
378 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
379 empty
380 empty
381 params
382 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)