]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
[FEAT][PHYLO] corpusId to Documents function
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13
14
15 import Control.DeepSeq (NFData)
16 import Control.Lens hiding (Level)
17 import Control.Parallel.Strategies (parList, rdeepseq, using)
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
20 import Data.Text (Text)
21 import Data.Vector (Vector)
22 import Debug.Trace (trace)
23
24 import Gargantext.Core.Methods.Distances (Distance(Conditional))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Text.Context (TermList)
27 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
28 import Gargantext.Core.Viz.Phylo
29 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
30 import Gargantext.Core.Viz.Phylo.PhyloTools
31 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
32 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
33 import Gargantext.Prelude
34
35 import qualified Data.Set as Set
36 import qualified Data.Vector as Vector
37
38 ------------------
39 -- | To Phylo | --
40 ------------------
41
42 {-
43 -- TODO AD
44 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
45 | PhyloN { _phylo'_phylo1 :: Phylo}
46
47
48 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
49 toPhylo' (PhyloN phylo) = toPhylo'
50 toPhylo' (PhyloBase phylo) = toPhylo
51 -}
52
53
54 toPhylo :: Phylo -> Phylo
55 toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
56 $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
57 if (phyloLevel $ getConfig phyloStep) > 1
58 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
59 else phylo1
60 where
61 --------------------------------------
62 phyloAncestors :: Phylo
63 phyloAncestors =
64 if (findAncestors $ getConfig phyloStep)
65 then toHorizon phylo1
66 else phylo1
67 --------------------------------------
68 phylo1 :: Phylo
69 phylo1 = toPhylo1 phyloStep
70 -- > AD to db here
71 --------------------------------------
72
73
74
75 --------------------
76 -- | To Phylo 1 | --
77 --------------------
78
79 toGroupsProxi :: Level -> Phylo -> Phylo
80 toGroupsProxi lvl phylo =
81 let proximity = phyloProximity $ getConfig phylo
82 groupsProxi = foldlWithKey (\acc pId pds ->
83 -- 1) process period by period
84 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
85 $ elems
86 $ view ( phylo_periodLevels
87 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
88 . phylo_levelGroups ) pds
89 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
90 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
91 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
92 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
93 -- 2) compute the pairs in parallel
94 pairs = map (\(id,ngrams) ->
95 map (\(id',ngrams') ->
96 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
97 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
98 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
99 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
100 ) egos
101 pairs' = pairs `using` parList rdeepseq
102 in acc ++ (concat pairs')
103 ) [] $ phylo ^. phylo_periods
104 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
105
106
107 appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
108 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
109 $ over ( phylo_periods
110 . traverse
111 . phylo_periodLevels
112 . traverse)
113 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
114 then
115 let pId = phyloLvl ^. phylo_levelPeriod
116 pId' = phyloLvl ^. phylo_levelPeriod'
117 phyloCUnit = m ! pId
118 in phyloLvl
119 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
120 groups ++ [ (((pId,lvl),length groups)
121 , f obj pId pId' lvl (length groups)
122 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
123 ] ) [] phyloCUnit)
124 else
125 phyloLvl )
126 phylo
127
128
129 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup
130 cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
131 (fis ^. phyloClique_support)
132 (fis ^. phyloClique_weight)
133 (fis ^. phyloClique_sources)
134 (fis ^. phyloClique_nodes)
135 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
136 (1,[0]) -- branchid (lvl,[path in the branching tree])
137 (fromList [("breaks",[0]),("seaLevels",[0])])
138 [] [] [] [] []
139
140
141 toPhylo1 :: Phylo -> Phylo
142 toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
143 Constante start gap -> constanteTemporalMatching start gap phyloStep
144 Adaptative steps -> adaptativeTemporalMatching steps phyloStep
145
146 -----------------------
147 -- | To Phylo Step | --
148 -----------------------
149
150
151 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
152 indexDates' m = map (\docs ->
153 let ds = map (\d -> date' d) docs
154 f = if (null ds)
155 then ""
156 else toFstDate ds
157 l = if (null ds)
158 then ""
159 else toLstDate ds
160 in (f,l)) m
161
162
163 -- To build the first phylo step from docs and terms
164 -- QL: backend entre phyloBase et phyloClique
165 toPhyloStep :: [Document] -> TermList -> Config -> Phylo
166 toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
167 Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
168 Adaptative _ -> toGroupsProxi 1
169 $ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
170 where
171 --------------------------------------
172 phyloClique :: Map (Date,Date) [PhyloClique]
173 phyloClique = toPhyloClique phyloBase docs'
174 --------------------------------------
175 docs' :: Map (Date,Date) [Document]
176 -- QL: Time Consuming here
177 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
178 --------------------------------------
179 phyloBase :: Phylo
180 phyloBase = toPhyloBase docs lst conf
181 --------------------------------------
182
183 ---------------------------
184 -- | Frequent Item Set | --
185 ---------------------------
186
187
188 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
189 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
190 filterClique keep thr f m = case keep of
191 False -> map (\l -> f thr l) m
192 True -> map (\l -> keepFilled (f) thr l) m
193
194
195 -- To filter Fis with small Support
196 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
197 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
198
199
200 -- To filter Fis with small Clique size
201 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
202 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
203
204
205 -- To filter nested Fis
206 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
207 filterCliqueByNested m =
208 let clq = map (\l ->
209 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
210 then mem
211 else
212 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
213 in fMax ++ [f] ) [] l)
214 $ elems m
215 clq' = clq `using` parList rdeepseq
216 in fromList $ zip (keys m) clq'
217
218
219 -- | To transform a time map of docs into a time map of Fis with some filters
220 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
221 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
222 Fis s s' -> -- traceFis "Filtered Fis"
223 filterCliqueByNested
224 {- \$ traceFis "Filtered by clique size" -}
225 $ filterClique True s' (filterCliqueBySize)
226 {- \$ traceFis "Filtered by support" -}
227 $ filterClique True s (filterCliqueBySupport)
228 {- \$ traceFis "Unfiltered Fis" -}
229 phyloClique
230 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
231 phyloClique
232 where
233 --------------------------------------
234 phyloClique :: Map (Date,Date) [PhyloClique]
235 phyloClique = case (clique $ getConfig phylo) of
236 Fis _ _ ->
237 let fis = map (\(prd,docs) ->
238 case (corpusParser $ getConfig phylo) of
239 Csv' _ -> let lst = toList
240 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
241 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
242 _ -> let lst = toList
243 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
244 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd Nothing []) lst)
245 )
246 $ toList phyloDocs
247 fis' = fis `using` parList rdeepseq
248 in fromList fis'
249 MaxClique _ thr filterType ->
250 let mcl = map (\(prd,docs) ->
251 let cooc = map round
252 $ foldl sumCooc empty
253 $ map listToMatrix
254 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
255 in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
256 $ toList phyloDocs
257 mcl' = mcl `using` parList rdeepseq
258 in fromList mcl'
259 --------------------------------------
260
261 -- dev viz graph maxClique getMaxClique
262
263
264 --------------------
265 -- | Coocurency | --
266 --------------------
267
268
269 -- To transform the docs into a time map of coocurency matrix
270 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
271 docsToTimeScaleCooc docs fdt =
272 let mCooc = fromListWith sumCooc
273 $ map (\(_d,l) -> (_d, listToMatrix l))
274 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
275 mCooc' = fromList
276 $ map (\t -> (t,empty))
277 $ toTimeScale (map date docs) 1
278 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
279 $ unionWith sumCooc mCooc mCooc'
280
281
282 -----------------------
283 -- | to Phylo Base | --
284 -----------------------
285 -- TODO anoe
286 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
287 groupDocsByPeriodRec f prds docs acc =
288 if ((null prds) || (null docs))
289 then acc
290 else
291 let prd = head' "groupBy" prds
292 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
293 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
294
295
296 -- To group a list of Documents by fixed periods
297 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
298 groupDocsByPeriod' f pds docs =
299 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
300 periods = map (inPeriode f docs') pds
301 periods' = periods `using` parList rdeepseq
302 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
303 $ fromList $ zip pds periods'
304 where
305 --------------------------------------
306 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
307 inPeriode f' h (start,end) =
308 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
309
310
311
312 -- To group a list of Documents by fixed periods
313 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
314 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
315 groupDocsByPeriod f pds es =
316 let periods = map (inPeriode f es) pds
317 periods' = periods `using` parList rdeepseq
318
319 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
320 $ fromList $ zip pds periods'
321 where
322 --------------------------------------
323 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
324 inPeriode f' h (start,end) =
325 fst $ partition (\d -> f' d >= start && f' d <= end) h
326 --------------------------------------
327
328
329 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
330 docsToTermFreq docs fdt =
331 let nbDocs = fromIntegral $ length docs
332 freqs = map (/(nbDocs))
333 $ fromList
334 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
335 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
336 sumFreqs = sum $ elems freqs
337 in map (/sumFreqs) freqs
338
339 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
340 docsToLastTermFreq n docs fdt =
341 let last = take n $ reverse $ sort $ map date docs
342 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
343 freqs = map (/(nbDocs))
344 $ fromList
345 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
346 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
347 sumFreqs = sum $ elems freqs
348 in map (/sumFreqs) freqs
349
350
351 -- To count the number of docs by unit of time
352 docsToTimeScaleNb :: [Document] -> Map Date Double
353 docsToTimeScaleNb docs =
354 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
355 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
356 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
357 $ unionWith (+) time docs'
358
359
360 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
361 initPhyloLevels lvlMax pId =
362 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]
363
364
365
366 -- To init the basic elements of a Phylo
367 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
368 toPhyloBase docs lst conf =
369 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
370 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
371 params = defaultPhyloParam { _phyloParam_config = conf }
372 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
373 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
374 $ Phylo foundations
375 docsSources
376 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
377 (docsToTimeScaleNb docs)
378 (docsToTermFreq docs (foundations ^. foundations_roots))
379 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
380 empty
381 empty
382 params
383 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)