]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
changement du seuil
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.PhyloMaker where
17
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
20 import Data.Vector (Vector)
21
22 import Gargantext.Prelude
23 import Gargantext.Viz.AdaptativePhylo
24 import Gargantext.Viz.Phylo.PhyloTools
25 import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
26 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
27 import Gargantext.Text.Context (TermList)
28 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
29 import Gargantext.Viz.Graph.MaxClique (getMaxCliques)
30
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
35
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
38
39
40 ------------------
41 -- | To Phylo | --
42 ------------------
43
44
45 toPhylo :: [Document] -> TermList -> Config -> Phylo
46 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
47 $ traceToPhylo (phyloLevel conf) $
48 if (phyloLevel conf) > 1
49 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
50 else phylo1
51 where
52 --------------------------------------
53 phylo1 :: Phylo
54 phylo1 = toPhylo1 docs phyloBase
55 --------------------------------------
56 phyloBase :: Phylo
57 phyloBase = toPhyloBase docs lst conf
58 --------------------------------------
59
60
61
62 --------------------
63 -- | To Phylo 1 | --
64 --------------------
65
66 toGroupsProxi :: Level -> Phylo -> Phylo
67 toGroupsProxi lvl phylo =
68 let proximity = phyloProximity $ getConfig phylo
69 groupsProxi = foldlWithKey (\acc pId pds ->
70 -- 1) process period by period
71 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
72 $ elems
73 $ view ( phylo_periodLevels
74 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
75 . phylo_levelGroups ) pds
76 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
77 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
78 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
79 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
80 -- 2) compute the pairs in parallel
81 pairs = map (\(id,ngrams) ->
82 map (\(id',ngrams') ->
83 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
84 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
85 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
86 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
87 ) egos
88 pairs' = pairs `using` parList rdeepseq
89 in acc ++ (concat pairs')
90 ) [] $ phylo ^. phylo_periods
91 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
92
93
94 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
95 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
96 $ over ( phylo_periods
97 . traverse
98 . phylo_periodLevels
99 . traverse)
100 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
101 then
102 let pId = phyloLvl ^. phylo_levelPeriod
103 phyloCUnit = m ! pId
104 in phyloLvl
105 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
106 groups ++ [ (((pId,lvl),length groups)
107 , f obj pId lvl (length groups)
108 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
109 ] ) [] phyloCUnit)
110 else
111 phyloLvl )
112 phylo
113
114
115 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
116 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
117 (fis ^. phyloClique_support)
118 (fis ^. phyloClique_nodes)
119 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
120 (1,[0]) -- | branchid (lvl,[path in the branching tree])
121 (fromList [("breaks",[0]),("seaLevels",[0])])
122 [] [] [] []
123
124
125 toPhylo1 :: [Document] -> Phylo -> Phylo
126 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
127 Constante start gap -> constanteTemporalMatching start gap
128 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
129 Adaptative steps -> adaptativeTemporalMatching steps
130 $ toGroupsProxi 1
131 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
132 where
133 --------------------------------------
134 phyloClique :: Map (Date,Date) [PhyloClique]
135 phyloClique = toPhyloClique phyloBase docs'
136 --------------------------------------
137 docs' :: Map (Date,Date) [Document]
138 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
139 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
140 --------------------------------------
141
142
143 ---------------------------
144 -- | Frequent Item Set | --
145 ---------------------------
146
147
148 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
149 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
150 filterClique keep thr f m = case keep of
151 False -> map (\l -> f thr l) m
152 True -> map (\l -> keepFilled (f) thr l) m
153
154
155 -- | To filter Fis with small Support
156 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
157 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
158
159
160 -- | To filter Fis with small Clique size
161 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
162 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
163
164
165 -- | To filter nested Fis
166 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
167 filterCliqueByNested m =
168 let clq = map (\l ->
169 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
170 then mem
171 else
172 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
173 in fMax ++ [f] ) [] l)
174 $ elems m
175 clq' = clq `using` parList rdeepseq
176 in fromList $ zip (keys m) clq'
177
178
179 -- | To transform a time map of docs into a time map of Fis with some filters
180 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
181 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
182 Fis s s' -> -- traceFis "Filtered Fis"
183 filterCliqueByNested
184 -- $ traceFis "Filtered by clique size"
185 $ filterClique True s' (filterCliqueBySize)
186 -- $ traceFis "Filtered by support"
187 $ filterClique True s (filterCliqueBySupport)
188 -- $ traceFis "Unfiltered Fis"
189 phyloClique
190 MaxClique s -> filterClique True s (filterCliqueBySize)
191 phyloClique
192 where
193 --------------------------------------
194 phyloClique :: Map (Date,Date) [PhyloClique]
195 phyloClique = case (clique $ getConfig phylo) of
196 Fis _ _ ->
197 let fis = map (\(prd,docs) ->
198 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
199 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
200 $ toList phyloDocs
201 fis' = fis `using` parList rdeepseq
202 in fromList fis'
203 MaxClique _ ->
204 let mcl = map (\(prd,docs) ->
205 let cooc = map round
206 $ foldl sumCooc empty
207 $ map listToMatrix
208 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
209
210 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques 0.001 cooc))
211
212 $ toList phyloDocs
213 mcl' = mcl `using` parList rdeepseq
214 in fromList mcl'
215 --------------------------------------
216
217 -- dev viz graph maxClique getMaxClique
218
219
220 --------------------
221 -- | Coocurency | --
222 --------------------
223
224
225 -- | To transform the docs into a time map of coocurency matrix
226 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
227 docsToTimeScaleCooc docs fdt =
228 let mCooc = fromListWith sumCooc
229 $ map (\(_d,l) -> (_d, listToMatrix l))
230 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
231 mCooc' = fromList
232 $ map (\t -> (t,empty))
233 $ toTimeScale (map date docs) 1
234 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
235 $ unionWith sumCooc mCooc mCooc'
236
237
238 -----------------------
239 -- | to Phylo Base | --
240 -----------------------
241
242 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
243 groupDocsByPeriodRec f prds docs acc =
244 if ((null prds) || (null docs))
245 then acc
246 else
247 let prd = head' "groupBy" prds
248 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
249 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
250
251
252 -- | To group a list of Documents by fixed periods
253 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
254 groupDocsByPeriod' f pds docs =
255 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
256 periods = map (inPeriode f docs') pds
257 periods' = periods `using` parList rdeepseq
258 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
259 $ fromList $ zip pds periods'
260 where
261 --------------------------------------
262 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
263 inPeriode f' h (start,end) =
264 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
265
266
267
268 -- | To group a list of Documents by fixed periods
269 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
270 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
271 groupDocsByPeriod f pds es =
272 let periods = map (inPeriode f es) pds
273 periods' = periods `using` parList rdeepseq
274
275 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
276 $ fromList $ zip pds periods'
277 where
278 --------------------------------------
279 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
280 inPeriode f' h (start,end) =
281 fst $ partition (\d -> f' d >= start && f' d <= end) h
282 --------------------------------------
283
284
285 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
286 docsToTermFreq docs fdt =
287 let nbDocs = fromIntegral $ length docs
288 freqs = map (/(nbDocs))
289 $ fromList
290 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
291 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
292 sumFreqs = sum $ elems freqs
293 in map (/sumFreqs) freqs
294
295
296 -- | To count the number of docs by unit of time
297 docsToTimeScaleNb :: [Document] -> Map Date Double
298 docsToTimeScaleNb docs =
299 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
300 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
301 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
302 $ unionWith (+) time docs'
303
304
305 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
306 initPhyloLevels lvlMax pId =
307 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
308
309
310 -- | To init the basic elements of a Phylo
311 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
312 toPhyloBase docs lst conf =
313 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
314 params = defaultPhyloParam { _phyloParam_config = conf }
315 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
316 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
317 $ Phylo foundations
318 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
319 (docsToTimeScaleNb docs)
320 (docsToTermFreq docs (foundations ^. foundations_roots))
321 empty
322 empty
323 params
324 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)