]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
Merge branch 'dev' into dev-doc-table-optimization
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Viz.Phylo.PhyloMaker where
13
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
17
18 import Gargantext.Prelude
19 import Gargantext.Viz.AdaptativePhylo
20 import Gargantext.Viz.Phylo.PhyloTools
21 import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Text.Context (TermList)
24 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Viz.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Viz.Graph.Distances (Distance(Conditional))
27
28 import Control.DeepSeq (NFData)
29 import Control.Parallel.Strategies (parList, rdeepseq, using)
30 import Debug.Trace (trace)
31 import Control.Lens hiding (Level)
32
33 import qualified Data.Vector as Vector
34 import qualified Data.Set as Set
35
36
37 ------------------
38 -- | To Phylo | --
39 ------------------
40
41
42 toPhylo :: [Document] -> TermList -> Config -> Phylo
43 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
44 $ traceToPhylo (phyloLevel conf) $
45 if (phyloLevel conf) > 1
46 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
47 else phylo1
48 where
49 --------------------------------------
50 phylo1 :: Phylo
51 phylo1 = toPhylo1 docs phyloBase
52 --------------------------------------
53 phyloBase :: Phylo
54 phyloBase = toPhyloBase docs lst conf
55 --------------------------------------
56
57
58
59 --------------------
60 -- | To Phylo 1 | --
61 --------------------
62
63 toGroupsProxi :: Level -> Phylo -> Phylo
64 toGroupsProxi lvl phylo =
65 let proximity = phyloProximity $ getConfig phylo
66 groupsProxi = foldlWithKey (\acc pId pds ->
67 -- 1) process period by period
68 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
69 $ elems
70 $ view ( phylo_periodLevels
71 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
72 . phylo_levelGroups ) pds
73 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
74 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
75 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
76 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
77 -- 2) compute the pairs in parallel
78 pairs = map (\(id,ngrams) ->
79 map (\(id',ngrams') ->
80 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
81 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
82 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
83 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
84 ) egos
85 pairs' = pairs `using` parList rdeepseq
86 in acc ++ (concat pairs')
87 ) [] $ phylo ^. phylo_periods
88 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
89
90
91 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
92 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
93 $ over ( phylo_periods
94 . traverse
95 . phylo_periodLevels
96 . traverse)
97 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
98 then
99 let pId = phyloLvl ^. phylo_levelPeriod
100 phyloCUnit = m ! pId
101 in phyloLvl
102 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
103 groups ++ [ (((pId,lvl),length groups)
104 , f obj pId lvl (length groups)
105 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
106 ] ) [] phyloCUnit)
107 else
108 phyloLvl )
109 phylo
110
111
112 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
113 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
114 (fis ^. phyloClique_support)
115 (fis ^. phyloClique_nodes)
116 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
117 (1,[0]) -- branchid (lvl,[path in the branching tree])
118 (fromList [("breaks",[0]),("seaLevels",[0])])
119 [] [] [] []
120
121
122 toPhylo1 :: [Document] -> Phylo -> Phylo
123 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
124 Constante start gap -> constanteTemporalMatching start gap
125 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
126 Adaptative steps -> adaptativeTemporalMatching steps
127 $ toGroupsProxi 1
128 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
129 where
130 --------------------------------------
131 phyloClique :: Map (Date,Date) [PhyloClique]
132 phyloClique = toPhyloClique phyloBase docs'
133 --------------------------------------
134 docs' :: Map (Date,Date) [Document]
135 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
136 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
137 --------------------------------------
138
139
140 ---------------------------
141 -- | Frequent Item Set | --
142 ---------------------------
143
144
145 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
146 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
147 filterClique keep thr f m = case keep of
148 False -> map (\l -> f thr l) m
149 True -> map (\l -> keepFilled (f) thr l) m
150
151
152 -- To filter Fis with small Support
153 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
154 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
155
156
157 -- To filter Fis with small Clique size
158 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
159 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
160
161
162 -- To filter nested Fis
163 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
164 filterCliqueByNested m =
165 let clq = map (\l ->
166 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
167 then mem
168 else
169 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
170 in fMax ++ [f] ) [] l)
171 $ elems m
172 clq' = clq `using` parList rdeepseq
173 in fromList $ zip (keys m) clq'
174
175
176 -- | To transform a time map of docs into a time map of Fis with some filters
177 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
178 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
179 Fis s s' -> -- traceFis "Filtered Fis"
180 filterCliqueByNested
181 {- \$ traceFis "Filtered by clique size" -}
182 $ filterClique True s' (filterCliqueBySize)
183 {- \$ traceFis "Filtered by support" -}
184 $ filterClique True s (filterCliqueBySupport)
185 {- \$ traceFis "Unfiltered Fis" -}
186 phyloClique
187 MaxClique s -> filterClique True s (filterCliqueBySize)
188 phyloClique
189 where
190 --------------------------------------
191 phyloClique :: Map (Date,Date) [PhyloClique]
192 phyloClique = case (clique $ getConfig phylo) of
193 Fis _ _ ->
194 let fis = map (\(prd,docs) ->
195 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
196 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
197 $ toList phyloDocs
198 fis' = fis `using` parList rdeepseq
199 in fromList fis'
200 MaxClique _ ->
201 let mcl = map (\(prd,docs) ->
202 let cooc = map round
203 $ foldl sumCooc empty
204 $ map listToMatrix
205 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
206 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.001 cooc))
207 $ toList phyloDocs
208 mcl' = mcl `using` parList rdeepseq
209 in fromList mcl'
210 --------------------------------------
211
212 -- dev viz graph maxClique getMaxClique
213
214
215 --------------------
216 -- | Coocurency | --
217 --------------------
218
219
220 -- To transform the docs into a time map of coocurency matrix
221 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
222 docsToTimeScaleCooc docs fdt =
223 let mCooc = fromListWith sumCooc
224 $ map (\(_d,l) -> (_d, listToMatrix l))
225 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
226 mCooc' = fromList
227 $ map (\t -> (t,empty))
228 $ toTimeScale (map date docs) 1
229 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
230 $ unionWith sumCooc mCooc mCooc'
231
232
233 -----------------------
234 -- | to Phylo Base | --
235 -----------------------
236
237 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
238 groupDocsByPeriodRec f prds docs acc =
239 if ((null prds) || (null docs))
240 then acc
241 else
242 let prd = head' "groupBy" prds
243 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
244 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
245
246
247 -- To group a list of Documents by fixed periods
248 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
249 groupDocsByPeriod' f pds docs =
250 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
251 periods = map (inPeriode f docs') pds
252 periods' = periods `using` parList rdeepseq
253 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
254 $ fromList $ zip pds periods'
255 where
256 --------------------------------------
257 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
258 inPeriode f' h (start,end) =
259 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
260
261
262
263 -- To group a list of Documents by fixed periods
264 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
265 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
266 groupDocsByPeriod f pds es =
267 let periods = map (inPeriode f es) pds
268 periods' = periods `using` parList rdeepseq
269
270 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
271 $ fromList $ zip pds periods'
272 where
273 --------------------------------------
274 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
275 inPeriode f' h (start,end) =
276 fst $ partition (\d -> f' d >= start && f' d <= end) h
277 --------------------------------------
278
279
280 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
281 docsToTermFreq docs fdt =
282 let nbDocs = fromIntegral $ length docs
283 freqs = map (/(nbDocs))
284 $ fromList
285 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
286 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
287 sumFreqs = sum $ elems freqs
288 in map (/sumFreqs) freqs
289
290
291 -- To count the number of docs by unit of time
292 docsToTimeScaleNb :: [Document] -> Map Date Double
293 docsToTimeScaleNb docs =
294 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
295 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
296 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
297 $ unionWith (+) time docs'
298
299
300 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
301 initPhyloLevels lvlMax pId =
302 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
303
304
305 -- To init the basic elements of a Phylo
306 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
307 toPhyloBase docs lst conf =
308 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
309 params = defaultPhyloParam { _phyloParam_config = conf }
310 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
311 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
312 $ Phylo foundations
313 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
314 (docsToTimeScaleNb docs)
315 (docsToTermFreq docs (foundations ^. foundations_roots))
316 empty
317 empty
318 params
319 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)