]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
Merge branch 'dev' into dev-doc-table-optimization
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Viz.Phylo.PhyloMaker where
13
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey)
16 import Data.Set (size)
17 import Data.Vector (Vector)
18
19 import Gargantext.Prelude
20 import Gargantext.Viz.AdaptativePhylo
21 import Gargantext.Viz.Phylo.PhyloTools
22 import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
23 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
24 import Gargantext.Text.Context (TermList)
25 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
26
27 import Control.DeepSeq (NFData)
28 import Control.Parallel.Strategies (parList, rdeepseq, using)
29 import Debug.Trace (trace)
30 import Control.Lens hiding (Level)
31
32 import qualified Data.Vector as Vector
33 import qualified Data.Set as Set
34
35
36 ------------------
37 -- | To Phylo | --
38 ------------------
39
40
41 toPhylo :: [Document] -> TermList -> Config -> Phylo
42 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
43 $ traceToPhylo (phyloLevel conf) $
44 if (phyloLevel conf) > 1
45 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
46 else phylo1
47 where
48 --------------------------------------
49 phylo1 :: Phylo
50 phylo1 = toPhylo1 docs phyloBase
51 --------------------------------------
52 phyloBase :: Phylo
53 phyloBase = toPhyloBase docs lst conf
54 --------------------------------------
55
56
57
58 --------------------
59 -- | To Phylo 1 | --
60 --------------------
61
62 toGroupsProxi :: Level -> Phylo -> Phylo
63 toGroupsProxi lvl phylo =
64 let proximity = phyloProximity $ getConfig phylo
65 groupsProxi = foldlWithKey (\acc pId pds ->
66 -- 1) process period by period
67 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
68 $ elems
69 $ view ( phylo_periodLevels
70 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
71 . phylo_levelGroups ) pds
72 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
73 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
74 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
75 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
76 -- 2) compute the pairs in parallel
77 pairs = map (\(id,ngrams) ->
78 map (\(id',ngrams') ->
79 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
80 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
81 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
82 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
83 ) egos
84 pairs' = pairs `using` parList rdeepseq
85 in acc ++ (concat pairs')
86 ) [] $ phylo ^. phylo_periods
87 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
88
89
90 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
91 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
92 $ over ( phylo_periods
93 . traverse
94 . phylo_periodLevels
95 . traverse)
96 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
97 then
98 let pId = phyloLvl ^. phylo_levelPeriod
99 phyloCUnit = m ! pId
100 in phyloLvl
101 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
102 groups ++ [ (((pId,lvl),length groups)
103 , f obj pId lvl (length groups) (getRoots phylo)
104 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
105 ] ) [] phyloCUnit)
106 else
107 phyloLvl )
108 phylo
109
110
111 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
112 cliqueToGroup fis pId lvl idx fdt coocs =
113 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloClique_nodes) fdt
114 in PhyloGroup pId lvl idx ""
115 (fis ^. phyloClique_support)
116 ngrams
117 (ngramsToCooc ngrams coocs)
118 (1,[0]) -- branchid (lvl,[path in the branching tree])
119 (fromList [("breaks",[0]),("seaLevels",[0])])
120 [] [] [] []
121
122
123 toPhylo1 :: [Document] -> Phylo -> Phylo
124 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
125 Constante start gap -> constanteTemporalMatching start gap
126 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
127 Adaptative steps -> adaptativeTemporalMatching steps
128 $ toGroupsProxi 1
129 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
130 where
131 --------------------------------------
132 phyloClique :: Map (Date,Date) [PhyloClique]
133 phyloClique = toPhyloClique phyloBase docs'
134 --------------------------------------
135 docs' :: Map (Date,Date) [Document]
136 docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
137 --------------------------------------
138
139
140 ---------------------------
141 -- | Frequent Item Set | --
142 ---------------------------
143
144
145 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
146 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
147 filterClique keep thr f m = case keep of
148 False -> map (\l -> f thr l) m
149 True -> map (\l -> keepFilled (f) thr l) m
150
151
152 -- To filter Fis with small Support
153 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
154 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
155
156
157 -- To filter Fis with small Clique size
158 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
159 filterCliqueBySize thr l = filter (\clq -> (size $ clq ^. phyloClique_nodes) >= thr) l
160
161
162 -- To filter nested Fis
163 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
164 filterCliqueByNested m =
165 let clq = map (\l ->
166 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloClique_nodes) (Set.toList $ f ^. phyloClique_nodes)) mem)
167 then mem
168 else
169 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloClique_nodes) (Set.toList $ f' ^. phyloClique_nodes)) mem
170 in fMax ++ [f] ) [] l)
171 $ elems m
172 clq' = clq `using` parList rdeepseq
173 in fromList $ zip (keys m) clq'
174
175
176 -- To transform a time map of docs innto a time map of Fis with some filters
177 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
178 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
179 Fis s s' -> -- traceFis "Filtered Fis"
180 filterCliqueByNested
181 {- \$ traceFis "Filtered by clique size" -}
182 $ filterClique True s' (filterCliqueBySize)
183 {- \$ traceFis "Filtered by support" -}
184 $ filterClique True s (filterCliqueBySupport)
185 {- \$ traceFis "Unfiltered Fis" -}
186 phyloClique
187 MaxClique _ -> undefined
188 where
189 --------------------------------------
190 phyloClique :: Map (Date,Date) [PhyloClique]
191 phyloClique = case (clique $ getConfig phylo) of
192 Fis _ _ -> let fis = map (\(prd,docs) ->
193 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
194 in (prd, map (\f -> PhyloClique (fst f) (snd f) prd) lst))
195 $ toList phyloDocs
196 fis' = fis `using` parList rdeepseq
197 in fromList fis'
198 MaxClique _ -> undefined
199 --------------------------------------
200
201
202 --------------------
203 -- | Coocurency | --
204 --------------------
205
206
207 -- To transform the docs into a time map of coocurency matrix
208 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
209 docsToTimeScaleCooc docs fdt =
210 let mCooc = fromListWith sumCooc
211 $ map (\(_d,l) -> (_d, listToMatrix l))
212 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
213 mCooc' = fromList
214 $ map (\t -> (t,empty))
215 $ toTimeScale (map date docs) 1
216 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
217 $ unionWith sumCooc mCooc mCooc'
218
219
220 -----------------------
221 -- | to Phylo Base | --
222 -----------------------
223
224 -- To group a list of Documents by fixed periods
225 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
226 groupDocsByPeriod' f pds docs =
227 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
228 periods = map (inPeriode f docs') pds
229 periods' = periods `using` parList rdeepseq
230 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
231 $ fromList $ zip pds periods'
232 where
233 --------------------------------------
234 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
235 inPeriode f' h (start,end) =
236 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
237
238
239
240 -- To group a list of Documents by fixed periods
241 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
242 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
243 groupDocsByPeriod f pds es =
244 let periods = map (inPeriode f es) pds
245 periods' = periods `using` parList rdeepseq
246
247 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
248 $ fromList $ zip pds periods'
249 where
250 --------------------------------------
251 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
252 inPeriode f' h (start,end) =
253 fst $ partition (\d -> f' d >= start && f' d <= end) h
254 --------------------------------------
255
256
257 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
258 docsToTermFreq docs fdt =
259 let nbDocs = fromIntegral $ length docs
260 freqs = map (/(nbDocs))
261 $ fromList
262 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
263 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
264 sumFreqs = sum $ elems freqs
265 in map (/sumFreqs) freqs
266
267
268 -- To count the number of docs by unit of time
269 docsToTimeScaleNb :: [Document] -> Map Date Double
270 docsToTimeScaleNb docs =
271 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
272 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
273 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
274 $ unionWith (+) time docs'
275
276
277 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
278 initPhyloLevels lvlMax pId =
279 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
280
281
282 -- To init the basic elements of a Phylo
283 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
284 toPhyloBase docs lst conf =
285 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
286 params = defaultPhyloParam { _phyloParam_config = conf }
287 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
288 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
289 $ Phylo foundations
290 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
291 (docsToTimeScaleNb docs)
292 (docsToTermFreq docs (foundations ^. foundations_roots))
293 empty
294 params
295 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)