]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
Merge branch 'dev' into dev-phylo
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.PhyloMaker where
17
18 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey)
20 import Data.Set (size)
21 import Data.Vector (Vector)
22
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
27 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
28 import Gargantext.Text.Context (TermList)
29 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
30
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
35
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
38
39
40 ------------------
41 -- | To Phylo | --
42 ------------------
43
44
45 toPhylo :: [Document] -> TermList -> Config -> Phylo
46 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
47 $ traceToPhylo (phyloLevel conf) $
48 if (phyloLevel conf) > 1
49 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
50 else phylo1
51 where
52 --------------------------------------
53 phylo1 :: Phylo
54 phylo1 = toPhylo1 docs phyloBase
55 --------------------------------------
56 phyloBase :: Phylo
57 phyloBase = toPhyloBase docs lst conf
58 --------------------------------------
59
60
61
62 --------------------
63 -- | To Phylo 1 | --
64 --------------------
65
66 toGroupsProxi :: Level -> Phylo -> Phylo
67 toGroupsProxi lvl phylo =
68 let proximity = phyloProximity $ getConfig phylo
69 groupsProxi = foldlWithKey (\acc pId pds ->
70 -- 1) process period by period
71 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
72 $ elems
73 $ view ( phylo_periodLevels
74 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
75 . phylo_levelGroups ) pds
76 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
77 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
78 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
79 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
80 -- 2) compute the pairs in parallel
81 pairs = map (\(id,ngrams) ->
82 map (\(id',ngrams') ->
83 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
84 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
85 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
86 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
87 ) egos
88 pairs' = pairs `using` parList rdeepseq
89 in acc ++ (concat pairs')
90 ) [] $ phylo ^. phylo_periods
91 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
92
93
94 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
95 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
96 $ over ( phylo_periods
97 . traverse
98 . phylo_periodLevels
99 . traverse)
100 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
101 then
102 let pId = phyloLvl ^. phylo_levelPeriod
103 phyloCUnit = m ! pId
104 in phyloLvl
105 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
106 groups ++ [ (((pId,lvl),length groups)
107 , f obj pId lvl (length groups) (getRoots phylo)
108 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
109 ] ) [] phyloCUnit)
110 else
111 phyloLvl )
112 phylo
113
114
115 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
116 cliqueToGroup fis pId lvl idx fdt coocs =
117 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloClique_nodes) fdt
118 in PhyloGroup pId lvl idx ""
119 (fis ^. phyloClique_support)
120 ngrams
121 (ngramsToCooc ngrams coocs)
122 (1,[0]) -- | branchid (lvl,[path in the branching tree])
123 (fromList [("breaks",[0]),("seaLevels",[0])])
124 [] [] [] []
125
126
127 toPhylo1 :: [Document] -> Phylo -> Phylo
128 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
129 Constante start gap -> constanteTemporalMatching start gap
130 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
131 Adaptative steps -> adaptativeTemporalMatching steps
132 $ toGroupsProxi 1
133 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
134 where
135 --------------------------------------
136 phyloClique :: Map (Date,Date) [PhyloClique]
137 phyloClique = toPhyloClique phyloBase docs'
138 --------------------------------------
139 docs' :: Map (Date,Date) [Document]
140 docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
141 --------------------------------------
142
143
144 ---------------------------
145 -- | Frequent Item Set | --
146 ---------------------------
147
148
149 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
150 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
151 filterClique keep thr f m = case keep of
152 False -> map (\l -> f thr l) m
153 True -> map (\l -> keepFilled (f) thr l) m
154
155
156 -- | To filter Fis with small Support
157 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
158 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
159
160
161 -- | To filter Fis with small Clique size
162 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
163 filterCliqueBySize thr l = filter (\clq -> (size $ clq ^. phyloClique_nodes) >= thr) l
164
165
166 -- | To filter nested Fis
167 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
168 filterCliqueByNested m =
169 let clq = map (\l ->
170 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloClique_nodes) (Set.toList $ f ^. phyloClique_nodes)) mem)
171 then mem
172 else
173 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloClique_nodes) (Set.toList $ f' ^. phyloClique_nodes)) mem
174 in fMax ++ [f] ) [] l)
175 $ elems m
176 clq' = clq `using` parList rdeepseq
177 in fromList $ zip (keys m) clq'
178
179
180 -- | To transform a time map of docs innto a time map of Fis with some filters
181 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
182 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
183 Fis s s' -> -- traceFis "Filtered Fis"
184 filterCliqueByNested
185 -- $ traceFis "Filtered by clique size"
186 $ filterClique True s' (filterCliqueBySize)
187 -- $ traceFis "Filtered by support"
188 $ filterClique True s (filterCliqueBySupport)
189 -- $ traceFis "Unfiltered Fis"
190 phyloClique
191 MaxClique _ -> undefined
192 where
193 --------------------------------------
194 phyloClique :: Map (Date,Date) [PhyloClique]
195 phyloClique = case (clique $ getConfig phylo) of
196 Fis _ _ -> let fis = map (\(prd,docs) ->
197 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
198 in (prd, map (\f -> PhyloClique (fst f) (snd f) prd) lst))
199 $ toList phyloDocs
200 fis' = fis `using` parList rdeepseq
201 in fromList fis'
202 MaxClique _ -> undefined
203 --------------------------------------
204
205 -- dev viz graph maxClique getMaxClique
206
207
208 --------------------
209 -- | Coocurency | --
210 --------------------
211
212
213 -- | To transform the docs into a time map of coocurency matrix
214 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
215 docsToTimeScaleCooc docs fdt =
216 let mCooc = fromListWith sumCooc
217 $ map (\(_d,l) -> (_d, listToMatrix l))
218 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
219 mCooc' = fromList
220 $ map (\t -> (t,empty))
221 $ toTimeScale (map date docs) 1
222 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
223 $ unionWith sumCooc mCooc mCooc'
224
225
226 -----------------------
227 -- | to Phylo Base | --
228 -----------------------
229
230 -- | To group a list of Documents by fixed periods
231 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
232 groupDocsByPeriod' f pds docs =
233 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
234 periods = map (inPeriode f docs') pds
235 periods' = periods `using` parList rdeepseq
236 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
237 $ fromList $ zip pds periods'
238 where
239 --------------------------------------
240 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
241 inPeriode f' h (start,end) =
242 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
243
244
245
246 -- | To group a list of Documents by fixed periods
247 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
248 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
249 groupDocsByPeriod f pds es =
250 let periods = map (inPeriode f es) pds
251 periods' = periods `using` parList rdeepseq
252
253 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
254 $ fromList $ zip pds periods'
255 where
256 --------------------------------------
257 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
258 inPeriode f' h (start,end) =
259 fst $ partition (\d -> f' d >= start && f' d <= end) h
260 --------------------------------------
261
262
263 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
264 docsToTermFreq docs fdt =
265 let nbDocs = fromIntegral $ length docs
266 freqs = map (/(nbDocs))
267 $ fromList
268 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
269 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
270 sumFreqs = sum $ elems freqs
271 in map (/sumFreqs) freqs
272
273
274 -- | To count the number of docs by unit of time
275 docsToTimeScaleNb :: [Document] -> Map Date Double
276 docsToTimeScaleNb docs =
277 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
278 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
279 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
280 $ unionWith (+) time docs'
281
282
283 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
284 initPhyloLevels lvlMax pId =
285 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
286
287
288 -- | To init the basic elements of a Phylo
289 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
290 toPhyloBase docs lst conf =
291 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
292 params = defaultPhyloParam { _phyloParam_config = conf }
293 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
294 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
295 $ Phylo foundations
296 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
297 (docsToTimeScaleNb docs)
298 (docsToTermFreq docs (foundations ^. foundations_roots))
299 empty
300 params
301 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)