]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
[FEAT] Distributional (WIP)
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
17
18 import Gargantext.Prelude
19 import Gargantext.Core.Viz.AdaptativePhylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Core.Text.Context (TermList)
24 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Methods.Distances (Distance(Conditional))
27
28 import Control.DeepSeq (NFData)
29 import Control.Parallel.Strategies (parList, rdeepseq, using)
30 import Debug.Trace (trace)
31 import Control.Lens hiding (Level)
32
33 import qualified Data.Vector as Vector
34 import qualified Data.Set as Set
35
36 ------------------
37 -- | To Phylo | --
38 ------------------
39
40 {-
41 -- TODO AD
42 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
43 | PhyloN { _phylo'_phylo1 :: Phylo}
44
45
46 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
47 toPhylo' (PhyloN phylo) = toPhylo'
48 toPhylo' (PhyloBase phylo) = toPhylo
49 -}
50
51
52 toPhylo :: [Document] -> TermList -> Config -> Phylo
53 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
54 $ traceToPhylo (phyloLevel conf) $
55 if (phyloLevel conf) > 1
56 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
57 else phylo1
58 where
59 --------------------------------------
60 phylo1 :: Phylo
61 phylo1 = toPhylo1 docs phyloBase
62 -- > AD to db here
63 --------------------------------------
64 phyloBase :: Phylo
65 phyloBase = toPhyloBase docs lst conf
66 -- > AD to db here
67 --------------------------------------
68
69
70
71 --------------------
72 -- | To Phylo 1 | --
73 --------------------
74
75 toGroupsProxi :: Level -> Phylo -> Phylo
76 toGroupsProxi lvl phylo =
77 let proximity = phyloProximity $ getConfig phylo
78 groupsProxi = foldlWithKey (\acc pId pds ->
79 -- 1) process period by period
80 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
81 $ elems
82 $ view ( phylo_periodLevels
83 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
84 . phylo_levelGroups ) pds
85 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
86 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
87 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
88 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
89 -- 2) compute the pairs in parallel
90 pairs = map (\(id,ngrams) ->
91 map (\(id',ngrams') ->
92 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
93 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
94 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
95 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
96 ) egos
97 pairs' = pairs `using` parList rdeepseq
98 in acc ++ (concat pairs')
99 ) [] $ phylo ^. phylo_periods
100 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
101
102
103 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
104 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
105 $ over ( phylo_periods
106 . traverse
107 . phylo_periodLevels
108 . traverse)
109 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
110 then
111 let pId = phyloLvl ^. phylo_levelPeriod
112 phyloCUnit = m ! pId
113 in phyloLvl
114 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
115 groups ++ [ (((pId,lvl),length groups)
116 , f obj pId lvl (length groups)
117 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
118 ] ) [] phyloCUnit)
119 else
120 phyloLvl )
121 phylo
122
123
124 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
125 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
126 (fis ^. phyloClique_support)
127 (fis ^. phyloClique_nodes)
128 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
129 (1,[0]) -- branchid (lvl,[path in the branching tree])
130 (fromList [("breaks",[0]),("seaLevels",[0])])
131 [] [] [] [] []
132
133
134 toPhylo1 :: [Document] -> Phylo -> Phylo
135 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
136 Constante start gap -> constanteTemporalMatching start gap
137 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
138 Adaptative steps -> adaptativeTemporalMatching steps
139 $ toGroupsProxi 1
140 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
141 where
142 --------------------------------------
143 phyloClique :: Map (Date,Date) [PhyloClique]
144 phyloClique = toPhyloClique phyloBase docs'
145 --------------------------------------
146 docs' :: Map (Date,Date) [Document]
147 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
148 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
149 --------------------------------------
150
151
152 ---------------------------
153 -- | Frequent Item Set | --
154 ---------------------------
155
156
157 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
158 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
159 filterClique keep thr f m = case keep of
160 False -> map (\l -> f thr l) m
161 True -> map (\l -> keepFilled (f) thr l) m
162
163
164 -- To filter Fis with small Support
165 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
166 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
167
168
169 -- To filter Fis with small Clique size
170 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
171 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
172
173
174 -- To filter nested Fis
175 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
176 filterCliqueByNested m =
177 let clq = map (\l ->
178 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
179 then mem
180 else
181 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
182 in fMax ++ [f] ) [] l)
183 $ elems m
184 clq' = clq `using` parList rdeepseq
185 in fromList $ zip (keys m) clq'
186
187
188 -- | To transform a time map of docs into a time map of Fis with some filters
189 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
190 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
191 Fis s s' -> -- traceFis "Filtered Fis"
192 filterCliqueByNested
193 {- \$ traceFis "Filtered by clique size" -}
194 $ filterClique True s' (filterCliqueBySize)
195 {- \$ traceFis "Filtered by support" -}
196 $ filterClique True s (filterCliqueBySupport)
197 {- \$ traceFis "Unfiltered Fis" -}
198 phyloClique
199 MaxClique s -> filterClique True s (filterCliqueBySize)
200 phyloClique
201 where
202 --------------------------------------
203 phyloClique :: Map (Date,Date) [PhyloClique]
204 phyloClique = case (clique $ getConfig phylo) of
205 Fis _ _ ->
206 let fis = map (\(prd,docs) ->
207 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
208 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
209 $ toList phyloDocs
210 fis' = fis `using` parList rdeepseq
211 in fromList fis'
212 MaxClique _ ->
213 let mcl = map (\(prd,docs) ->
214 let cooc = map round
215 $ foldl sumCooc empty
216 $ map listToMatrix
217 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
218 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques Conditional 0.001 cooc))
219 $ toList phyloDocs
220 mcl' = mcl `using` parList rdeepseq
221 in fromList mcl'
222 --------------------------------------
223
224 -- dev viz graph maxClique getMaxClique
225
226
227 --------------------
228 -- | Coocurency | --
229 --------------------
230
231
232 -- To transform the docs into a time map of coocurency matrix
233 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
234 docsToTimeScaleCooc docs fdt =
235 let mCooc = fromListWith sumCooc
236 $ map (\(_d,l) -> (_d, listToMatrix l))
237 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
238 mCooc' = fromList
239 $ map (\t -> (t,empty))
240 $ toTimeScale (map date docs) 1
241 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
242 $ unionWith sumCooc mCooc mCooc'
243
244
245 -----------------------
246 -- | to Phylo Base | --
247 -----------------------
248 -- TODO anoe
249 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
250 groupDocsByPeriodRec f prds docs acc =
251 if ((null prds) || (null docs))
252 then acc
253 else
254 let prd = head' "groupBy" prds
255 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
256 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
257
258
259 -- To group a list of Documents by fixed periods
260 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
261 groupDocsByPeriod' f pds docs =
262 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
263 periods = map (inPeriode f docs') pds
264 periods' = periods `using` parList rdeepseq
265 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
266 $ fromList $ zip pds periods'
267 where
268 --------------------------------------
269 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
270 inPeriode f' h (start,end) =
271 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
272
273
274
275 -- To group a list of Documents by fixed periods
276 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
277 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
278 groupDocsByPeriod f pds es =
279 let periods = map (inPeriode f es) pds
280 periods' = periods `using` parList rdeepseq
281
282 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
283 $ fromList $ zip pds periods'
284 where
285 --------------------------------------
286 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
287 inPeriode f' h (start,end) =
288 fst $ partition (\d -> f' d >= start && f' d <= end) h
289 --------------------------------------
290
291
292 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
293 docsToTermFreq docs fdt =
294 let nbDocs = fromIntegral $ length docs
295 freqs = map (/(nbDocs))
296 $ fromList
297 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
298 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
299 sumFreqs = sum $ elems freqs
300 in map (/sumFreqs) freqs
301
302 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
303 docsToLastTermFreq n docs fdt =
304 let last = take n $ reverse $ sort $ map date docs
305 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
306 freqs = map (/(nbDocs))
307 $ fromList
308 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
309 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
310 sumFreqs = sum $ elems freqs
311 in map (/sumFreqs) freqs
312
313
314 -- To count the number of docs by unit of time
315 docsToTimeScaleNb :: [Document] -> Map Date Double
316 docsToTimeScaleNb docs =
317 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
318 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
319 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
320 $ unionWith (+) time docs'
321
322
323 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
324 initPhyloLevels lvlMax pId =
325 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
326
327
328 -- To init the basic elements of a Phylo
329 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
330 toPhyloBase docs lst conf =
331 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
332 params = defaultPhyloParam { _phyloParam_config = conf }
333 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
334 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
335 $ Phylo foundations
336 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
337 (docsToTimeScaleNb docs)
338 (docsToTermFreq docs (foundations ^. foundations_roots))
339 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
340 empty
341 empty
342 params
343 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)