]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
working on a new distance in phylo
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11
12 module Gargantext.Core.Viz.Phylo.PhyloMaker where
13
14 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
15 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
16 import Data.Vector (Vector)
17
18 import Gargantext.Prelude
19 import Gargantext.Core.Viz.AdaptativePhylo
20 import Gargantext.Core.Viz.Phylo.PhyloTools
21 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
22 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
23 import Gargantext.Core.Text.Context (TermList)
24 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
25 import Gargantext.Core.Viz.Graph.MaxClique (getMaxCliques)
26 import Gargantext.Core.Viz.Graph.Distances (Distance(Conditional))
27 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
28
29 import Control.DeepSeq (NFData)
30 import Control.Parallel.Strategies (parList, rdeepseq, using)
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.Set as Set
36
37 ------------------
38 -- | To Phylo | --
39 ------------------
40
41 {-
42 -- TODO AD
43 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
44 | PhyloN { _phylo'_phylo1 :: Phylo}
45
46
47 toPhylo' :: Phylo' -> [Document] -> TermList -> Config -> Phylo
48 toPhylo' (PhyloN phylo) = toPhylo'
49 toPhylo' (PhyloBase phylo) = toPhylo
50 -}
51
52
53 toPhylo :: [Document] -> TermList -> Config -> Phylo
54 toPhylo docs lst conf = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
55 $ traceToPhylo (phyloLevel conf) $
56 if (phyloLevel conf) > 1
57 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel conf)]
58 else phylo1
59 where
60 --------------------------------------
61 phyloAncestors :: Phylo
62 phyloAncestors =
63 if (findAncestors conf)
64 then toHorizon phylo1
65 else phylo1
66 --------------------------------------
67 phylo1 :: Phylo
68 phylo1 = toPhylo1 docs phyloBase
69 -- > AD to db here
70 --------------------------------------
71 phyloBase :: Phylo
72 phyloBase = toPhyloBase docs lst conf
73 -- > AD to db here
74 --------------------------------------
75
76
77
78 --------------------
79 -- | To Phylo 1 | --
80 --------------------
81
82 toGroupsProxi :: Level -> Phylo -> Phylo
83 toGroupsProxi lvl phylo =
84 let proximity = phyloProximity $ getConfig phylo
85 groupsProxi = foldlWithKey (\acc pId pds ->
86 -- 1) process period by period
87 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
88 $ elems
89 $ view ( phylo_periodLevels
90 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
91 . phylo_levelGroups ) pds
92 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
93 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
94 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
95 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
96 -- 2) compute the pairs in parallel
97 pairs = map (\(id,ngrams) ->
98 map (\(id',ngrams') ->
99 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
100 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
101 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
102 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
103 ) egos
104 pairs' = pairs `using` parList rdeepseq
105 in acc ++ (concat pairs')
106 ) [] $ phylo ^. phylo_periods
107 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
108
109
110 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
111 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
112 $ over ( phylo_periods
113 . traverse
114 . phylo_periodLevels
115 . traverse)
116 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
117 then
118 let pId = phyloLvl ^. phylo_levelPeriod
119 phyloCUnit = m ! pId
120 in phyloLvl
121 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
122 groups ++ [ (((pId,lvl),length groups)
123 , f obj pId lvl (length groups)
124 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
125 ] ) [] phyloCUnit)
126 else
127 phyloLvl )
128 phylo
129
130
131 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> Level -> Int -> [Cooc] -> PhyloGroup
132 cliqueToGroup fis pId lvl idx coocs = PhyloGroup pId lvl idx ""
133 (fis ^. phyloClique_support)
134 (fis ^. phyloClique_nodes)
135 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
136 (1,[0]) -- branchid (lvl,[path in the branching tree])
137 (fromList [("breaks",[0]),("seaLevels",[0])])
138 [] [] [] [] []
139
140
141 toPhylo1 :: [Document] -> Phylo -> Phylo
142 toPhylo1 docs phyloBase = case (getSeaElevation phyloBase) of
143 Constante start gap -> constanteTemporalMatching start gap
144 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
145 Adaptative steps -> adaptativeTemporalMatching steps
146 $ toGroupsProxi 1
147 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
148 where
149 --------------------------------------
150 phyloClique :: Map (Date,Date) [PhyloClique]
151 phyloClique = toPhyloClique phyloBase docs'
152 --------------------------------------
153 docs' :: Map (Date,Date) [Document]
154 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
155 -- docs' = groupDocsByPeriod' date (getPeriodIds phyloBase) docs
156 --------------------------------------
157
158
159 ---------------------------
160 -- | Frequent Item Set | --
161 ---------------------------
162
163
164 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
165 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
166 filterClique keep thr f m = case keep of
167 False -> map (\l -> f thr l) m
168 True -> map (\l -> keepFilled (f) thr l) m
169
170
171 -- To filter Fis with small Support
172 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
173 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
174
175
176 -- To filter Fis with small Clique size
177 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
178 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
179
180
181 -- To filter nested Fis
182 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
183 filterCliqueByNested m =
184 let clq = map (\l ->
185 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
186 then mem
187 else
188 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
189 in fMax ++ [f] ) [] l)
190 $ elems m
191 clq' = clq `using` parList rdeepseq
192 in fromList $ zip (keys m) clq'
193
194
195 -- | To transform a time map of docs into a time map of Fis with some filters
196 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
197 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
198 Fis s s' -> -- traceFis "Filtered Fis"
199 filterCliqueByNested
200 {- \$ traceFis "Filtered by clique size" -}
201 $ filterClique True s' (filterCliqueBySize)
202 {- \$ traceFis "Filtered by support" -}
203 $ filterClique True s (filterCliqueBySupport)
204 {- \$ traceFis "Unfiltered Fis" -}
205 phyloClique
206 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
207 phyloClique
208 where
209 --------------------------------------
210 phyloClique :: Map (Date,Date) [PhyloClique]
211 phyloClique = case (clique $ getConfig phylo) of
212 Fis _ _ ->
213 let fis = map (\(prd,docs) ->
214 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
215 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd) lst))
216 $ toList phyloDocs
217 fis' = fis `using` parList rdeepseq
218 in fromList fis'
219 MaxClique _ thr filterType ->
220 let mcl = map (\(prd,docs) ->
221 let cooc = map round
222 $ foldl sumCooc empty
223 $ map listToMatrix
224 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
225 in (prd, map (\cl -> PhyloClique cl 0 prd) $ getMaxCliques filterType Conditional thr cooc))
226 $ toList phyloDocs
227 mcl' = mcl `using` parList rdeepseq
228 in fromList mcl'
229 --------------------------------------
230
231 -- dev viz graph maxClique getMaxClique
232
233
234 --------------------
235 -- | Coocurency | --
236 --------------------
237
238
239 -- To transform the docs into a time map of coocurency matrix
240 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
241 docsToTimeScaleCooc docs fdt =
242 let mCooc = fromListWith sumCooc
243 $ map (\(_d,l) -> (_d, listToMatrix l))
244 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
245 mCooc' = fromList
246 $ map (\t -> (t,empty))
247 $ toTimeScale (map date docs) 1
248 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
249 $ unionWith sumCooc mCooc mCooc'
250
251
252 -----------------------
253 -- | to Phylo Base | --
254 -----------------------
255 -- TODO anoe
256 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
257 groupDocsByPeriodRec f prds docs acc =
258 if ((null prds) || (null docs))
259 then acc
260 else
261 let prd = head' "groupBy" prds
262 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
263 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
264
265
266 -- To group a list of Documents by fixed periods
267 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
268 groupDocsByPeriod' f pds docs =
269 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
270 periods = map (inPeriode f docs') pds
271 periods' = periods `using` parList rdeepseq
272 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
273 $ fromList $ zip pds periods'
274 where
275 --------------------------------------
276 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
277 inPeriode f' h (start,end) =
278 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
279
280
281
282 -- To group a list of Documents by fixed periods
283 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
284 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
285 groupDocsByPeriod f pds es =
286 let periods = map (inPeriode f es) pds
287 periods' = periods `using` parList rdeepseq
288
289 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
290 $ fromList $ zip pds periods'
291 where
292 --------------------------------------
293 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
294 inPeriode f' h (start,end) =
295 fst $ partition (\d -> f' d >= start && f' d <= end) h
296 --------------------------------------
297
298
299 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
300 docsToTermFreq docs fdt =
301 let nbDocs = fromIntegral $ length docs
302 freqs = map (/(nbDocs))
303 $ fromList
304 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
305 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
306 sumFreqs = sum $ elems freqs
307 in map (/sumFreqs) freqs
308
309 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
310 docsToLastTermFreq n docs fdt =
311 let last = take n $ reverse $ sort $ map date docs
312 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
313 freqs = map (/(nbDocs))
314 $ fromList
315 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
316 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
317 sumFreqs = sum $ elems freqs
318 in map (/sumFreqs) freqs
319
320
321 -- To count the number of docs by unit of time
322 docsToTimeScaleNb :: [Document] -> Map Date Double
323 docsToTimeScaleNb docs =
324 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
325 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
326 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
327 $ unionWith (+) time docs'
328
329
330 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
331 initPhyloLevels lvlMax pId =
332 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
333
334
335 -- To init the basic elements of a Phylo
336 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
337 toPhyloBase docs lst conf =
338 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
339 params = defaultPhyloParam { _phyloParam_config = conf }
340 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
341 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
342 $ Phylo foundations
343 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
344 (docsToTimeScaleNb docs)
345 (docsToTermFreq docs (foundations ^. foundations_roots))
346 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
347 empty
348 empty
349 params
350 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)