]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
add adjustable clustering by threshold
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.PhyloMaker where
17
18 import Data.List (concat, nub, partition, sort, (++), group)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, singleton)
20 import Data.Set (size)
21 import Data.Vector (Vector)
22
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
27 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
28 import Gargantext.Text.Context (TermList)
29 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
30
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
35
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
38
39
40 ------------------
41 -- | To Phylo | --
42 ------------------
43
44
45 toPhylo :: [Document] -> TermList -> Config -> Phylo
46 toPhylo docs lst conf = traceToPhylo (phyloLevel conf) $
47 if (phyloLevel conf) > 1
48 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
49 else phylo1
50 where
51 --------------------------------------
52 phylo1 :: Phylo
53 phylo1 = toPhylo1 docs phyloBase
54 --------------------------------------
55 phyloBase :: Phylo
56 phyloBase = toPhyloBase docs lst conf
57 --------------------------------------
58
59
60
61 --------------------
62 -- | To Phylo 1 | --
63 --------------------
64
65
66 appendGroups :: (a -> Double -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
67 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
68 $ over ( phylo_periods
69 . traverse
70 . phylo_periodLevels
71 . traverse)
72 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
73 then
74 let pId = phyloLvl ^. phylo_levelPeriod
75 phyloCUnit = m ! pId
76 in phyloLvl
77 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
78 groups ++ [ (((pId,lvl),length groups)
79 , f obj(getPhyloThresholdInit phylo) pId lvl (length groups) (getRoots phylo)
80 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
81 ] ) [] phyloCUnit)
82 else
83 phyloLvl )
84 phylo
85
86
87 cliqueToGroup :: PhyloClique -> Double -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
88 cliqueToGroup fis thr pId lvl idx fdt coocs =
89 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloClique_nodes) fdt
90 in PhyloGroup pId lvl idx ""
91 (fis ^. phyloClique_support)
92 ngrams
93 (ngramsToCooc ngrams coocs)
94 (1,[0])
95 (singleton "thr" [thr])
96 [] [] [] []
97
98
99 toPhylo1 :: [Document] -> Phylo -> Phylo
100 toPhylo1 docs phyloBase = temporalMatching
101 $ appendGroups cliqueToGroup 1 phyloClique phyloBase
102 where
103 --------------------------------------
104 phyloClique :: Map (Date,Date) [PhyloClique]
105 phyloClique = toPhyloClique phyloBase docs'
106 --------------------------------------
107 docs' :: Map (Date,Date) [Document]
108 docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
109 --------------------------------------
110
111
112 ---------------------------
113 -- | Frequent Item Set | --
114 ---------------------------
115
116
117 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
118 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
119 filterClique keep thr f m = case keep of
120 False -> map (\l -> f thr l) m
121 True -> map (\l -> keepFilled (f) thr l) m
122
123
124 -- | To filter Fis with small Support
125 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
126 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
127
128
129 -- | To filter Fis with small Clique size
130 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
131 filterCliqueBySize thr l = filter (\clq -> (size $ clq ^. phyloClique_nodes) >= thr) l
132
133
134 -- | To filter nested Fis
135 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
136 filterCliqueByNested m =
137 let clq = map (\l ->
138 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloClique_nodes) (Set.toList $ f ^. phyloClique_nodes)) mem)
139 then mem
140 else
141 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloClique_nodes) (Set.toList $ f' ^. phyloClique_nodes)) mem
142 in fMax ++ [f] ) [] l)
143 $ elems m
144 clq' = clq `using` parList rdeepseq
145 in fromList $ zip (keys m) clq'
146
147
148 -- | To transform a time map of docs innto a time map of Fis with some filters
149 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
150 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
151 Fis s s' -> -- traceFis "Filtered Fis"
152 filterCliqueByNested
153 -- $ traceFis "Filtered by clique size"
154 $ filterClique True s' (filterCliqueBySize)
155 -- $ traceFis "Filtered by support"
156 $ filterClique True s (filterCliqueBySupport)
157 -- $ traceFis "Unfiltered Fis"
158 phyloClique
159 MaxClique _ -> undefined
160 where
161 --------------------------------------
162 phyloClique :: Map (Date,Date) [PhyloClique]
163 phyloClique = case (clique $ getConfig phylo) of
164 Fis _ _ -> let fis = map (\(prd,docs) ->
165 let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
166 in (prd, map (\f -> PhyloClique (fst f) (snd f) prd) lst))
167 $ toList phyloDocs
168 fis' = fis `using` parList rdeepseq
169 in fromList fis'
170 MaxClique _ -> undefined
171 --------------------------------------
172
173
174 --------------------
175 -- | Coocurency | --
176 --------------------
177
178
179 -- | To transform the docs into a time map of coocurency matrix
180 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
181 docsToTimeScaleCooc docs fdt =
182 let mCooc = fromListWith sumCooc
183 $ map (\(_d,l) -> (_d, listToMatrix l))
184 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
185 mCooc' = fromList
186 $ map (\t -> (t,empty))
187 $ toTimeScale (map date docs) 1
188 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
189 $ unionWith sumCooc mCooc mCooc'
190
191
192 -----------------------
193 -- | to Phylo Base | --
194 -----------------------
195
196
197 -- | To group a list of Documents by fixed periods
198 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
199 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
200 groupDocsByPeriod f pds es =
201 let periods = map (inPeriode f es) pds
202 periods' = periods `using` parList rdeepseq
203
204 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
205 $ fromList $ zip pds periods'
206 where
207 --------------------------------------
208 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
209 inPeriode f' h (start,end) =
210 fst $ partition (\d -> f' d >= start && f' d <= end) h
211 --------------------------------------
212
213
214 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
215 docsToTermFreq docs fdt =
216 let nbDocs = fromIntegral $ length docs
217 freqs = map (/(nbDocs))
218 $ fromList
219 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
220 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
221 sumFreqs = sum $ elems freqs
222 in map (/sumFreqs) freqs
223
224
225 -- | To count the number of docs by unit of time
226 docsToTimeScaleNb :: [Document] -> Map Date Double
227 docsToTimeScaleNb docs =
228 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
229 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
230 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
231 $ unionWith (+) time docs'
232
233
234 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
235 initPhyloLevels lvlMax pId =
236 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
237
238
239 -- | To init the basic elements of a Phylo
240 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
241 toPhyloBase docs lst conf =
242 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
243 params = defaultPhyloParam { _phyloParam_config = conf }
244 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
245 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
246 $ Phylo foundations
247 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
248 (docsToTimeScaleNb docs)
249 (docsToTermFreq docs (foundations ^. foundations_roots))
250 params
251 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)