]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
add the maxClique (in progress)
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.PhyloMaker where
17
18 import Data.List (concat, nub, partition, sort, (++), group)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys)
20 import Data.Set (size)
21 import Data.Vector (Vector)
22
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
27 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
28 import Gargantext.Text.Context (TermList)
29 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
30
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
35
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
38
39
40 ------------------
41 -- | To Phylo | --
42 ------------------
43
44
45 toPhylo :: [Document] -> TermList -> Config -> Phylo
46 toPhylo docs lst conf = traceToPhylo (phyloLevel conf) $
47 if (phyloLevel conf) > 1
48 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
49 else phylo1
50 where
51 --------------------------------------
52 phylo1 :: Phylo
53 phylo1 = toPhylo1 docs phyloBase
54 --------------------------------------
55 phyloBase :: Phylo
56 phyloBase = toPhyloBase docs lst conf
57 --------------------------------------
58
59
60
61 --------------------
62 -- | To Phylo 1 | --
63 --------------------
64
65
66 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
67 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
68 $ over ( phylo_periods
69 . traverse
70 . phylo_periodLevels
71 . traverse)
72 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
73 then
74 let pId = phyloLvl ^. phylo_levelPeriod
75 phyloCUnit = m ! pId
76 in phyloLvl
77 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
78 groups ++ [ (((pId,lvl),length groups)
79 , f obj pId lvl (length groups) (getRoots phylo)
80 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
81 ] ) [] phyloCUnit)
82 else
83 phyloLvl )
84 phylo
85
86
87 fisToGroup :: PhyloCUnit -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
88 fisToGroup fis pId lvl idx fdt coocs =
89 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloCUnit_nodes) fdt
90 in PhyloGroup pId lvl idx ""
91 (fis ^. phyloCUnit_support)
92 ngrams
93 (ngramsToCooc ngrams coocs)
94 (1,[0])
95 empty
96 [] [] [] []
97
98
99 toPhylo1 :: [Document] -> Phylo -> Phylo
100 toPhylo1 docs phyloBase = temporalMatching
101 $ appendGroups fisToGroup 1 phyloCUnit phyloBase
102 where
103 --------------------------------------
104 phyloCUnit :: Map (Date,Date) [PhyloCUnit]
105 phyloCUnit = case (contextualUnit $ getConfig phyloBase) of
106 Fis s s' -> toPhyloFis docs' s s'
107 MaxClique _ -> undefined
108 --------------------------------------
109 docs' :: Map (Date,Date) [Document]
110 docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
111 --------------------------------------
112
113
114 ---------------------------
115 -- | Frequent Item Set | --
116 ---------------------------
117
118
119 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
120 filterFis :: Bool -> Int -> (Int -> [PhyloCUnit] -> [PhyloCUnit]) -> Map (Date, Date) [PhyloCUnit] -> Map (Date, Date) [PhyloCUnit]
121 filterFis keep thr f m = case keep of
122 False -> map (\l -> f thr l) m
123 True -> map (\l -> keepFilled (f) thr l) m
124
125
126 -- | To filter Fis with small Support
127 filterFisBySupport :: Int -> [PhyloCUnit] -> [PhyloCUnit]
128 filterFisBySupport thr l = filter (\fis -> (fis ^. phyloCUnit_support) >= thr) l
129
130
131 -- | To filter Fis with small Clique size
132 filterFisByClique :: Int -> [PhyloCUnit] -> [PhyloCUnit]
133 filterFisByClique thr l = filter (\fis -> (size $ fis ^. phyloCUnit_nodes) >= thr) l
134
135
136 -- | To filter nested Fis
137 filterFisByNested :: Map (Date, Date) [PhyloCUnit] -> Map (Date, Date) [PhyloCUnit]
138 filterFisByNested m =
139 let fis = map (\l ->
140 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloCUnit_nodes) (Set.toList $ f ^. phyloCUnit_nodes)) mem)
141 then mem
142 else
143 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloCUnit_nodes) (Set.toList $ f' ^. phyloCUnit_nodes)) mem
144 in fMax ++ [f] ) [] l)
145 $ elems m
146 fis' = fis `using` parList rdeepseq
147 in fromList $ zip (keys m) fis'
148
149
150 -- | To transform a time map of docs innto a time map of Fis with some filters
151 toPhyloFis :: Map (Date, Date) [Document] -> Int -> Int -> Map (Date,Date) [PhyloCUnit]
152 toPhyloFis phyloDocs support clique = traceFis "Filtered Fis"
153 $ filterFisByNested
154 $ traceFis "Filtered by clique size"
155 $ filterFis True clique (filterFisByClique)
156 $ traceFis "Filtered by support"
157 $ filterFis True support (filterFisBySupport)
158 $ traceFis "Unfiltered Fis" phyloFis
159 where
160 --------------------------------------
161 phyloFis :: Map (Date,Date) [PhyloCUnit]
162 phyloFis =
163 let fis = map (\(prd,docs) -> let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
164 in (prd, map (\f -> PhyloCUnit (fst f) (snd f) prd) lst))
165 $ toList phyloDocs
166 fis' = fis `using` parList rdeepseq
167 in fromList fis'
168 --------------------------------------
169
170
171 --------------------
172 -- | Coocurency | --
173 --------------------
174
175
176 -- | To transform the docs into a time map of coocurency matrix
177 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
178 docsToTimeScaleCooc docs fdt =
179 let mCooc = fromListWith sumCooc
180 $ map (\(_d,l) -> (_d, listToMatrix l))
181 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
182 mCooc' = fromList
183 $ map (\t -> (t,empty))
184 $ toTimeScale (map date docs) 1
185 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
186 $ unionWith sumCooc mCooc mCooc'
187
188
189 -----------------------
190 -- | to Phylo Base | --
191 -----------------------
192
193
194 -- | To group a list of Documents by fixed periods
195 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
196 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
197 groupDocsByPeriod f pds es =
198 let periods = map (inPeriode f es) pds
199 periods' = periods `using` parList rdeepseq
200
201 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
202 $ fromList $ zip pds periods'
203 where
204 --------------------------------------
205 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
206 inPeriode f' h (start,end) =
207 fst $ partition (\d -> f' d >= start && f' d <= end) h
208 --------------------------------------
209
210
211 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
212 docsToTermFreq docs fdt =
213 let nbDocs = fromIntegral $ length docs
214 freqs = map (/(log nbDocs))
215 $ fromList
216 $ map (\lst -> (head' "docsToTermFreq" lst, log $ fromIntegral $ length lst))
217 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
218 sumFreqs = sum $ elems freqs
219 in map (/sumFreqs) freqs
220
221
222 -- | To count the number of docs by unit of time
223 docsToTimeScaleNb :: [Document] -> Map Date Double
224 docsToTimeScaleNb docs =
225 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
226 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
227 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
228 $ unionWith (+) time docs'
229
230
231 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
232 initPhyloLevels lvlMax pId =
233 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
234
235
236 -- | To init the basic elements of a Phylo
237 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
238 toPhyloBase docs lst conf =
239 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
240 params = defaultPhyloParam { _phyloParam_config = conf }
241 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
242 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
243 $ Phylo foundations
244 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
245 (docsToTimeScaleNb docs)
246 (docsToTermFreq docs (foundations ^. foundations_roots))
247 params
248 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)