]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
fix the freq
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.PhyloMaker where
17
18 import Data.List (concat, nub, partition, sort, (++), group)
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys)
20 import Data.Set (size)
21 import Data.Vector (Vector)
22
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Viz.Phylo.TemporalMatching (temporalMatching)
27 import Gargantext.Viz.Phylo.SynchronicClustering (synchronicClustering)
28 import Gargantext.Text.Context (TermList)
29 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
30
31 import Control.DeepSeq (NFData)
32 import Control.Parallel.Strategies (parList, rdeepseq, using)
33 import Debug.Trace (trace)
34 import Control.Lens hiding (Level)
35
36 import qualified Data.Vector as Vector
37 import qualified Data.Set as Set
38
39
40 ------------------
41 -- | To Phylo | --
42 ------------------
43
44
45 toPhylo :: [Document] -> TermList -> Config -> Phylo
46 toPhylo docs lst conf = traceToPhylo (phyloLevel conf) $
47 if (phyloLevel conf) > 1
48 then foldl' (\phylo' _ -> synchronicClustering phylo') phylo1 [2..(phyloLevel conf)]
49 else phylo1
50 where
51 --------------------------------------
52 phylo1 :: Phylo
53 phylo1 = toPhylo1 docs phyloBase
54 --------------------------------------
55 phyloBase :: Phylo
56 phyloBase = toPhyloBase docs lst conf
57 --------------------------------------
58
59
60
61 --------------------
62 -- | To Phylo 1 | --
63 --------------------
64
65
66 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
67 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
68 $ over ( phylo_periods
69 . traverse
70 . phylo_periodLevels
71 . traverse)
72 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
73 then
74 let pId = phyloLvl ^. phylo_levelPeriod
75 phyloFis = m ! pId
76 in phyloLvl
77 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
78 groups ++ [ (((pId,lvl),length groups)
79 , f obj pId lvl (length groups) (getRoots phylo)
80 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
81 ] ) [] phyloFis)
82 else
83 phyloLvl )
84 phylo
85
86
87 fisToGroup :: PhyloFis -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
88 fisToGroup fis pId lvl idx fdt coocs =
89 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloFis_clique) fdt
90 in PhyloGroup pId lvl idx ""
91 (fis ^. phyloFis_support)
92 ngrams
93 (ngramsToCooc ngrams coocs)
94 (1,[0])
95 empty
96 [] [] [] []
97
98
99 toPhylo1 :: [Document] -> Phylo -> Phylo
100 toPhylo1 docs phyloBase = temporalMatching
101 $ appendGroups fisToGroup 1 phyloFis phyloBase
102 where
103 --------------------------------------
104 phyloFis :: Map (Date,Date) [PhyloFis]
105 phyloFis = toPhyloFis docs' (getFisSupport $ contextualUnit $ getConfig phyloBase) (getFisSize $ contextualUnit $ getConfig phyloBase)
106 --------------------------------------
107 docs' :: Map (Date,Date) [Document]
108 docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
109 --------------------------------------
110
111
112 ---------------------------
113 -- | Frequent Item Set | --
114 ---------------------------
115
116
117 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
118 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
119 filterFis keep thr f m = case keep of
120 False -> map (\l -> f thr l) m
121 True -> map (\l -> keepFilled (f) thr l) m
122
123
124 -- | To filter Fis with small Support
125 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
126 filterFisBySupport thr l = filter (\fis -> (fis ^. phyloFis_support) >= thr) l
127
128
129 -- | To filter Fis with small Clique size
130 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
131 filterFisByClique thr l = filter (\fis -> (size $ fis ^. phyloFis_clique) >= thr) l
132
133
134 -- | To filter nested Fis
135 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
136 filterFisByNested m =
137 let fis = map (\l ->
138 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloFis_clique) (Set.toList $ f ^. phyloFis_clique)) mem)
139 then mem
140 else
141 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloFis_clique) (Set.toList $ f' ^. phyloFis_clique)) mem
142 in fMax ++ [f] ) [] l)
143 $ elems m
144 fis' = fis `using` parList rdeepseq
145 in fromList $ zip (keys m) fis'
146
147
148 -- | To transform a time map of docs innto a time map of Fis with some filters
149 toPhyloFis :: Map (Date, Date) [Document] -> Int -> Int -> Map (Date,Date) [PhyloFis]
150 toPhyloFis phyloDocs support clique = traceFis "Filtered Fis"
151 $ filterFisByNested
152 $ traceFis "Filtered by clique size"
153 $ filterFis True clique (filterFisByClique)
154 $ traceFis "Filtered by support"
155 $ filterFis True support (filterFisBySupport)
156 $ traceFis "Unfiltered Fis" phyloFis
157 where
158 --------------------------------------
159 phyloFis :: Map (Date,Date) [PhyloFis]
160 phyloFis =
161 let fis = map (\(prd,docs) -> let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
162 in (prd, map (\f -> PhyloFis (fst f) (snd f) prd) lst))
163 $ toList phyloDocs
164 fis' = fis `using` parList rdeepseq
165 in fromList fis'
166 --------------------------------------
167
168
169 --------------------
170 -- | Coocurency | --
171 --------------------
172
173
174 -- | To transform the docs into a time map of coocurency matrix
175 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
176 docsToTimeScaleCooc docs fdt =
177 let mCooc = fromListWith sumCooc
178 $ map (\(_d,l) -> (_d, listToMatrix l))
179 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
180 mCooc' = fromList
181 $ map (\t -> (t,empty))
182 $ toTimeScale (map date docs) 1
183 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
184 $ unionWith sumCooc mCooc mCooc'
185
186
187 -----------------------
188 -- | to Phylo Base | --
189 -----------------------
190
191
192 -- | To group a list of Documents by fixed periods
193 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
194 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
195 groupDocsByPeriod f pds es =
196 let periods = map (inPeriode f es) pds
197 periods' = periods `using` parList rdeepseq
198
199 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
200 $ fromList $ zip pds periods'
201 where
202 --------------------------------------
203 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
204 inPeriode f' h (start,end) =
205 fst $ partition (\d -> f' d >= start && f' d <= end) h
206 --------------------------------------
207
208
209 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
210 docsToTermFreq docs fdt =
211 let nbDocs = fromIntegral $ length docs
212 freqs = map (/nbDocs)
213 $ fromList
214 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
215 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
216 sumFreqs = sum $ elems freqs
217 in map (/sumFreqs) freqs
218
219
220 -- | To count the number of docs by unit of time
221 docsToTimeScaleNb :: [Document] -> Map Date Double
222 docsToTimeScaleNb docs =
223 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
224 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
225 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
226 $ unionWith (+) time docs'
227
228
229 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
230 initPhyloLevels lvlMax pId =
231 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
232
233
234 -- | To init the basic elements of a Phylo
235 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
236 toPhyloBase docs lst conf =
237 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
238 params = defaultPhyloParam { _phyloParam_config = conf }
239 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
240 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
241 $ Phylo foundations
242 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
243 (docsToTimeScaleNb docs)
244 (docsToTermFreq docs (foundations ^. foundations_roots))
245 params
246 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels 1 prd))) periods)