]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloMaker.hs
[ELEVE] Specifications to test to fix concurrent multi-terms.
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE NoImplicitPrelude #-}
12 {-# LANGUAGE FlexibleContexts #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE MultiParamTypeClasses #-}
15
16 module Gargantext.Viz.Phylo.PhyloMaker where
17
18 import Data.List (concat, nub, partition, sort, (++))
19 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), filterWithKey, restrictKeys)
20 import Data.Set (size)
21 import Data.Vector (Vector)
22
23 import Gargantext.Prelude
24 import Gargantext.Viz.AdaptativePhylo
25 import Gargantext.Viz.Phylo.PhyloTools
26 import Gargantext.Text.Context (TermList)
27 import Gargantext.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, Size(..))
28
29 import Control.DeepSeq (NFData)
30 import Control.Parallel.Strategies (parList, rdeepseq, using)
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.Set as Set
36
37
38 ------------------
39 -- | To Phylo | --
40 ------------------
41
42
43 toPhylo :: [Document] -> TermList -> Config -> Phylo
44 toPhylo docs lst conf = phylo1
45 where
46 --------------------------------------
47 phylo1 :: Phylo
48 phylo1 = toPhylo1 docs phyloBase
49 --------------------------------------
50 phyloBase :: Phylo
51 phyloBase = toPhyloBase docs lst conf
52 --------------------------------------
53
54
55
56 --------------------
57 -- | To Phylo 1 | --
58 --------------------
59
60
61 appendGroups :: (a -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
62 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
63 $ over ( phylo_periods
64 . traverse
65 . phylo_periodLevels
66 . traverse)
67 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
68 then
69 let pId = phyloLvl ^. phylo_levelPeriod
70 phyloFis = m ! pId
71 in phyloLvl
72 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
73 groups ++ [ (((pId,lvl),length groups)
74 , f obj pId lvl (length groups) (getRoots phylo)
75 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
76 ] ) [] phyloFis)
77 else
78 phyloLvl )
79 phylo
80
81
82 fisToGroup :: PhyloFis -> PhyloPeriodId -> Level -> Int -> Vector Ngrams -> [Cooc] -> PhyloGroup
83 fisToGroup fis pId lvl idx fdt coocs =
84 let ngrams = ngramsToIdx (Set.toList $ fis ^. phyloFis_clique) fdt
85 in PhyloGroup pId lvl idx
86 (fis ^. phyloFis_support)
87 ngrams
88 (ngramsToCooc ngrams coocs)
89 (1,[])
90 [] [] [] [] []
91
92
93 toPhylo1 :: [Document] -> Phylo -> Phylo
94 toPhylo1 docs phyloBase = appendGroups fisToGroup 1 phyloFis phyloBase
95 where
96 --------------------------------------
97 phyloFis :: Map (Date,Date) [PhyloFis]
98 phyloFis = toPhyloFis docs' (getFisSupport $ contextualUnit $ getConfig phyloBase) (getFisSize $ contextualUnit $ getConfig phyloBase)
99 --------------------------------------
100 docs' :: Map (Date,Date) [Document]
101 docs' = groupDocsByPeriod date (getPeriodIds phyloBase) docs
102 --------------------------------------
103
104
105 ---------------------------
106 -- | Frequent Item Set | --
107 ---------------------------
108
109
110 -- | To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
111 filterFis :: Bool -> Int -> (Int -> [PhyloFis] -> [PhyloFis]) -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
112 filterFis keep thr f m = case keep of
113 False -> map (\l -> f thr l) m
114 True -> map (\l -> keepFilled (f) thr l) m
115
116
117 -- | To filter Fis with small Support
118 filterFisBySupport :: Int -> [PhyloFis] -> [PhyloFis]
119 filterFisBySupport thr l = filter (\fis -> (fis ^. phyloFis_support) >= thr) l
120
121
122 -- | To filter Fis with small Clique size
123 filterFisByClique :: Int -> [PhyloFis] -> [PhyloFis]
124 filterFisByClique thr l = filter (\fis -> (size $ fis ^. phyloFis_clique) >= thr) l
125
126
127 -- | To filter nested Fis
128 filterFisByNested :: Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
129 filterFisByNested m =
130 let fis = map (\l ->
131 foldl (\mem f -> if (any (\f' -> isNested (Set.toList $ f' ^. phyloFis_clique) (Set.toList $ f ^. phyloFis_clique)) mem)
132 then mem
133 else
134 let fMax = filter (\f' -> not $ isNested (Set.toList $ f ^. phyloFis_clique) (Set.toList $ f' ^. phyloFis_clique)) mem
135 in fMax ++ [f] ) [] l)
136 $ elems m
137 fis' = fis `using` parList rdeepseq
138 in fromList $ zip (keys m) fis'
139
140
141 -- | To transform a time map of docs innto a time map of Fis with some filters
142 toPhyloFis :: Map (Date, Date) [Document] -> Int -> Int -> Map (Date,Date) [PhyloFis]
143 toPhyloFis phyloDocs support clique = traceFis "Filtered Fis"
144 $ filterFisByNested
145 $ traceFis "Filtered by clique size"
146 $ filterFis True clique (filterFisByClique)
147 $ traceFis "Filtered by support"
148 $ filterFis True support (filterFisBySupport)
149 $ traceFis "Unfiltered Fis" phyloFis
150 where
151 --------------------------------------
152 phyloFis :: Map (Date,Date) [PhyloFis]
153 phyloFis =
154 let fis = map (\(prd,docs) -> let lst = toList $ fisWithSizePolyMap (Segment 1 20) 1 (map text docs)
155 in (prd, map (\f -> PhyloFis (fst f) (snd f) prd) lst))
156 $ toList phyloDocs
157 fis' = fis `using` parList rdeepseq
158 in fromList fis'
159 --------------------------------------
160
161
162 --------------------
163 -- | Coocurency | --
164 --------------------
165
166
167 -- | To build the local cooc matrix of each phylogroup
168 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
169 ngramsToCooc ngrams coocs =
170 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
171 pairs = listToKeys ngrams
172 in filterWithKey (\k _ -> elem k pairs) cooc
173
174
175 -- | To transform the docs into a time map of coocurency matrix
176 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
177 docsToTimeScaleCooc docs fdt =
178 let mCooc = fromListWith sumCooc
179 $ map (\(_d,l) -> (_d, listToMatrix l))
180 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
181 mCooc' = fromList
182 $ map (\t -> (t,empty))
183 $ toTimeScale (map date docs) 1
184 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
185 $ unionWith sumCooc mCooc mCooc'
186
187
188 -----------------------
189 -- | to Phylo Base | --
190 -----------------------
191
192
193 -- | To group a list of Documents by fixed periods
194 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
195 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
196 groupDocsByPeriod f pds es =
197 let periods = map (inPeriode f es) pds
198 periods' = periods `using` parList rdeepseq
199
200 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
201 $ fromList $ zip pds periods'
202 where
203 --------------------------------------
204 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
205 inPeriode f' h (start,end) =
206 fst $ partition (\d -> f' d >= start && f' d <= end) h
207 --------------------------------------
208
209
210 -- | To count the number of docs by unit of time
211 docsToTimeScaleNb :: [Document] -> Map Date Double
212 docsToTimeScaleNb docs =
213 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
214 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
215 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
216 $ unionWith (+) time docs'
217
218
219 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
220 initPhyloLevels lvlMax pId =
221 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId lvl empty)) [1..lvlMax]
222
223
224 -- | To init the basic elements of a Phylo
225 toPhyloBase :: [Document] -> TermList -> Config -> Phylo
226 toPhyloBase docs lst conf =
227 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
228 params = defaultPhyloParam { _phyloParam_config = conf }
229 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
230 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
231 $ Phylo foundations
232 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
233 (docsToTimeScaleNb docs)
234 params
235 (fromList $ map (\prd -> (prd, PhyloPeriod prd (initPhyloLevels (phyloLevel conf) prd))) periods)