]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloMaker.hs
Merge branch '104-dev-john-snow-nlp' of ssh://gitlab.iscpif.fr:20022/cgenie/haskell...
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloMaker.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloMaker
3 Description : Maker engine for rebuilding a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 module Gargantext.Core.Viz.Phylo.PhyloMaker where
12
13 import Control.DeepSeq (NFData)
14 import Control.Lens hiding (Level)
15 import Control.Parallel.Strategies (parList, rdeepseq, using)
16 import Data.List (concat, nub, partition, sort, (++), group, intersect, null, sortOn, groupBy, tail)
17 import Data.Map (Map, fromListWith, keys, unionWith, fromList, empty, toList, elems, (!), restrictKeys, foldlWithKey, insert)
18 import Data.Text (Text)
19 import Data.Vector (Vector)
20 import Debug.Trace (trace)
21
22 import Gargantext.Core.Methods.Distances (Distance(Conditional))
23 import Gargantext.Core.Methods.Graph.MaxClique (getMaxCliques)
24 import Gargantext.Core.Text.Context (TermList)
25 import Gargantext.Core.Text.Metrics.FrequentItemSet (fisWithSizePolyMap, fisWithSizePolyMap', Size(..))
26 import Gargantext.Core.Viz.Phylo
27 import Gargantext.Core.Viz.Phylo.PhyloExport (toHorizon)
28 import Gargantext.Core.Viz.Phylo.PhyloTools
29 import Gargantext.Core.Viz.Phylo.SynchronicClustering (synchronicClustering)
30 import Gargantext.Core.Viz.Phylo.TemporalMatching (adaptativeTemporalMatching, constanteTemporalMatching, getNextPeriods, filterDocs, filterDiago, reduceDiagos, toProximity)
31 import Gargantext.Prelude
32
33 import qualified Data.Set as Set
34 import qualified Data.Vector as Vector
35
36 ------------------
37 -- | To Phylo | --
38 ------------------
39
40 {-
41 -- TODO AD
42 data Phylo' = PhyloBase { _phylo'_phyloBase :: Phylo}
43 | PhyloN { _phylo'_phylo1 :: Phylo}
44
45
46 toPhylo' :: Phylo' -> [Document] -> TermList -> PhyloConfig -> Phylo
47 toPhylo' (PhyloN phylo) = toPhylo'
48 toPhylo' (PhyloBase phylo) = toPhylo
49 -}
50
51
52 toPhylo :: Phylo -> Phylo
53 toPhylo phyloStep = trace ("# phylo1 groups " <> show(length $ getGroupsFromLevel 1 phylo1))
54 $ traceToPhylo (phyloLevel $ getConfig phyloStep) $
55 if (phyloLevel $ getConfig phyloStep) > 1
56 then foldl' (\phylo' _ -> synchronicClustering phylo') phyloAncestors [2..(phyloLevel $ getConfig phyloStep)]
57 else phylo1
58 where
59 --------------------------------------
60 phyloAncestors :: Phylo
61 phyloAncestors =
62 if (findAncestors $ getConfig phyloStep)
63 then toHorizon phylo1
64 else phylo1
65 --------------------------------------
66 phylo1 :: Phylo
67 phylo1 = toPhylo1 phyloStep
68 --------------------------------------
69
70
71 --------------------
72 -- | To Phylo 1 | --
73 --------------------
74
75 toGroupsProxi :: Level -> Phylo -> Phylo
76 toGroupsProxi lvl phylo =
77 let proximity = phyloProximity $ getConfig phylo
78 groupsProxi = foldlWithKey (\acc pId pds ->
79 -- 1) process period by period
80 let egos = map (\g -> (getGroupId g, g ^. phylo_groupNgrams))
81 $ elems
82 $ view ( phylo_periodLevels
83 . traverse . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
84 . phylo_levelGroups ) pds
85 next = getNextPeriods ToParents (getTimeFrame $ timeUnit $ getConfig phylo) pId (keys $ phylo ^. phylo_periods)
86 targets = map (\g -> (getGroupId g, g ^. phylo_groupNgrams)) $ getGroupsFromLevelPeriods lvl next phylo
87 docs = filterDocs (phylo ^. phylo_timeDocs) ([pId] ++ next)
88 diagos = filterDiago (phylo ^. phylo_timeCooc) ([pId] ++ next)
89 -- 2) compute the pairs in parallel
90 pairs = map (\(id,ngrams) ->
91 map (\(id',ngrams') ->
92 let nbDocs = (sum . elems) $ filterDocs docs ([idToPrd id, idToPrd id'])
93 diago = reduceDiagos $ filterDiago diagos ([idToPrd id, idToPrd id'])
94 in ((id,id'),toProximity nbDocs diago proximity ngrams ngrams' ngrams')
95 ) $ filter (\(_,ngrams') -> (not . null) $ intersect ngrams ngrams') targets
96 ) egos
97 pairs' = pairs `using` parList rdeepseq
98 in acc ++ (concat pairs')
99 ) [] $ phylo ^. phylo_periods
100 in phylo & phylo_groupsProxi .~ ((traceGroupsProxi . fromList) groupsProxi)
101
102
103 appendGroups :: (a -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup) -> Level -> Map (Date,Date) [a] -> Phylo -> Phylo
104 appendGroups f lvl m phylo = trace ("\n" <> "-- | Append " <> show (length $ concat $ elems m) <> " groups to Level " <> show (lvl) <> "\n")
105 $ over ( phylo_periods
106 . traverse
107 . phylo_periodLevels
108 . traverse)
109 (\phyloLvl -> if lvl == (phyloLvl ^. phylo_levelLevel)
110 then
111 let pId = phyloLvl ^. phylo_levelPeriod
112 pId' = phyloLvl ^. phylo_levelPeriod'
113 phyloCUnit = m ! pId
114 in phyloLvl
115 & phylo_levelGroups .~ (fromList $ foldl (\groups obj ->
116 groups ++ [ (((pId,lvl),length groups)
117 , f obj pId pId' lvl (length groups)
118 (elems $ restrictKeys (phylo ^. phylo_timeCooc) $ periodsToYears [pId]))
119 ] ) [] phyloCUnit)
120 else
121 phyloLvl )
122 phylo
123
124
125 cliqueToGroup :: PhyloClique -> PhyloPeriodId -> (Text,Text) -> Level -> Int -> [Cooc] -> PhyloGroup
126 cliqueToGroup fis pId pId' lvl idx coocs = PhyloGroup pId pId' lvl idx ""
127 (fis ^. phyloClique_support)
128 (fis ^. phyloClique_weight)
129 (fis ^. phyloClique_sources)
130 (fis ^. phyloClique_nodes)
131 (ngramsToCooc (fis ^. phyloClique_nodes) coocs)
132 (1,[0]) -- branchid (lvl,[path in the branching tree])
133 (fromList [("breaks",[0]),("seaLevels",[0])])
134 [] [] [] [] [] [] []
135
136
137 toPhylo1 :: Phylo -> Phylo
138 toPhylo1 phyloStep = case (getSeaElevation phyloStep) of
139 Constante start gap -> constanteTemporalMatching start gap phyloStep
140 Adaptative steps -> adaptativeTemporalMatching steps phyloStep
141
142 -----------------------
143 -- | To Phylo Step | --
144 -----------------------
145
146
147 indexDates' :: Map (Date,Date) [Document] -> Map (Date,Date) (Text,Text)
148 indexDates' m = map (\docs ->
149 let ds = map (\d -> date' d) docs
150 f = if (null ds)
151 then ""
152 else toFstDate ds
153 l = if (null ds)
154 then ""
155 else toLstDate ds
156 in (f,l)) m
157
158
159 -- To build the first phylo step from docs and terms
160 -- QL: backend entre phyloBase et phyloClique
161 toPhyloStep :: [Document] -> TermList -> PhyloConfig -> Phylo
162 toPhyloStep docs lst conf = case (getSeaElevation phyloBase) of
163 Constante _ _ -> appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
164 Adaptative _ -> toGroupsProxi 1
165 $ appendGroups cliqueToGroup 1 phyloClique (updatePeriods (indexDates' docs') phyloBase)
166 where
167 --------------------------------------
168 phyloClique :: Map (Date,Date) [PhyloClique]
169 phyloClique = toPhyloClique phyloBase docs'
170 --------------------------------------
171 docs' :: Map (Date,Date) [Document]
172 -- QL: Time Consuming here
173 docs' = groupDocsByPeriodRec date (getPeriodIds phyloBase) (sortOn date docs) empty
174 --------------------------------------
175 phyloBase :: Phylo
176 phyloBase = toPhyloBase docs lst conf
177 --------------------------------------
178
179 ---------------------------
180 -- | Frequent Item Set | --
181 ---------------------------
182
183
184 -- To apply a filter with the possibility of keeping some periods non empty (keep : True|False)
185 filterClique :: Bool -> Int -> (Int -> [PhyloClique] -> [PhyloClique]) -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
186 filterClique keep thr f m = case keep of
187 False -> map (\l -> f thr l) m
188 True -> map (\l -> keepFilled (f) thr l) m
189
190
191 -- To filter Fis with small Support
192 filterCliqueBySupport :: Int -> [PhyloClique] -> [PhyloClique]
193 filterCliqueBySupport thr l = filter (\clq -> (clq ^. phyloClique_support) >= thr) l
194
195
196 -- To filter Fis with small Clique size
197 filterCliqueBySize :: Int -> [PhyloClique] -> [PhyloClique]
198 filterCliqueBySize thr l = filter (\clq -> (length $ clq ^. phyloClique_nodes) >= thr) l
199
200
201 -- To filter nested Fis
202 filterCliqueByNested :: Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
203 filterCliqueByNested m =
204 let clq = map (\l ->
205 foldl (\mem f -> if (any (\f' -> isNested (f' ^. phyloClique_nodes) (f ^. phyloClique_nodes)) mem)
206 then mem
207 else
208 let fMax = filter (\f' -> not $ isNested (f ^. phyloClique_nodes) (f' ^. phyloClique_nodes)) mem
209 in fMax ++ [f] ) [] l)
210 $ elems m
211 clq' = clq `using` parList rdeepseq
212 in fromList $ zip (keys m) clq'
213
214
215 -- | To transform a time map of docs into a time map of Fis with some filters
216 toPhyloClique :: Phylo -> Map (Date, Date) [Document] -> Map (Date,Date) [PhyloClique]
217 toPhyloClique phylo phyloDocs = case (clique $ getConfig phylo) of
218 Fis s s' -> -- traceFis "Filtered Fis"
219 filterCliqueByNested
220 {- \$ traceFis "Filtered by clique size" -}
221 $ filterClique True s' (filterCliqueBySize)
222 {- \$ traceFis "Filtered by support" -}
223 $ filterClique True s (filterCliqueBySupport)
224 {- \$ traceFis "Unfiltered Fis" -}
225 phyloClique
226 MaxClique s _ _ -> filterClique True s (filterCliqueBySize)
227 phyloClique
228 where
229 --------------------------------------
230 phyloClique :: Map (Date,Date) [PhyloClique]
231 phyloClique = case (clique $ getConfig phylo) of
232 Fis _ _ ->
233 let fis = map (\(prd,docs) ->
234 case (corpusParser $ getConfig phylo) of
235 Csv' _ -> let lst = toList
236 $ fisWithSizePolyMap' (Segment 1 20) 1 (map (\d -> (ngramsToIdx (text d) (getRoots phylo), (weight d, (sourcesToIdx (sources d) (getSources phylo))))) docs)
237 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) ((fst . snd) f) prd ((fst . snd . snd) f) (((snd . snd . snd) f))) lst)
238 _ -> let lst = toList
239 $ fisWithSizePolyMap (Segment 1 20) 1 (map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs)
240 in (prd, map (\f -> PhyloClique (Set.toList $ fst f) (snd f) prd (Just $ fromIntegral $ snd f) []) lst)
241 )
242 $ toList phyloDocs
243 fis' = fis `using` parList rdeepseq
244 in fromList fis'
245 MaxClique _ thr filterType ->
246 let mcl = map (\(prd,docs) ->
247 let cooc = map round
248 $ foldl sumCooc empty
249 $ map listToMatrix
250 $ map (\d -> ngramsToIdx (text d) (getRoots phylo)) docs
251 in (prd, map (\cl -> PhyloClique cl 0 prd Nothing []) $ getMaxCliques filterType Conditional thr cooc))
252 $ toList phyloDocs
253 mcl' = mcl `using` parList rdeepseq
254 in fromList mcl'
255 --------------------------------------
256
257 -- dev viz graph maxClique getMaxClique
258
259
260 --------------------
261 -- | Coocurency | --
262 --------------------
263
264
265 -- To transform the docs into a time map of coocurency matrix
266 docsToTimeScaleCooc :: [Document] -> Vector Ngrams -> Map Date Cooc
267 docsToTimeScaleCooc docs fdt =
268 let mCooc = fromListWith sumCooc
269 $ map (\(_d,l) -> (_d, listToMatrix l))
270 $ map (\doc -> (date doc, sort $ ngramsToIdx (text doc) fdt)) docs
271 mCooc' = fromList
272 $ map (\t -> (t,empty))
273 $ toTimeScale (map date docs) 1
274 in trace ("\n" <> "-- | Build the coocurency matrix for " <> show (length $ keys mCooc') <> " unit of time" <> "\n")
275 $ unionWith sumCooc mCooc mCooc'
276
277
278 -----------------------
279 -- | to Phylo Base | --
280 -----------------------
281 -- TODO anoe
282 groupDocsByPeriodRec :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc] -> Map (date, date) [doc]
283 groupDocsByPeriodRec f prds docs acc =
284 if ((null prds) || (null docs))
285 then acc
286 else
287 let prd = head' "groupBy" prds
288 docs' = partition (\d -> (f d >= fst prd) && (f d <= snd prd)) docs
289 in groupDocsByPeriodRec f (tail prds) (snd docs') (insert prd (fst docs') acc)
290
291
292 -- To group a list of Documents by fixed periods
293 groupDocsByPeriod' :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
294 groupDocsByPeriod' f pds docs =
295 let docs' = groupBy (\d d' -> f d == f d') $ sortOn f docs
296 periods = map (inPeriode f docs') pds
297 periods' = periods `using` parList rdeepseq
298 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length pds) <> " periods" <> "\n")
299 $ fromList $ zip pds periods'
300 where
301 --------------------------------------
302 inPeriode :: Ord b => (t -> b) -> [[t]] -> (b, b) -> [t]
303 inPeriode f' h (start,end) =
304 concat $ fst $ partition (\d -> f' (head' "inPeriode" d) >= start && f' (head' "inPeriode" d) <= end) h
305
306
307
308 -- To group a list of Documents by fixed periods
309 groupDocsByPeriod :: (NFData doc, Ord date, Enum date) => (doc -> date) -> [(date,date)] -> [doc] -> Map (date, date) [doc]
310 groupDocsByPeriod _ _ [] = panic "[ERR][Viz.Phylo.PhyloMaker] Empty [Documents] can not have any periods"
311 groupDocsByPeriod f pds es =
312 let periods = map (inPeriode f es) pds
313 periods' = periods `using` parList rdeepseq
314
315 in trace ("\n" <> "-- | Group " <> show(length es) <> " docs by " <> show(length pds) <> " periods" <> "\n")
316 $ fromList $ zip pds periods'
317 where
318 --------------------------------------
319 inPeriode :: Ord b => (t -> b) -> [t] -> (b, b) -> [t]
320 inPeriode f' h (start,end) =
321 fst $ partition (\d -> f' d >= start && f' d <= end) h
322 --------------------------------------
323
324
325 docsToTermFreq :: [Document] -> Vector Ngrams -> Map Int Double
326 docsToTermFreq docs fdt =
327 let nbDocs = fromIntegral $ length docs
328 freqs = map (/(nbDocs))
329 $ fromList
330 $ map (\lst -> (head' "docsToTermFreq" lst, fromIntegral $ length lst))
331 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) docs
332 sumFreqs = sum $ elems freqs
333 in map (/sumFreqs) freqs
334
335 docsToLastTermFreq :: Int -> [Document] -> Vector Ngrams -> Map Int Double
336 docsToLastTermFreq n docs fdt =
337 let last = take n $ reverse $ sort $ map date docs
338 nbDocs = fromIntegral $ length $ filter (\d -> elem (date d) last) docs
339 freqs = map (/(nbDocs))
340 $ fromList
341 $ map (\lst -> (head' "docsToLastTermFreq" lst, fromIntegral $ length lst))
342 $ group $ sort $ concat $ map (\d -> nub $ ngramsToIdx (text d) fdt) $ filter (\d -> elem (date d) last) docs
343 sumFreqs = sum $ elems freqs
344 in map (/sumFreqs) freqs
345
346
347 -- To count the number of docs by unit of time
348 docsToTimeScaleNb :: [Document] -> Map Date Double
349 docsToTimeScaleNb docs =
350 let docs' = fromListWith (+) $ map (\d -> (date d,1)) docs
351 time = fromList $ map (\t -> (t,0)) $ toTimeScale (keys docs') 1
352 in trace ("\n" <> "-- | Group " <> show(length docs) <> " docs by " <> show(length time) <> " unit of time" <> "\n")
353 $ unionWith (+) time docs'
354
355
356 initPhyloLevels :: Int -> PhyloPeriodId -> Map PhyloLevelId PhyloLevel
357 initPhyloLevels lvlMax pId =
358 fromList $ map (\lvl -> ((pId,lvl),PhyloLevel pId ("","") lvl empty)) [1..lvlMax]
359
360
361
362 -- To init the basic elements of a Phylo
363 toPhyloBase :: [Document] -> TermList -> PhyloConfig -> Phylo
364 toPhyloBase docs lst conf =
365 let foundations = PhyloFoundations (Vector.fromList $ nub $ concat $ map text docs) lst
366 docsSources = PhyloSources (Vector.fromList $ nub $ concat $ map sources docs)
367 params = defaultPhyloParam { _phyloParam_config = conf }
368 periods = toPeriods (sort $ nub $ map date docs) (getTimePeriod $ timeUnit conf) (getTimeStep $ timeUnit conf)
369 in trace ("\n" <> "-- | Create PhyloBase out of " <> show(length docs) <> " docs \n")
370 $ Phylo foundations
371 docsSources
372 (docsToTimeScaleCooc docs (foundations ^. foundations_roots))
373 (docsToTimeScaleNb docs)
374 (docsToTermFreq docs (foundations ^. foundations_roots))
375 (docsToLastTermFreq (getTimePeriod $ timeUnit conf) docs (foundations ^. foundations_roots))
376 empty
377 empty
378 params
379 (fromList $ map (\prd -> (prd, PhyloPeriod prd ("","") (initPhyloLevels 1 prd))) periods)