]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloTools.hs
working on perf
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.PhyloTools where
18
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
25
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
28 import Text.Printf
29
30
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37
38 ------------
39 -- | Io | --
40 ------------
41
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
44 printIOMsg msg =
45 putStrLn ( "\n"
46 <> "------------"
47 <> "\n"
48 <> "-- | " <> msg <> "\n" )
49
50
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
53 printIOComment cmt =
54 putStrLn ( "\n" <> cmt <> "\n" )
55
56
57 --------------
58 -- | Misc | --
59 --------------
60
61
62 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
63 roundToStr = printf "%0.*f"
64
65
66 countSup :: Double -> [Double] -> Int
67 countSup s l = length $ filter (>s) l
68
69
70 elemIndex' :: Eq a => a -> [a] -> Int
71 elemIndex' e l = case (List.elemIndex e l) of
72 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
73 Just i -> i
74
75
76 ---------------------
77 -- | Foundations | --
78 ---------------------
79
80
81 -- | Is this Ngrams a Foundations Root ?
82 isRoots :: Ngrams -> Vector Ngrams -> Bool
83 isRoots n ns = Vector.elem n ns
84
85 -- | To transform a list of nrams into a list of foundation's index
86 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
87 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
88
89 -- | To transform a list of Ngrams Indexes into a Label
90 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
91 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
92
93
94 -- | To transform a list of Ngrams Indexes into a list of Text
95 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
96 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
97
98
99 --------------
100 -- | Time | --
101 --------------
102
103 -- | To transform a list of periods into a set of Dates
104 periodsToYears :: [(Date,Date)] -> Set Date
105 periodsToYears periods = (Set.fromList . sort . concat)
106 $ map (\(d,d') -> [d..d']) periods
107
108
109 findBounds :: [Date] -> (Date,Date)
110 findBounds dates =
111 let dates' = sort dates
112 in (head' "findBounds" dates', last' "findBounds" dates')
113
114
115 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
116 toPeriods dates p s =
117 let (start,end) = findBounds dates
118 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
119 $ chunkAlong p s [start .. end]
120
121
122 -- | Get a regular & ascendante timeScale from a given list of dates
123 toTimeScale :: [Date] -> Int -> [Date]
124 toTimeScale dates step =
125 let (start,end) = findBounds dates
126 in [start, (start + step) .. end]
127
128
129 getTimeStep :: TimeUnit -> Int
130 getTimeStep time = case time of
131 Year _ s _ -> s
132
133 getTimePeriod :: TimeUnit -> Int
134 getTimePeriod time = case time of
135 Year p _ _ -> p
136
137 getTimeFrame :: TimeUnit -> Int
138 getTimeFrame time = case time of
139 Year _ _ f -> f
140
141 -------------
142 -- | Fis | --
143 -------------
144
145
146 -- | To find if l' is nested in l
147 isNested :: Eq a => [a] -> [a] -> Bool
148 isNested l l'
149 | null l' = True
150 | length l' > length l = False
151 | (union l l') == l = True
152 | otherwise = False
153
154
155 -- | To filter Fis with small Support but by keeping non empty Periods
156 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
157 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
158 then keepFilled f (thr - 1) l
159 else f thr l
160
161
162 traceClique :: Map (Date, Date) [PhyloFis] -> String
163 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
164 where
165 --------------------------------------
166 cliques :: [Double]
167 cliques = sort $ map (fromIntegral . size . _phyloFis_clique) $ concat $ elems mFis
168 --------------------------------------
169
170
171 traceSupport :: Map (Date, Date) [PhyloFis] -> String
172 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
173 where
174 --------------------------------------
175 supports :: [Double]
176 supports = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems mFis
177 --------------------------------------
178
179
180 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
181 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
182 <> "Support : " <> (traceSupport mFis) <> "\n"
183 <> "Clique : " <> (traceClique mFis) <> "\n" ) mFis
184
185
186 -------------------------
187 -- | Contextual unit | --
188 -------------------------
189
190
191 getFisSupport :: ContextualUnit -> Int
192 getFisSupport unit = case unit of
193 Fis s _ -> s
194 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a support")
195
196 getFisSize :: ContextualUnit -> Int
197 getFisSize unit = case unit of
198 Fis _ s -> s
199 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a clique size")
200
201
202 --------------
203 -- | Cooc | --
204 --------------
205
206 listToCombi' :: [a] -> [(a,a)]
207 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
208
209 listToEqual' :: Eq a => [a] -> [(a,a)]
210 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
211
212 listToKeys :: Eq a => [a] -> [(a,a)]
213 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
214
215 listToMatrix :: [Int] -> Map (Int,Int) Double
216 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
217
218 sumCooc :: Cooc -> Cooc -> Cooc
219 sumCooc cooc cooc' = unionWith (+) cooc cooc'
220
221 getTrace :: Cooc -> Double
222 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
223
224
225 -- | To build the local cooc matrix of each phylogroup
226 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
227 ngramsToCooc ngrams coocs =
228 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
229 pairs = listToKeys ngrams
230 in filterWithKey (\k _ -> elem k pairs) cooc
231
232
233 --------------------
234 -- | PhyloGroup | --
235 --------------------
236
237 getGroupId :: PhyloGroup -> PhyloGroupId
238 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
239
240 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
241 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
242
243 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
244 getPeriodPointers fil group =
245 case fil of
246 ToChilds -> group ^. phylo_groupPeriodChilds
247 ToParents -> group ^. phylo_groupPeriodParents
248
249 filterProximity :: Proximity -> Double -> Double -> Bool
250 filterProximity proximity thr local =
251 case proximity of
252 WeightedLogJaccard _ _ _ -> local >= thr
253 Hamming -> undefined
254
255 filterPointers :: Filiation -> PointerType -> Proximity -> Double -> PhyloGroup -> PhyloGroup
256 filterPointers fil pty proximity thr group =
257 case pty of
258 TemporalPointer -> case fil of
259 ToChilds -> group & phylo_groupPeriodChilds %~ (filter (\(_,w) -> filterProximity proximity thr w))
260 ToParents -> group & phylo_groupPeriodParents %~ (filter (\(_,w) -> filterProximity proximity thr w))
261 LevelPointer -> undefined
262
263
264 ---------------
265 -- | Phylo | --
266 ---------------
267
268 addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
269 addPointers group fil pty pointers =
270 case pty of
271 TemporalPointer -> case fil of
272 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
273 ToParents -> group & phylo_groupPeriodParents .~ pointers
274 LevelPointer -> case fil of
275 ToChilds -> group & phylo_groupLevelChilds .~ pointers
276 ToParents -> group & phylo_groupLevelParents .~ pointers
277
278
279 getPeriodIds :: Phylo -> [(Date,Date)]
280 getPeriodIds phylo = sortOn fst
281 $ keys
282 $ phylo ^. phylo_periods
283
284 getLevelParentId :: PhyloGroup -> PhyloGroupId
285 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
286
287 getLastLevel :: Phylo -> Level
288 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
289
290 getLevels :: Phylo -> [Level]
291 getLevels phylo = nub
292 $ map snd
293 $ keys $ view ( phylo_periods
294 . traverse
295 . phylo_periodLevels ) phylo
296
297
298 getConfig :: Phylo -> Config
299 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
300
301
302 getRoots :: Phylo -> Vector Ngrams
303 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
304
305 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
306 phyloToLastBranches phylo = elems
307 $ fromListWith (++)
308 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
309 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
310
311 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
312 getGroupsFromLevel lvl phylo =
313 elems $ view ( phylo_periods
314 . traverse
315 . phylo_periodLevels
316 . traverse
317 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
318 . phylo_levelGroups ) phylo
319
320
321 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
322 updatePhyloGroups lvl m phylo =
323 over ( phylo_periods
324 . traverse
325 . phylo_periodLevels
326 . traverse
327 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
328 . phylo_levelGroups
329 . traverse
330 ) (\group ->
331 let id = getGroupId group
332 in
333 if member id m
334 then m ! id
335 else group ) phylo
336
337
338 traceToPhylo :: Level -> Phylo -> Phylo
339 traceToPhylo lvl phylo =
340 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
341 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
342 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
343
344 --------------------
345 -- | Clustering | --
346 --------------------
347
348 relatedComponents :: Ord a => [[a]] -> [[a]]
349 relatedComponents graph = foldl' (\acc groups ->
350 if (null acc)
351 then acc ++ [groups]
352 else
353 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
354 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
355
356
357 traceSynchronyEnd :: Phylo -> Phylo
358 traceSynchronyEnd phylo =
359 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
360 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
361 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
362 <> "\n" ) phylo
363
364 traceSynchronyStart :: Phylo -> Phylo
365 traceSynchronyStart phylo =
366 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
367 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
368 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
369 <> "\n" ) phylo
370
371
372 -------------------
373 -- | Proximity | --
374 -------------------
375
376 getSensibility :: Proximity -> Double
377 getSensibility proxi = case proxi of
378 WeightedLogJaccard s _ _ -> s
379 Hamming -> undefined
380
381 getThresholdInit :: Proximity -> Double
382 getThresholdInit proxi = case proxi of
383 WeightedLogJaccard _ t _ -> t
384 Hamming -> undefined
385
386 getThresholdStep :: Proximity -> Double
387 getThresholdStep proxi = case proxi of
388 WeightedLogJaccard _ _ s -> s
389 Hamming -> undefined
390
391
392 traceBranchMatching :: Proximity -> Double -> [PhyloGroup] -> [PhyloGroup]
393 traceBranchMatching proxi thr groups = case proxi of
394 WeightedLogJaccard _ i s -> trace (
395 roundToStr 2 thr <> " "
396 <> foldl (\acc _ -> acc <> ".") "." [(10*i),(10*i + 10*s)..(10*thr)]
397 <> " " <> show(length groups) <> " groups"
398 ) groups
399 Hamming -> undefined
400
401 ----------------
402 -- | Branch | --
403 ----------------
404
405 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
406 intersectInit acc lst lst' =
407 if (null lst) || (null lst')
408 then acc
409 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
410 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
411 else acc
412
413 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
414 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
415
416 ngramsInBranches :: [[PhyloGroup]] -> [Int]
417 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
418
419
420 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
421 traceMatchSuccess thr qua qua' nextBranches =
422 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
423 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
424 <> ",(1.." <> show (length nextBranches) <> ")]"
425 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
426 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
427 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
428
429
430 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
431 traceMatchFailure thr qua qua' branches =
432 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
433 <> ",(1.." <> show (length branches) <> ")]"
434 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
435 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
436 ) branches
437
438
439 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
440 traceMatchNoSplit branches =
441 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
442 <> ",(1.." <> show (length branches) <> ")]"
443 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
444 <> " - unable to split in smaller branches" <> "\n"
445 ) branches
446
447
448 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
449 traceMatchLimit branches =
450 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
451 <> ",(1.." <> show (length branches) <> ")]"
452 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
453 <> " - unable to increase the threshold above 1" <> "\n"
454 ) branches
455
456
457 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
458 traceMatchEnd groups =
459 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
460 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
461
462
463 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
464 traceTemporalMatching groups =
465 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups