]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloTools.hs
add the ancestors
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.PhyloTools where
18
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, maximum, group)
21 import Data.Set (Set, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
25
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
28 import Text.Printf
29
30
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37 import qualified Data.Map as Map
38
39 ------------
40 -- | Io | --
41 ------------
42
43 -- | To print an important message as an IO()
44 printIOMsg :: String -> IO ()
45 printIOMsg msg =
46 putStrLn ( "\n"
47 <> "------------"
48 <> "\n"
49 <> "-- | " <> msg <> "\n" )
50
51
52 -- | To print a comment as an IO()
53 printIOComment :: String -> IO ()
54 printIOComment cmt =
55 putStrLn ( "\n" <> cmt <> "\n" )
56
57
58 --------------
59 -- | Misc | --
60 --------------
61
62
63 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
64 roundToStr = printf "%0.*f"
65
66
67 countSup :: Double -> [Double] -> Int
68 countSup s l = length $ filter (>s) l
69
70
71 dropByIdx :: Int -> [a] -> [a]
72 dropByIdx k l = take k l ++ drop (k+1) l
73
74
75 elemIndex' :: Eq a => a -> [a] -> Int
76 elemIndex' e l = case (List.elemIndex e l) of
77 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
78 Just i -> i
79
80
81 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
82 commonPrefix lst lst' acc =
83 if (null lst || null lst')
84 then acc
85 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
86 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
87 else acc
88
89
90 ---------------------
91 -- | Foundations | --
92 ---------------------
93
94
95 -- | Is this Ngrams a Foundations Root ?
96 isRoots :: Ngrams -> Vector Ngrams -> Bool
97 isRoots n ns = Vector.elem n ns
98
99 -- | To transform a list of nrams into a list of foundation's index
100 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
101 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
102
103 -- | To transform a list of Ngrams Indexes into a Label
104 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
105 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
106
107
108 -- | To transform a list of Ngrams Indexes into a list of Text
109 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
110 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
111
112
113 --------------
114 -- | Time | --
115 --------------
116
117 -- | To transform a list of periods into a set of Dates
118 periodsToYears :: [(Date,Date)] -> Set Date
119 periodsToYears periods = (Set.fromList . sort . concat)
120 $ map (\(d,d') -> [d..d']) periods
121
122
123 findBounds :: [Date] -> (Date,Date)
124 findBounds dates =
125 let dates' = sort dates
126 in (head' "findBounds" dates', last' "findBounds" dates')
127
128
129 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
130 toPeriods dates p s =
131 let (start,end) = findBounds dates
132 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
133 $ chunkAlong p s [start .. end]
134
135
136 -- | Get a regular & ascendante timeScale from a given list of dates
137 toTimeScale :: [Date] -> Int -> [Date]
138 toTimeScale dates step =
139 let (start,end) = findBounds dates
140 in [start, (start + step) .. end]
141
142
143 getTimeStep :: TimeUnit -> Int
144 getTimeStep time = case time of
145 Year _ s _ -> s
146
147 getTimePeriod :: TimeUnit -> Int
148 getTimePeriod time = case time of
149 Year p _ _ -> p
150
151 getTimeFrame :: TimeUnit -> Int
152 getTimeFrame time = case time of
153 Year _ _ f -> f
154
155 -------------
156 -- | Fis | --
157 -------------
158
159
160 -- | To find if l' is nested in l
161 isNested :: Eq a => [a] -> [a] -> Bool
162 isNested l l'
163 | null l' = True
164 | length l' > length l = False
165 | (union l l') == l = True
166 | otherwise = False
167
168
169 -- | To filter Fis with small Support but by keeping non empty Periods
170 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
171 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
172 then keepFilled f (thr - 1) l
173 else f thr l
174
175
176 traceClique :: Map (Date, Date) [PhyloClique] -> String
177 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
178 where
179 --------------------------------------
180 cliques :: [Double]
181 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
182 --------------------------------------
183
184
185 traceSupport :: Map (Date, Date) [PhyloClique] -> String
186 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
187 where
188 --------------------------------------
189 supports :: [Double]
190 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
191 --------------------------------------
192
193
194 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
195 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
196 <> "Support : " <> (traceSupport mFis) <> "\n"
197 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
198
199
200 ---------------
201 -- | Clique| --
202 ---------------
203
204
205 getCliqueSupport :: Clique -> Int
206 getCliqueSupport unit = case unit of
207 Fis s _ -> s
208 MaxClique _ -> 0
209
210 getCliqueSize :: Clique -> Int
211 getCliqueSize unit = case unit of
212 Fis _ s -> s
213 MaxClique s -> s
214
215
216 --------------
217 -- | Cooc | --
218 --------------
219
220 listToCombi' :: [a] -> [(a,a)]
221 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
222
223 listToEqual' :: Eq a => [a] -> [(a,a)]
224 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
225
226 listToKeys :: Eq a => [a] -> [(a,a)]
227 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
228
229 listToMatrix :: [Int] -> Map (Int,Int) Double
230 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
231
232 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
233 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
234
235 listToSeq :: Eq a => [a] -> [(a,a)]
236 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
237
238 sumCooc :: Cooc -> Cooc -> Cooc
239 sumCooc cooc cooc' = unionWith (+) cooc cooc'
240
241 getTrace :: Cooc -> Double
242 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
243
244 coocToDiago :: Cooc -> Cooc
245 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
246
247 -- | To build the local cooc matrix of each phylogroup
248 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
249 ngramsToCooc ngrams coocs =
250 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
251 pairs = listToKeys ngrams
252 in filterWithKey (\k _ -> elem k pairs) cooc
253
254
255 --------------------
256 -- | PhyloGroup | --
257 --------------------
258
259 getGroupId :: PhyloGroup -> PhyloGroupId
260 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
261
262 idToPrd :: PhyloGroupId -> PhyloPeriodId
263 idToPrd id = (fst . fst) id
264
265 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
266 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
267
268 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
269 getPeriodPointers fil g =
270 case fil of
271 ToChilds -> g ^. phylo_groupPeriodChilds
272 ToParents -> g ^. phylo_groupPeriodParents
273
274 filterProximity :: Proximity -> Double -> Double -> Bool
275 filterProximity proximity thr local =
276 case proximity of
277 WeightedLogJaccard _ -> local >= thr
278 Hamming -> undefined
279
280 getProximityName :: Proximity -> String
281 getProximityName proximity =
282 case proximity of
283 WeightedLogJaccard _ -> "WLJaccard"
284 Hamming -> "Hamming"
285
286 ---------------
287 -- | Phylo | --
288 ---------------
289
290 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
291 addPointers fil pty pointers g =
292 case pty of
293 TemporalPointer -> case fil of
294 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
295 ToParents -> g & phylo_groupPeriodParents .~ pointers
296 LevelPointer -> case fil of
297 ToChilds -> g & phylo_groupLevelChilds .~ pointers
298 ToParents -> g & phylo_groupLevelParents .~ pointers
299
300
301 getPeriodIds :: Phylo -> [(Date,Date)]
302 getPeriodIds phylo = sortOn fst
303 $ keys
304 $ phylo ^. phylo_periods
305
306 getLevelParentId :: PhyloGroup -> PhyloGroupId
307 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
308
309 getLastLevel :: Phylo -> Level
310 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
311
312 getLevels :: Phylo -> [Level]
313 getLevels phylo = nub
314 $ map snd
315 $ keys $ view ( phylo_periods
316 . traverse
317 . phylo_periodLevels ) phylo
318
319 getSeaElevation :: Phylo -> SeaElevation
320 getSeaElevation phylo = seaElevation (getConfig phylo)
321
322
323 getConfig :: Phylo -> Config
324 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
325
326
327 getRoots :: Phylo -> Vector Ngrams
328 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
329
330 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
331 phyloToLastBranches phylo = elems
332 $ fromListWith (++)
333 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
334 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
335
336 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
337 getGroupsFromLevel lvl phylo =
338 elems $ view ( phylo_periods
339 . traverse
340 . phylo_periodLevels
341 . traverse
342 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
343 . phylo_levelGroups ) phylo
344
345
346 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
347 getGroupsFromLevelPeriods lvl periods phylo =
348 elems $ view ( phylo_periods
349 . traverse
350 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
351 . phylo_periodLevels
352 . traverse
353 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
354 . phylo_levelGroups ) phylo
355
356
357 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
358 getGroupsFromPeriods lvl periods =
359 elems $ view ( traverse
360 . phylo_periodLevels
361 . traverse
362 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
363 . phylo_levelGroups ) periods
364
365
366 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
367 updatePhyloGroups lvl m phylo =
368 over ( phylo_periods
369 . traverse
370 . phylo_periodLevels
371 . traverse
372 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
373 . phylo_levelGroups
374 . traverse
375 ) (\g ->
376 let id = getGroupId g
377 in
378 if member id m
379 then m ! id
380 else g ) phylo
381
382
383 traceToPhylo :: Level -> Phylo -> Phylo
384 traceToPhylo lvl phylo =
385 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
386 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
387 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
388
389 --------------------
390 -- | Clustering | --
391 --------------------
392
393 mergeBranchIds :: [[Int]] -> [Int]
394 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
395 where
396 -- | 2) find the most Up Left ids in the hierarchy of similarity
397 -- mostUpLeft :: [[Int]] -> [[Int]]
398 -- mostUpLeft ids' =
399 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
400 -- inf = (fst . minimum) groupIds
401 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
402 -- | 1) find the most frequent ids
403 mostFreq' :: [[Int]] -> [[Int]]
404 mostFreq' ids' =
405 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
406 sup = (fst . maximum) groupIds
407 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
408
409
410 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
411 mergeMeta bId groups =
412 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
413 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
414
415
416 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
417 groupsToBranches groups =
418 -- | run the related component algorithm
419 let egos = map (\g -> [getGroupId g]
420 ++ (map fst $ g ^. phylo_groupPeriodParents)
421 ++ (map fst $ g ^. phylo_groupPeriodChilds)
422 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
423 graph = relatedComponents egos
424 -- | update each group's branch id
425 in map (\ids ->
426 let groups' = elems $ restrictKeys groups (Set.fromList ids)
427 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
428 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
429
430 relatedComponents :: Ord a => [[a]] -> [[a]]
431 relatedComponents graph = foldl' (\acc groups ->
432 if (null acc)
433 then acc ++ [groups]
434 else
435 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
436 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
437
438 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
439 toRelatedComponents nodes edges =
440 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
441 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
442 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
443
444
445 traceSynchronyEnd :: Phylo -> Phylo
446 traceSynchronyEnd phylo =
447 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
448 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
449 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
450 <> "\n" ) phylo
451
452 traceSynchronyStart :: Phylo -> Phylo
453 traceSynchronyStart phylo =
454 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
455 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
456 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
457 <> "\n" ) phylo
458
459
460 -------------------
461 -- | Proximity | --
462 -------------------
463
464 getSensibility :: Proximity -> Double
465 getSensibility proxi = case proxi of
466 WeightedLogJaccard s -> s
467 Hamming -> undefined
468
469 ----------------
470 -- | Branch | --
471 ----------------
472
473 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
474 intersectInit acc lst lst' =
475 if (null lst) || (null lst')
476 then acc
477 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
478 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
479 else acc
480
481 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
482 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
483
484 ngramsInBranches :: [[PhyloGroup]] -> [Int]
485 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
486
487
488 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
489 traceMatchSuccess thr qua qua' nextBranches =
490 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
491 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
492 <> ",(1.." <> show (length nextBranches) <> ")]"
493 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
494 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
495 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
496
497
498 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
499 traceMatchFailure thr qua qua' branches =
500 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
501 <> ",(1.." <> show (length branches) <> ")]"
502 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
503 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
504 ) branches
505
506
507 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
508 traceMatchNoSplit branches =
509 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
510 <> ",(1.." <> show (length branches) <> ")]"
511 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
512 <> " - unable to split in smaller branches" <> "\n"
513 ) branches
514
515
516 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
517 traceMatchLimit branches =
518 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
519 <> ",(1.." <> show (length branches) <> ")]"
520 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
521 <> " - unable to increase the threshold above 1" <> "\n"
522 ) branches
523
524
525 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
526 traceMatchEnd groups =
527 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
528 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
529
530
531 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
532 traceTemporalMatching groups =
533 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
534
535
536 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
537 traceGroupsProxi m =
538 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m