2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, maximum, group)
17 import Data.Set (Set, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
19 import Data.String (String)
20 import Data.Text (Text)
22 import Prelude (floor)
24 import Gargantext.Prelude
25 import Gargantext.Core.Viz.AdaptativePhylo
29 import Debug.Trace (trace)
30 import Control.Lens hiding (Level)
32 import qualified Data.Vector as Vector
33 import qualified Data.List as List
34 import qualified Data.Set as Set
35 import qualified Data.Map as Map
36 import qualified Data.Text as Text
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
48 <> "-- | " <> msg <> "\n" )
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
54 putStrLn ( "\n" <> cmt <> "\n" )
61 -- truncate' :: Double -> Int -> Double
62 -- truncate' x n = (fromIntegral (floor (x * t))) / t
65 truncate' :: Double -> Int -> Double
66 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
72 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
73 roundToStr = printf "%0.*f"
76 countSup :: Double -> [Double] -> Int
77 countSup s l = length $ filter (>s) l
80 dropByIdx :: Int -> [a] -> [a]
81 dropByIdx k l = take k l ++ drop (k+1) l
84 elemIndex' :: Eq a => a -> [a] -> Int
85 elemIndex' e l = case (List.elemIndex e l) of
86 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
90 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
91 commonPrefix lst lst' acc =
92 if (null lst || null lst')
94 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
95 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
100 -- | Foundations | --
101 ---------------------
104 -- | Is this Ngrams a Foundations Root ?
105 isRoots :: Ngrams -> Vector Ngrams -> Bool
106 isRoots n ns = Vector.elem n ns
108 -- | To transform a list of nrams into a list of foundation's index
109 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
110 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
112 -- | To transform a list of Ngrams Indexes into a Label
113 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
114 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
116 idxToLabel :: [Int] -> String
117 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
119 idxToLabel' :: [Double] -> String
120 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
122 -- | To transform a list of Ngrams Indexes into a list of Text
123 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
124 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
131 -- | To transform a list of periods into a set of Dates
132 periodsToYears :: [(Date,Date)] -> Set Date
133 periodsToYears periods = (Set.fromList . sort . concat)
134 $ map (\(d,d') -> [d..d']) periods
137 findBounds :: [Date] -> (Date,Date)
139 let dates' = sort dates
140 in (head' "findBounds" dates', last' "findBounds" dates')
143 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
144 toPeriods dates p s =
145 let (start,end) = findBounds dates
146 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
147 $ chunkAlong p s [start .. end]
150 -- | Get a regular & ascendante timeScale from a given list of dates
151 toTimeScale :: [Date] -> Int -> [Date]
152 toTimeScale dates step =
153 let (start,end) = findBounds dates
154 in [start, (start + step) .. end]
157 getTimeStep :: TimeUnit -> Int
158 getTimeStep time = case time of
161 getTimePeriod :: TimeUnit -> Int
162 getTimePeriod time = case time of
165 getTimeFrame :: TimeUnit -> Int
166 getTimeFrame time = case time of
174 -- | To find if l' is nested in l
175 isNested :: Eq a => [a] -> [a] -> Bool
178 | length l' > length l = False
179 | (union l l') == l = True
183 -- | To filter Fis with small Support but by keeping non empty Periods
184 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
185 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
186 then keepFilled f (thr - 1) l
190 traceClique :: Map (Date, Date) [PhyloClique] -> String
191 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
193 --------------------------------------
195 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
196 --------------------------------------
199 traceSupport :: Map (Date, Date) [PhyloClique] -> String
200 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
202 --------------------------------------
204 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
205 --------------------------------------
208 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
209 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
210 <> "Support : " <> (traceSupport mFis) <> "\n"
211 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
219 getCliqueSupport :: Clique -> Int
220 getCliqueSupport unit = case unit of
224 getCliqueSize :: Clique -> Int
225 getCliqueSize unit = case unit of
234 listToCombi' :: [a] -> [(a,a)]
235 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
237 listToEqual' :: Eq a => [a] -> [(a,a)]
238 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
240 listToKeys :: Eq a => [a] -> [(a,a)]
241 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
243 listToMatrix :: [Int] -> Map (Int,Int) Double
244 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
246 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
247 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
249 listToSeq :: Eq a => [a] -> [(a,a)]
250 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
252 sumCooc :: Cooc -> Cooc -> Cooc
253 sumCooc cooc cooc' = unionWith (+) cooc cooc'
255 getTrace :: Cooc -> Double
256 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
258 coocToDiago :: Cooc -> Cooc
259 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
261 -- | To build the local cooc matrix of each phylogroup
262 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
263 ngramsToCooc ngrams coocs =
264 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
265 pairs = listToKeys ngrams
266 in filterWithKey (\k _ -> elem k pairs) cooc
273 getGroupId :: PhyloGroup -> PhyloGroupId
274 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
276 idToPrd :: PhyloGroupId -> PhyloPeriodId
277 idToPrd id = (fst . fst) id
279 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
280 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
282 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
283 getPeriodPointers fil g =
285 ToChilds -> g ^. phylo_groupPeriodChilds
286 ToParents -> g ^. phylo_groupPeriodParents
288 filterProximity :: Proximity -> Double -> Double -> Bool
289 filterProximity proximity thr local =
291 WeightedLogJaccard _ -> local >= thr
294 getProximityName :: Proximity -> String
295 getProximityName proximity =
297 WeightedLogJaccard _ -> "WLJaccard"
304 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
305 addPointers fil pty pointers g =
307 TemporalPointer -> case fil of
308 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
309 ToParents -> g & phylo_groupPeriodParents .~ pointers
310 LevelPointer -> case fil of
311 ToChilds -> g & phylo_groupLevelChilds .~ pointers
312 ToParents -> g & phylo_groupLevelParents .~ pointers
315 getPeriodIds :: Phylo -> [(Date,Date)]
316 getPeriodIds phylo = sortOn fst
318 $ phylo ^. phylo_periods
320 getLevelParentId :: PhyloGroup -> PhyloGroupId
321 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
323 getLastLevel :: Phylo -> Level
324 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
326 getLevels :: Phylo -> [Level]
327 getLevels phylo = nub
329 $ keys $ view ( phylo_periods
331 . phylo_periodLevels ) phylo
333 getSeaElevation :: Phylo -> SeaElevation
334 getSeaElevation phylo = seaElevation (getConfig phylo)
337 getConfig :: Phylo -> Config
338 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
341 getRoots :: Phylo -> Vector Ngrams
342 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
344 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
345 phyloToLastBranches phylo = elems
347 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
348 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
350 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
351 getGroupsFromLevel lvl phylo =
352 elems $ view ( phylo_periods
356 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
357 . phylo_levelGroups ) phylo
360 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
361 getGroupsFromLevelPeriods lvl periods phylo =
362 elems $ view ( phylo_periods
364 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
367 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
368 . phylo_levelGroups ) phylo
371 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
372 getGroupsFromPeriods lvl periods =
373 elems $ view ( traverse
376 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
377 . phylo_levelGroups ) periods
380 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
381 updatePhyloGroups lvl m phylo =
386 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
390 let id = getGroupId g
397 traceToPhylo :: Level -> Phylo -> Phylo
398 traceToPhylo lvl phylo =
399 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
400 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
401 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
407 mergeBranchIds :: [[Int]] -> [Int]
408 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
410 -- | 2) find the most Up Left ids in the hierarchy of similarity
411 -- mostUpLeft :: [[Int]] -> [[Int]]
413 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
414 -- inf = (fst . minimum) groupIds
415 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
416 -- | 1) find the most frequent ids
417 mostFreq' :: [[Int]] -> [[Int]]
419 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
420 sup = (fst . maximum) groupIds
421 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
424 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
425 mergeMeta bId groups =
426 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
427 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
430 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
431 groupsToBranches groups =
432 -- | run the related component algorithm
433 let egos = map (\g -> [getGroupId g]
434 ++ (map fst $ g ^. phylo_groupPeriodParents)
435 ++ (map fst $ g ^. phylo_groupPeriodChilds)
436 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
437 graph = relatedComponents egos
438 -- | update each group's branch id
440 let groups' = elems $ restrictKeys groups (Set.fromList ids)
441 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
442 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
444 relatedComponents :: Ord a => [[a]] -> [[a]]
445 relatedComponents graph = foldl' (\acc groups ->
449 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
450 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
452 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
453 toRelatedComponents nodes edges =
454 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
455 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
456 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
459 traceSynchronyEnd :: Phylo -> Phylo
460 traceSynchronyEnd phylo =
461 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
462 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
463 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
466 traceSynchronyStart :: Phylo -> Phylo
467 traceSynchronyStart phylo =
468 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
469 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
470 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
478 getSensibility :: Proximity -> Double
479 getSensibility proxi = case proxi of
480 WeightedLogJaccard s -> s
487 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
488 intersectInit acc lst lst' =
489 if (null lst) || (null lst')
491 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
492 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
495 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
496 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
498 ngramsInBranches :: [[PhyloGroup]] -> [Int]
499 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
502 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
503 traceMatchSuccess thr qua qua' nextBranches =
504 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
505 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
506 <> ",(1.." <> show (length nextBranches) <> ")]"
507 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
508 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
509 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
512 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
513 traceMatchFailure thr qua qua' branches =
514 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
515 <> ",(1.." <> show (length branches) <> ")]"
516 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
517 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
521 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
522 traceMatchNoSplit branches =
523 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
524 <> ",(1.." <> show (length branches) <> ")]"
525 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
526 <> " - unable to split in smaller branches" <> "\n"
530 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
531 traceMatchLimit branches =
532 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
533 <> ",(1.." <> show (length branches) <> ")]"
534 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
535 <> " - unable to increase the threshold above 1" <> "\n"
539 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
540 traceMatchEnd groups =
541 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
542 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
545 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
546 traceTemporalMatching groups =
547 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
550 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
552 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m