2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.PhyloTools where
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, maximum, group)
21 import Data.Set (Set, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37 import qualified Data.Map as Map
43 -- | To print an important message as an IO()
44 printIOMsg :: String -> IO ()
49 <> "-- | " <> msg <> "\n" )
52 -- | To print a comment as an IO()
53 printIOComment :: String -> IO ()
55 putStrLn ( "\n" <> cmt <> "\n" )
63 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
64 roundToStr = printf "%0.*f"
67 countSup :: Double -> [Double] -> Int
68 countSup s l = length $ filter (>s) l
71 dropByIdx :: Int -> [a] -> [a]
72 dropByIdx k l = take k l ++ drop (k+1) l
75 elemIndex' :: Eq a => a -> [a] -> Int
76 elemIndex' e l = case (List.elemIndex e l) of
77 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
81 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
82 commonPrefix lst lst' acc =
83 if (null lst || null lst')
85 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
86 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
95 -- | Is this Ngrams a Foundations Root ?
96 isRoots :: Ngrams -> Vector Ngrams -> Bool
97 isRoots n ns = Vector.elem n ns
99 -- | To transform a list of nrams into a list of foundation's index
100 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
101 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
103 -- | To transform a list of Ngrams Indexes into a Label
104 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
105 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
108 -- | To transform a list of Ngrams Indexes into a list of Text
109 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
110 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
117 -- | To transform a list of periods into a set of Dates
118 periodsToYears :: [(Date,Date)] -> Set Date
119 periodsToYears periods = (Set.fromList . sort . concat)
120 $ map (\(d,d') -> [d..d']) periods
123 findBounds :: [Date] -> (Date,Date)
125 let dates' = sort dates
126 in (head' "findBounds" dates', last' "findBounds" dates')
129 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
130 toPeriods dates p s =
131 let (start,end) = findBounds dates
132 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
133 $ chunkAlong p s [start .. end]
136 -- | Get a regular & ascendante timeScale from a given list of dates
137 toTimeScale :: [Date] -> Int -> [Date]
138 toTimeScale dates step =
139 let (start,end) = findBounds dates
140 in [start, (start + step) .. end]
143 getTimeStep :: TimeUnit -> Int
144 getTimeStep time = case time of
147 getTimePeriod :: TimeUnit -> Int
148 getTimePeriod time = case time of
151 getTimeFrame :: TimeUnit -> Int
152 getTimeFrame time = case time of
160 -- | To find if l' is nested in l
161 isNested :: Eq a => [a] -> [a] -> Bool
164 | length l' > length l = False
165 | (union l l') == l = True
169 -- | To filter Fis with small Support but by keeping non empty Periods
170 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
171 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
172 then keepFilled f (thr - 1) l
176 traceClique :: Map (Date, Date) [PhyloClique] -> String
177 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
179 --------------------------------------
181 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
182 --------------------------------------
185 traceSupport :: Map (Date, Date) [PhyloClique] -> String
186 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
188 --------------------------------------
190 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
191 --------------------------------------
194 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
195 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
196 <> "Support : " <> (traceSupport mFis) <> "\n"
197 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
205 getCliqueSupport :: Clique -> Int
206 getCliqueSupport unit = case unit of
210 getCliqueSize :: Clique -> Int
211 getCliqueSize unit = case unit of
220 listToCombi' :: [a] -> [(a,a)]
221 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
223 listToEqual' :: Eq a => [a] -> [(a,a)]
224 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
226 listToKeys :: Eq a => [a] -> [(a,a)]
227 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
229 listToMatrix :: [Int] -> Map (Int,Int) Double
230 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
232 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
233 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
235 listToSeq :: Eq a => [a] -> [(a,a)]
236 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
238 sumCooc :: Cooc -> Cooc -> Cooc
239 sumCooc cooc cooc' = unionWith (+) cooc cooc'
241 getTrace :: Cooc -> Double
242 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
244 coocToDiago :: Cooc -> Cooc
245 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
247 -- | To build the local cooc matrix of each phylogroup
248 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
249 ngramsToCooc ngrams coocs =
250 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
251 pairs = listToKeys ngrams
252 in filterWithKey (\k _ -> elem k pairs) cooc
259 getGroupId :: PhyloGroup -> PhyloGroupId
260 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
262 idToPrd :: PhyloGroupId -> PhyloPeriodId
263 idToPrd id = (fst . fst) id
265 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
266 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
268 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
269 getPeriodPointers fil g =
271 ToChilds -> g ^. phylo_groupPeriodChilds
272 ToParents -> g ^. phylo_groupPeriodParents
274 filterProximity :: Proximity -> Double -> Double -> Bool
275 filterProximity proximity thr local =
277 WeightedLogJaccard _ -> local >= thr
280 getProximityName :: Proximity -> String
281 getProximityName proximity =
283 WeightedLogJaccard _ -> "WLJaccard"
290 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
291 addPointers fil pty pointers g =
293 TemporalPointer -> case fil of
294 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
295 ToParents -> g & phylo_groupPeriodParents .~ pointers
296 LevelPointer -> case fil of
297 ToChilds -> g & phylo_groupLevelChilds .~ pointers
298 ToParents -> g & phylo_groupLevelParents .~ pointers
301 getPeriodIds :: Phylo -> [(Date,Date)]
302 getPeriodIds phylo = sortOn fst
304 $ phylo ^. phylo_periods
306 getLevelParentId :: PhyloGroup -> PhyloGroupId
307 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
309 getLastLevel :: Phylo -> Level
310 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
312 getLevels :: Phylo -> [Level]
313 getLevels phylo = nub
315 $ keys $ view ( phylo_periods
317 . phylo_periodLevels ) phylo
319 getSeaElevation :: Phylo -> SeaElevation
320 getSeaElevation phylo = seaElevation (getConfig phylo)
323 getConfig :: Phylo -> Config
324 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
327 getRoots :: Phylo -> Vector Ngrams
328 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
330 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
331 phyloToLastBranches phylo = elems
333 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
334 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
336 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
337 getGroupsFromLevel lvl phylo =
338 elems $ view ( phylo_periods
342 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
343 . phylo_levelGroups ) phylo
346 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
347 getGroupsFromLevelPeriods lvl periods phylo =
348 elems $ view ( phylo_periods
350 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
353 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
354 . phylo_levelGroups ) phylo
357 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
358 getGroupsFromPeriods lvl periods =
359 elems $ view ( traverse
362 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
363 . phylo_levelGroups ) periods
366 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
367 updatePhyloGroups lvl m phylo =
372 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
376 let id = getGroupId g
383 traceToPhylo :: Level -> Phylo -> Phylo
384 traceToPhylo lvl phylo =
385 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
386 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
387 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
393 mergeBranchIds :: [[Int]] -> [Int]
394 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
396 -- | 2) find the most Up Left ids in the hierarchy of similarity
397 -- mostUpLeft :: [[Int]] -> [[Int]]
399 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
400 -- inf = (fst . minimum) groupIds
401 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
402 -- | 1) find the most frequent ids
403 mostFreq' :: [[Int]] -> [[Int]]
405 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
406 sup = (fst . maximum) groupIds
407 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
410 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
411 mergeMeta bId groups =
412 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
413 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
416 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
417 groupsToBranches groups =
418 -- | run the related component algorithm
419 let egos = map (\g -> [getGroupId g]
420 ++ (map fst $ g ^. phylo_groupPeriodParents)
421 ++ (map fst $ g ^. phylo_groupPeriodChilds)
422 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
423 graph = relatedComponents egos
424 -- | update each group's branch id
426 let groups' = elems $ restrictKeys groups (Set.fromList ids)
427 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
428 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
430 relatedComponents :: Ord a => [[a]] -> [[a]]
431 relatedComponents graph = foldl' (\acc groups ->
435 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
436 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
438 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
439 toRelatedComponents nodes edges =
440 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
441 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
442 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
445 traceSynchronyEnd :: Phylo -> Phylo
446 traceSynchronyEnd phylo =
447 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
448 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
449 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
452 traceSynchronyStart :: Phylo -> Phylo
453 traceSynchronyStart phylo =
454 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
455 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
456 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
464 getSensibility :: Proximity -> Double
465 getSensibility proxi = case proxi of
466 WeightedLogJaccard s -> s
473 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
474 intersectInit acc lst lst' =
475 if (null lst) || (null lst')
477 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
478 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
481 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
482 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
484 ngramsInBranches :: [[PhyloGroup]] -> [Int]
485 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
488 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
489 traceMatchSuccess thr qua qua' nextBranches =
490 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
491 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
492 <> ",(1.." <> show (length nextBranches) <> ")]"
493 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
494 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
495 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
498 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
499 traceMatchFailure thr qua qua' branches =
500 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
501 <> ",(1.." <> show (length branches) <> ")]"
502 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
503 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
507 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
508 traceMatchNoSplit branches =
509 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
510 <> ",(1.." <> show (length branches) <> ")]"
511 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
512 <> " - unable to split in smaller branches" <> "\n"
516 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
517 traceMatchLimit branches =
518 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
519 <> ",(1.." <> show (length branches) <> ")]"
520 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
521 <> " - unable to increase the threshold above 1" <> "\n"
525 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
526 traceMatchEnd groups =
527 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
528 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
531 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
532 traceTemporalMatching groups =
533 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
536 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
538 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m