2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.PhyloTools where
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37 import qualified Data.Map as Map
43 -- | To print an important message as an IO()
44 printIOMsg :: String -> IO ()
49 <> "-- | " <> msg <> "\n" )
52 -- | To print a comment as an IO()
53 printIOComment :: String -> IO ()
55 putStrLn ( "\n" <> cmt <> "\n" )
63 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
64 roundToStr = printf "%0.*f"
67 countSup :: Double -> [Double] -> Int
68 countSup s l = length $ filter (>s) l
71 dropByIdx :: Int -> [a] -> [a]
72 dropByIdx k l = take k l ++ drop (k+1) l
75 elemIndex' :: Eq a => a -> [a] -> Int
76 elemIndex' e l = case (List.elemIndex e l) of
77 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
81 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
82 commonPrefix lst lst' acc =
83 if (null lst || null lst')
85 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
86 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
95 -- | Is this Ngrams a Foundations Root ?
96 isRoots :: Ngrams -> Vector Ngrams -> Bool
97 isRoots n ns = Vector.elem n ns
99 -- | To transform a list of nrams into a list of foundation's index
100 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
101 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
103 -- | To transform a list of Ngrams Indexes into a Label
104 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
105 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
108 -- | To transform a list of Ngrams Indexes into a list of Text
109 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
110 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
117 -- | To transform a list of periods into a set of Dates
118 periodsToYears :: [(Date,Date)] -> Set Date
119 periodsToYears periods = (Set.fromList . sort . concat)
120 $ map (\(d,d') -> [d..d']) periods
123 findBounds :: [Date] -> (Date,Date)
125 let dates' = sort dates
126 in (head' "findBounds" dates', last' "findBounds" dates')
129 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
130 toPeriods dates p s =
131 let (start,end) = findBounds dates
132 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
133 $ chunkAlong p s [start .. end]
136 -- | Get a regular & ascendante timeScale from a given list of dates
137 toTimeScale :: [Date] -> Int -> [Date]
138 toTimeScale dates step =
139 let (start,end) = findBounds dates
140 in [start, (start + step) .. end]
143 getTimeStep :: TimeUnit -> Int
144 getTimeStep time = case time of
147 getTimePeriod :: TimeUnit -> Int
148 getTimePeriod time = case time of
151 getTimeFrame :: TimeUnit -> Int
152 getTimeFrame time = case time of
160 -- | To find if l' is nested in l
161 isNested :: Eq a => [a] -> [a] -> Bool
164 | length l' > length l = False
165 | (union l l') == l = True
169 -- | To filter Fis with small Support but by keeping non empty Periods
170 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
171 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
172 then keepFilled f (thr - 1) l
176 traceClique :: Map (Date, Date) [PhyloClique] -> String
177 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
179 --------------------------------------
181 cliques = sort $ map (fromIntegral . size . _phyloClique_nodes) $ concat $ elems mFis
182 --------------------------------------
185 traceSupport :: Map (Date, Date) [PhyloClique] -> String
186 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
188 --------------------------------------
190 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
191 --------------------------------------
194 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
195 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
196 <> "Support : " <> (traceSupport mFis) <> "\n"
197 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
205 getCliqueSupport :: Clique -> Int
206 getCliqueSupport unit = case unit of
210 getCliqueSize :: Clique -> Int
211 getCliqueSize unit = case unit of
220 listToCombi' :: [a] -> [(a,a)]
221 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
223 listToEqual' :: Eq a => [a] -> [(a,a)]
224 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
226 listToKeys :: Eq a => [a] -> [(a,a)]
227 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
229 listToMatrix :: [Int] -> Map (Int,Int) Double
230 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
232 listToSeq :: Eq a => [a] -> [(a,a)]
233 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
235 sumCooc :: Cooc -> Cooc -> Cooc
236 sumCooc cooc cooc' = unionWith (+) cooc cooc'
238 getTrace :: Cooc -> Double
239 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
241 coocToDiago :: Cooc -> Cooc
242 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
244 -- | To build the local cooc matrix of each phylogroup
245 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
246 ngramsToCooc ngrams coocs =
247 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
248 pairs = listToKeys ngrams
249 in filterWithKey (\k _ -> elem k pairs) cooc
256 getGroupId :: PhyloGroup -> PhyloGroupId
257 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
259 idToPrd :: PhyloGroupId -> PhyloPeriodId
260 idToPrd id = (fst . fst) id
262 getGroupThr :: PhyloGroup -> Double
263 getGroupThr group = last' "getGroupThr" ((group ^. phylo_groupMeta) ! "breaks")
265 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
266 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
268 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
269 getPeriodPointers fil group =
271 ToChilds -> group ^. phylo_groupPeriodChilds
272 ToParents -> group ^. phylo_groupPeriodParents
274 filterProximity :: Proximity -> Double -> Double -> Bool
275 filterProximity proximity thr local =
277 WeightedLogJaccard _ -> local >= thr
280 getProximityName :: Proximity -> String
281 getProximityName proximity =
283 WeightedLogJaccard _ -> "WLJaccard"
290 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
291 addPointers fil pty pointers group =
293 TemporalPointer -> case fil of
294 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
295 ToParents -> group & phylo_groupPeriodParents .~ pointers
296 LevelPointer -> case fil of
297 ToChilds -> group & phylo_groupLevelChilds .~ pointers
298 ToParents -> group & phylo_groupLevelParents .~ pointers
301 getPeriodIds :: Phylo -> [(Date,Date)]
302 getPeriodIds phylo = sortOn fst
304 $ phylo ^. phylo_periods
306 getLevelParentId :: PhyloGroup -> PhyloGroupId
307 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
309 getLastLevel :: Phylo -> Level
310 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
312 getLevels :: Phylo -> [Level]
313 getLevels phylo = nub
315 $ keys $ view ( phylo_periods
317 . phylo_periodLevels ) phylo
319 getSeaElevation :: Phylo -> SeaElevation
320 getSeaElevation phylo = seaElevation (getConfig phylo)
323 getConfig :: Phylo -> Config
324 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
327 getRoots :: Phylo -> Vector Ngrams
328 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
330 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
331 phyloToLastBranches phylo = elems
333 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
334 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
336 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
337 getGroupsFromLevel lvl phylo =
338 elems $ view ( phylo_periods
342 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
343 . phylo_levelGroups ) phylo
346 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
347 getGroupsFromLevelPeriods lvl periods phylo =
348 elems $ view ( phylo_periods
350 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
353 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
354 . phylo_levelGroups ) phylo
357 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
358 getGroupsFromPeriods lvl periods =
359 elems $ view ( traverse
362 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
363 . phylo_levelGroups ) periods
366 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
367 updatePhyloGroups lvl m phylo =
372 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
376 let id = getGroupId group
383 traceToPhylo :: Level -> Phylo -> Phylo
384 traceToPhylo lvl phylo =
385 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
386 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
387 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
393 relatedComponents :: Ord a => [[a]] -> [[a]]
394 relatedComponents graph = foldl' (\acc groups ->
398 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
399 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
401 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
402 toRelatedComponents nodes edges =
403 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
404 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
405 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
408 traceSynchronyEnd :: Phylo -> Phylo
409 traceSynchronyEnd phylo =
410 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
411 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
412 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
415 traceSynchronyStart :: Phylo -> Phylo
416 traceSynchronyStart phylo =
417 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
418 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
419 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
427 getSensibility :: Proximity -> Double
428 getSensibility proxi = case proxi of
429 WeightedLogJaccard s -> s
436 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
437 intersectInit acc lst lst' =
438 if (null lst) || (null lst')
440 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
441 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
444 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
445 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
447 ngramsInBranches :: [[PhyloGroup]] -> [Int]
448 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
451 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
452 traceMatchSuccess thr qua qua' nextBranches =
453 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
454 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
455 <> ",(1.." <> show (length nextBranches) <> ")]"
456 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
457 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
458 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
461 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
462 traceMatchFailure thr qua qua' branches =
463 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
464 <> ",(1.." <> show (length branches) <> ")]"
465 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
466 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
470 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
471 traceMatchNoSplit branches =
472 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
473 <> ",(1.." <> show (length branches) <> ")]"
474 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
475 <> " - unable to split in smaller branches" <> "\n"
479 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
480 traceMatchLimit branches =
481 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
482 <> ",(1.." <> show (length branches) <> ")]"
483 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
484 <> " - unable to increase the threshold above 1" <> "\n"
488 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
489 traceMatchEnd groups =
490 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
491 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
494 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
495 traceTemporalMatching groups =
496 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
499 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
501 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m