2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.PhyloTools where
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37 import qualified Data.Map as Map
43 -- | To print an important message as an IO()
44 printIOMsg :: String -> IO ()
49 <> "-- | " <> msg <> "\n" )
52 -- | To print a comment as an IO()
53 printIOComment :: String -> IO ()
55 putStrLn ( "\n" <> cmt <> "\n" )
63 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
64 roundToStr = printf "%0.*f"
67 countSup :: Double -> [Double] -> Int
68 countSup s l = length $ filter (>s) l
71 dropByIdx :: Int -> [a] -> [a]
72 dropByIdx k l = take k l ++ drop (k+1) l
75 elemIndex' :: Eq a => a -> [a] -> Int
76 elemIndex' e l = case (List.elemIndex e l) of
77 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
81 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
82 commonPrefix lst lst' acc =
83 if (null lst || null lst')
85 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
86 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
95 -- | Is this Ngrams a Foundations Root ?
96 isRoots :: Ngrams -> Vector Ngrams -> Bool
97 isRoots n ns = Vector.elem n ns
99 -- | To transform a list of nrams into a list of foundation's index
100 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
101 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
103 -- | To transform a list of Ngrams Indexes into a Label
104 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
105 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
108 -- | To transform a list of Ngrams Indexes into a list of Text
109 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
110 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
117 -- | To transform a list of periods into a set of Dates
118 periodsToYears :: [(Date,Date)] -> Set Date
119 periodsToYears periods = (Set.fromList . sort . concat)
120 $ map (\(d,d') -> [d..d']) periods
123 findBounds :: [Date] -> (Date,Date)
125 let dates' = sort dates
126 in (head' "findBounds" dates', last' "findBounds" dates')
129 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
130 toPeriods dates p s =
131 let (start,end) = findBounds dates
132 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
133 $ chunkAlong p s [start .. end]
136 -- | Get a regular & ascendante timeScale from a given list of dates
137 toTimeScale :: [Date] -> Int -> [Date]
138 toTimeScale dates step =
139 let (start,end) = findBounds dates
140 in [start, (start + step) .. end]
143 getTimeStep :: TimeUnit -> Int
144 getTimeStep time = case time of
147 getTimePeriod :: TimeUnit -> Int
148 getTimePeriod time = case time of
151 getTimeFrame :: TimeUnit -> Int
152 getTimeFrame time = case time of
160 -- | To find if l' is nested in l
161 isNested :: Eq a => [a] -> [a] -> Bool
164 | length l' > length l = False
165 | (union l l') == l = True
169 -- | To filter Fis with small Support but by keeping non empty Periods
170 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
171 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
172 then keepFilled f (thr - 1) l
176 traceClique :: Map (Date, Date) [PhyloClique] -> String
177 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
179 --------------------------------------
181 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
182 --------------------------------------
185 traceSupport :: Map (Date, Date) [PhyloClique] -> String
186 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
188 --------------------------------------
190 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
191 --------------------------------------
194 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
195 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
196 <> "Support : " <> (traceSupport mFis) <> "\n"
197 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
205 getCliqueSupport :: Clique -> Int
206 getCliqueSupport unit = case unit of
210 getCliqueSize :: Clique -> Int
211 getCliqueSize unit = case unit of
220 listToCombi' :: [a] -> [(a,a)]
221 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
223 listToEqual' :: Eq a => [a] -> [(a,a)]
224 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
226 listToKeys :: Eq a => [a] -> [(a,a)]
227 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
229 listToMatrix :: [Int] -> Map (Int,Int) Double
230 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
232 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
233 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
235 listToSeq :: Eq a => [a] -> [(a,a)]
236 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
238 sumCooc :: Cooc -> Cooc -> Cooc
239 sumCooc cooc cooc' = unionWith (+) cooc cooc'
241 getTrace :: Cooc -> Double
242 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
244 coocToDiago :: Cooc -> Cooc
245 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
247 -- | To build the local cooc matrix of each phylogroup
248 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
249 ngramsToCooc ngrams coocs =
250 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
251 pairs = listToKeys ngrams
252 in filterWithKey (\k _ -> elem k pairs) cooc
259 getGroupId :: PhyloGroup -> PhyloGroupId
260 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
262 idToPrd :: PhyloGroupId -> PhyloPeriodId
263 idToPrd id = (fst . fst) id
265 getGroupThr :: PhyloGroup -> Double
266 getGroupThr group = last' "getGroupThr" ((group ^. phylo_groupMeta) ! "breaks")
268 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
269 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
271 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
272 getPeriodPointers fil group =
274 ToChilds -> group ^. phylo_groupPeriodChilds
275 ToParents -> group ^. phylo_groupPeriodParents
277 filterProximity :: Proximity -> Double -> Double -> Bool
278 filterProximity proximity thr local =
280 WeightedLogJaccard _ -> local >= thr
283 getProximityName :: Proximity -> String
284 getProximityName proximity =
286 WeightedLogJaccard _ -> "WLJaccard"
293 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
294 addPointers fil pty pointers group =
296 TemporalPointer -> case fil of
297 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
298 ToParents -> group & phylo_groupPeriodParents .~ pointers
299 LevelPointer -> case fil of
300 ToChilds -> group & phylo_groupLevelChilds .~ pointers
301 ToParents -> group & phylo_groupLevelParents .~ pointers
304 getPeriodIds :: Phylo -> [(Date,Date)]
305 getPeriodIds phylo = sortOn fst
307 $ phylo ^. phylo_periods
309 getLevelParentId :: PhyloGroup -> PhyloGroupId
310 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
312 getLastLevel :: Phylo -> Level
313 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
315 getLevels :: Phylo -> [Level]
316 getLevels phylo = nub
318 $ keys $ view ( phylo_periods
320 . phylo_periodLevels ) phylo
322 getSeaElevation :: Phylo -> SeaElevation
323 getSeaElevation phylo = seaElevation (getConfig phylo)
326 getConfig :: Phylo -> Config
327 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
330 getRoots :: Phylo -> Vector Ngrams
331 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
333 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
334 phyloToLastBranches phylo = elems
336 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
337 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
339 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
340 getGroupsFromLevel lvl phylo =
341 elems $ view ( phylo_periods
345 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
346 . phylo_levelGroups ) phylo
349 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
350 getGroupsFromLevelPeriods lvl periods phylo =
351 elems $ view ( phylo_periods
353 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
356 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
357 . phylo_levelGroups ) phylo
360 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
361 getGroupsFromPeriods lvl periods =
362 elems $ view ( traverse
365 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
366 . phylo_levelGroups ) periods
369 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
370 updatePhyloGroups lvl m phylo =
375 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
379 let id = getGroupId group
386 traceToPhylo :: Level -> Phylo -> Phylo
387 traceToPhylo lvl phylo =
388 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
389 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
390 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
396 relatedComponents :: Ord a => [[a]] -> [[a]]
397 relatedComponents graph = foldl' (\acc groups ->
401 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
402 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
404 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
405 toRelatedComponents nodes edges =
406 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
407 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
408 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
411 traceSynchronyEnd :: Phylo -> Phylo
412 traceSynchronyEnd phylo =
413 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
414 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
415 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
418 traceSynchronyStart :: Phylo -> Phylo
419 traceSynchronyStart phylo =
420 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
421 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
422 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
430 getSensibility :: Proximity -> Double
431 getSensibility proxi = case proxi of
432 WeightedLogJaccard s -> s
439 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
440 intersectInit acc lst lst' =
441 if (null lst) || (null lst')
443 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
444 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
447 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
448 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
450 ngramsInBranches :: [[PhyloGroup]] -> [Int]
451 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
454 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
455 traceMatchSuccess thr qua qua' nextBranches =
456 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
457 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
458 <> ",(1.." <> show (length nextBranches) <> ")]"
459 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
460 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
461 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
464 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
465 traceMatchFailure thr qua qua' branches =
466 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
467 <> ",(1.." <> show (length branches) <> ")]"
468 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
469 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
473 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
474 traceMatchNoSplit branches =
475 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
476 <> ",(1.." <> show (length branches) <> ")]"
477 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
478 <> " - unable to split in smaller branches" <> "\n"
482 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
483 traceMatchLimit branches =
484 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
485 <> ",(1.." <> show (length branches) <> ")]"
486 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
487 <> " - unable to increase the threshold above 1" <> "\n"
491 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
492 traceMatchEnd groups =
493 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
494 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
497 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
498 traceTemporalMatching groups =
499 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
502 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
504 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m