2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.PhyloTools where
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37 import qualified Data.Map as Map
43 -- | To print an important message as an IO()
44 printIOMsg :: String -> IO ()
49 <> "-- | " <> msg <> "\n" )
52 -- | To print a comment as an IO()
53 printIOComment :: String -> IO ()
55 putStrLn ( "\n" <> cmt <> "\n" )
63 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
64 roundToStr = printf "%0.*f"
67 countSup :: Double -> [Double] -> Int
68 countSup s l = length $ filter (>s) l
71 dropByIdx :: Int -> [a] -> [a]
72 dropByIdx k l = take k l ++ drop (k+1) l
75 elemIndex' :: Eq a => a -> [a] -> Int
76 elemIndex' e l = case (List.elemIndex e l) of
77 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
81 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
82 commonPrefix lst lst' acc =
83 if (null lst || null lst')
85 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
86 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
95 -- | Is this Ngrams a Foundations Root ?
96 isRoots :: Ngrams -> Vector Ngrams -> Bool
97 isRoots n ns = Vector.elem n ns
99 -- | To transform a list of nrams into a list of foundation's index
100 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
101 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
103 -- | To transform a list of Ngrams Indexes into a Label
104 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
105 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
108 -- | To transform a list of Ngrams Indexes into a list of Text
109 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
110 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
117 -- | To transform a list of periods into a set of Dates
118 periodsToYears :: [(Date,Date)] -> Set Date
119 periodsToYears periods = (Set.fromList . sort . concat)
120 $ map (\(d,d') -> [d..d']) periods
123 findBounds :: [Date] -> (Date,Date)
125 let dates' = sort dates
126 in (head' "findBounds" dates', last' "findBounds" dates')
129 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
130 toPeriods dates p s =
131 let (start,end) = findBounds dates
132 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
133 $ chunkAlong p s [start .. end]
136 -- | Get a regular & ascendante timeScale from a given list of dates
137 toTimeScale :: [Date] -> Int -> [Date]
138 toTimeScale dates step =
139 let (start,end) = findBounds dates
140 in [start, (start + step) .. end]
143 getTimeStep :: TimeUnit -> Int
144 getTimeStep time = case time of
147 getTimePeriod :: TimeUnit -> Int
148 getTimePeriod time = case time of
151 getTimeFrame :: TimeUnit -> Int
152 getTimeFrame time = case time of
160 -- | To find if l' is nested in l
161 isNested :: Eq a => [a] -> [a] -> Bool
164 | length l' > length l = False
165 | (union l l') == l = True
169 -- | To filter Fis with small Support but by keeping non empty Periods
170 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
171 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
172 then keepFilled f (thr - 1) l
176 traceClique :: Map (Date, Date) [PhyloClique] -> String
177 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
179 --------------------------------------
181 cliques = sort $ map (fromIntegral . size . _phyloClique_nodes) $ concat $ elems mFis
182 --------------------------------------
185 traceSupport :: Map (Date, Date) [PhyloClique] -> String
186 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
188 --------------------------------------
190 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
191 --------------------------------------
194 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
195 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
196 <> "Support : " <> (traceSupport mFis) <> "\n"
197 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
205 getCliqueSupport :: Clique -> Int
206 getCliqueSupport unit = case unit of
210 getCliqueSize :: Clique -> Int
211 getCliqueSize unit = case unit of
220 listToCombi' :: [a] -> [(a,a)]
221 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
223 listToEqual' :: Eq a => [a] -> [(a,a)]
224 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
226 listToKeys :: Eq a => [a] -> [(a,a)]
227 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
229 listToMatrix :: [Int] -> Map (Int,Int) Double
230 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
232 listToSeq :: Eq a => [a] -> [(a,a)]
233 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
235 sumCooc :: Cooc -> Cooc -> Cooc
236 sumCooc cooc cooc' = unionWith (+) cooc cooc'
238 getTrace :: Cooc -> Double
239 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
241 coocToDiago :: Cooc -> Cooc
242 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
244 -- | To build the local cooc matrix of each phylogroup
245 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
246 ngramsToCooc ngrams coocs =
247 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
248 pairs = listToKeys ngrams
249 in filterWithKey (\k _ -> elem k pairs) cooc
256 getGroupId :: PhyloGroup -> PhyloGroupId
257 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
259 idToPrd :: PhyloGroupId -> PhyloPeriodId
260 idToPrd id = (fst . fst) id
262 getGroupThr :: PhyloGroup -> Double
263 getGroupThr group = last' "getGroupThr" ((group ^. phylo_groupMeta) ! "breaks")
265 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
266 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
268 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
269 getPeriodPointers fil group =
271 ToChilds -> group ^. phylo_groupPeriodChilds
272 ToParents -> group ^. phylo_groupPeriodParents
274 filterProximity :: Proximity -> Double -> Double -> Bool
275 filterProximity proximity thr local =
277 WeightedLogJaccard _ -> local >= thr
280 getProximityName :: Proximity -> String
281 getProximityName proximity =
283 WeightedLogJaccard _ -> "WLJaccard"
290 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
291 addPointers fil pty pointers group =
293 TemporalPointer -> case fil of
294 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
295 ToParents -> group & phylo_groupPeriodParents .~ pointers
296 LevelPointer -> case fil of
297 ToChilds -> group & phylo_groupLevelChilds .~ pointers
298 ToParents -> group & phylo_groupLevelParents .~ pointers
301 getPeriodIds :: Phylo -> [(Date,Date)]
302 getPeriodIds phylo = sortOn fst
304 $ phylo ^. phylo_periods
306 getLevelParentId :: PhyloGroup -> PhyloGroupId
307 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
309 getLastLevel :: Phylo -> Level
310 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
312 getLevels :: Phylo -> [Level]
313 getLevels phylo = nub
315 $ keys $ view ( phylo_periods
317 . phylo_periodLevels ) phylo
319 getSeaElevation :: Phylo -> SeaElevation
320 getSeaElevation phylo = seaElevation (getConfig phylo)
323 getConfig :: Phylo -> Config
324 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
327 getRoots :: Phylo -> Vector Ngrams
328 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
330 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
331 phyloToLastBranches phylo = elems
333 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
334 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
336 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
337 getGroupsFromLevel lvl phylo =
338 elems $ view ( phylo_periods
342 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
343 . phylo_levelGroups ) phylo
346 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
347 getGroupsFromLevelPeriods lvl periods phylo =
348 elems $ view ( phylo_periods
350 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
353 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
354 . phylo_levelGroups ) phylo
357 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
358 getGroupsFromPeriods lvl periods =
359 elems $ view ( traverse
362 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
363 . phylo_levelGroups ) periods
366 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
367 updatePhyloGroups lvl m phylo =
372 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
376 let id = getGroupId group
383 traceToPhylo :: Level -> Phylo -> Phylo
384 traceToPhylo lvl phylo =
385 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
386 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
387 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
393 relatedComponents :: Ord a => [[a]] -> [[a]]
394 relatedComponents graph = foldl' (\acc groups ->
398 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
399 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
402 traceSynchronyEnd :: Phylo -> Phylo
403 traceSynchronyEnd phylo =
404 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
405 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
406 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
409 traceSynchronyStart :: Phylo -> Phylo
410 traceSynchronyStart phylo =
411 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
412 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
413 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
421 getSensibility :: Proximity -> Double
422 getSensibility proxi = case proxi of
423 WeightedLogJaccard s -> s
430 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
431 intersectInit acc lst lst' =
432 if (null lst) || (null lst')
434 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
435 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
438 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
439 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
441 ngramsInBranches :: [[PhyloGroup]] -> [Int]
442 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
445 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
446 traceMatchSuccess thr qua qua' nextBranches =
447 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
448 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
449 <> ",(1.." <> show (length nextBranches) <> ")]"
450 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
451 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
452 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
455 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
456 traceMatchFailure thr qua qua' branches =
457 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
458 <> ",(1.." <> show (length branches) <> ")]"
459 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
460 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
464 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
465 traceMatchNoSplit branches =
466 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
467 <> ",(1.." <> show (length branches) <> ")]"
468 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
469 <> " - unable to split in smaller branches" <> "\n"
473 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
474 traceMatchLimit branches =
475 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
476 <> ",(1.." <> show (length branches) <> ")]"
477 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
478 <> " - unable to increase the threshold above 1" <> "\n"
482 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
483 traceMatchEnd groups =
484 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
485 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
488 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
489 traceTemporalMatching groups =
490 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
493 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
495 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m