2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Viz.Phylo.PhyloTools where
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, maximum, group)
17 import Data.Set (Set, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
19 import Data.String (String)
20 import Data.Text (Text)
22 import Gargantext.Prelude
23 import Gargantext.Viz.AdaptativePhylo
27 import Debug.Trace (trace)
28 import Control.Lens hiding (Level)
30 import qualified Data.Vector as Vector
31 import qualified Data.List as List
32 import qualified Data.Set as Set
33 import qualified Data.Map as Map
34 import qualified Data.Text as Text
40 -- | To print an important message as an IO()
41 printIOMsg :: String -> IO ()
46 <> "-- | " <> msg <> "\n" )
49 -- | To print a comment as an IO()
50 printIOComment :: String -> IO ()
52 putStrLn ( "\n" <> cmt <> "\n" )
60 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
61 roundToStr = printf "%0.*f"
64 countSup :: Double -> [Double] -> Int
65 countSup s l = length $ filter (>s) l
68 dropByIdx :: Int -> [a] -> [a]
69 dropByIdx k l = take k l ++ drop (k+1) l
72 elemIndex' :: Eq a => a -> [a] -> Int
73 elemIndex' e l = case (List.elemIndex e l) of
74 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
78 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
79 commonPrefix lst lst' acc =
80 if (null lst || null lst')
82 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
83 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
92 -- | Is this Ngrams a Foundations Root ?
93 isRoots :: Ngrams -> Vector Ngrams -> Bool
94 isRoots n ns = Vector.elem n ns
96 -- | To transform a list of nrams into a list of foundation's index
97 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
98 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
100 -- | To transform a list of Ngrams Indexes into a Label
101 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
102 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
104 idxToLabel :: [Int] -> String
105 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
107 idxToLabel' :: [Double] -> String
108 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
110 -- | To transform a list of Ngrams Indexes into a list of Text
111 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
112 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
119 -- | To transform a list of periods into a set of Dates
120 periodsToYears :: [(Date,Date)] -> Set Date
121 periodsToYears periods = (Set.fromList . sort . concat)
122 $ map (\(d,d') -> [d..d']) periods
125 findBounds :: [Date] -> (Date,Date)
127 let dates' = sort dates
128 in (head' "findBounds" dates', last' "findBounds" dates')
131 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
132 toPeriods dates p s =
133 let (start,end) = findBounds dates
134 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
135 $ chunkAlong p s [start .. end]
138 -- | Get a regular & ascendante timeScale from a given list of dates
139 toTimeScale :: [Date] -> Int -> [Date]
140 toTimeScale dates step =
141 let (start,end) = findBounds dates
142 in [start, (start + step) .. end]
145 getTimeStep :: TimeUnit -> Int
146 getTimeStep time = case time of
149 getTimePeriod :: TimeUnit -> Int
150 getTimePeriod time = case time of
153 getTimeFrame :: TimeUnit -> Int
154 getTimeFrame time = case time of
162 -- | To find if l' is nested in l
163 isNested :: Eq a => [a] -> [a] -> Bool
166 | length l' > length l = False
167 | (union l l') == l = True
171 -- | To filter Fis with small Support but by keeping non empty Periods
172 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
173 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
174 then keepFilled f (thr - 1) l
178 traceClique :: Map (Date, Date) [PhyloClique] -> String
179 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
181 --------------------------------------
183 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
184 --------------------------------------
187 traceSupport :: Map (Date, Date) [PhyloClique] -> String
188 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
190 --------------------------------------
192 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
193 --------------------------------------
196 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
197 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
198 <> "Support : " <> (traceSupport mFis) <> "\n"
199 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
207 getCliqueSupport :: Clique -> Int
208 getCliqueSupport unit = case unit of
212 getCliqueSize :: Clique -> Int
213 getCliqueSize unit = case unit of
222 listToCombi' :: [a] -> [(a,a)]
223 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
225 listToEqual' :: Eq a => [a] -> [(a,a)]
226 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
228 listToKeys :: Eq a => [a] -> [(a,a)]
229 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
231 listToMatrix :: [Int] -> Map (Int,Int) Double
232 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
234 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
235 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
237 listToSeq :: Eq a => [a] -> [(a,a)]
238 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
240 sumCooc :: Cooc -> Cooc -> Cooc
241 sumCooc cooc cooc' = unionWith (+) cooc cooc'
243 getTrace :: Cooc -> Double
244 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
246 coocToDiago :: Cooc -> Cooc
247 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
249 -- | To build the local cooc matrix of each phylogroup
250 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
251 ngramsToCooc ngrams coocs =
252 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
253 pairs = listToKeys ngrams
254 in filterWithKey (\k _ -> elem k pairs) cooc
261 getGroupId :: PhyloGroup -> PhyloGroupId
262 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
264 idToPrd :: PhyloGroupId -> PhyloPeriodId
265 idToPrd id = (fst . fst) id
267 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
268 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
270 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
271 getPeriodPointers fil g =
273 ToChilds -> g ^. phylo_groupPeriodChilds
274 ToParents -> g ^. phylo_groupPeriodParents
276 filterProximity :: Proximity -> Double -> Double -> Bool
277 filterProximity proximity thr local =
279 WeightedLogJaccard _ -> local >= thr
282 getProximityName :: Proximity -> String
283 getProximityName proximity =
285 WeightedLogJaccard _ -> "WLJaccard"
292 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
293 addPointers fil pty pointers g =
295 TemporalPointer -> case fil of
296 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
297 ToParents -> g & phylo_groupPeriodParents .~ pointers
298 LevelPointer -> case fil of
299 ToChilds -> g & phylo_groupLevelChilds .~ pointers
300 ToParents -> g & phylo_groupLevelParents .~ pointers
303 getPeriodIds :: Phylo -> [(Date,Date)]
304 getPeriodIds phylo = sortOn fst
306 $ phylo ^. phylo_periods
308 getLevelParentId :: PhyloGroup -> PhyloGroupId
309 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
311 getLastLevel :: Phylo -> Level
312 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
314 getLevels :: Phylo -> [Level]
315 getLevels phylo = nub
317 $ keys $ view ( phylo_periods
319 . phylo_periodLevels ) phylo
321 getSeaElevation :: Phylo -> SeaElevation
322 getSeaElevation phylo = seaElevation (getConfig phylo)
325 getConfig :: Phylo -> Config
326 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
329 getRoots :: Phylo -> Vector Ngrams
330 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
332 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
333 phyloToLastBranches phylo = elems
335 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
336 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
338 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
339 getGroupsFromLevel lvl phylo =
340 elems $ view ( phylo_periods
344 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
345 . phylo_levelGroups ) phylo
348 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
349 getGroupsFromLevelPeriods lvl periods phylo =
350 elems $ view ( phylo_periods
352 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
355 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
356 . phylo_levelGroups ) phylo
359 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
360 getGroupsFromPeriods lvl periods =
361 elems $ view ( traverse
364 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
365 . phylo_levelGroups ) periods
368 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
369 updatePhyloGroups lvl m phylo =
374 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
378 let id = getGroupId g
385 traceToPhylo :: Level -> Phylo -> Phylo
386 traceToPhylo lvl phylo =
387 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
388 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
389 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
395 mergeBranchIds :: [[Int]] -> [Int]
396 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
398 -- | 2) find the most Up Left ids in the hierarchy of similarity
399 -- mostUpLeft :: [[Int]] -> [[Int]]
401 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
402 -- inf = (fst . minimum) groupIds
403 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
404 -- | 1) find the most frequent ids
405 mostFreq' :: [[Int]] -> [[Int]]
407 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
408 sup = (fst . maximum) groupIds
409 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
412 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
413 mergeMeta bId groups =
414 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
415 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
418 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
419 groupsToBranches groups =
420 -- | run the related component algorithm
421 let egos = map (\g -> [getGroupId g]
422 ++ (map fst $ g ^. phylo_groupPeriodParents)
423 ++ (map fst $ g ^. phylo_groupPeriodChilds)
424 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
425 graph = relatedComponents egos
426 -- | update each group's branch id
428 let groups' = elems $ restrictKeys groups (Set.fromList ids)
429 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
430 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
432 relatedComponents :: Ord a => [[a]] -> [[a]]
433 relatedComponents graph = foldl' (\acc groups ->
437 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
438 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
440 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
441 toRelatedComponents nodes edges =
442 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
443 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
444 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
447 traceSynchronyEnd :: Phylo -> Phylo
448 traceSynchronyEnd phylo =
449 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
450 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
451 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
454 traceSynchronyStart :: Phylo -> Phylo
455 traceSynchronyStart phylo =
456 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
457 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
458 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
466 getSensibility :: Proximity -> Double
467 getSensibility proxi = case proxi of
468 WeightedLogJaccard s -> s
475 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
476 intersectInit acc lst lst' =
477 if (null lst) || (null lst')
479 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
480 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
483 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
484 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
486 ngramsInBranches :: [[PhyloGroup]] -> [Int]
487 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
490 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
491 traceMatchSuccess thr qua qua' nextBranches =
492 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
493 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
494 <> ",(1.." <> show (length nextBranches) <> ")]"
495 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
496 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
497 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
500 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
501 traceMatchFailure thr qua qua' branches =
502 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
503 <> ",(1.." <> show (length branches) <> ")]"
504 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
505 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
509 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
510 traceMatchNoSplit branches =
511 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
512 <> ",(1.." <> show (length branches) <> ")]"
513 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
514 <> " - unable to split in smaller branches" <> "\n"
518 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
519 traceMatchLimit branches =
520 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
521 <> ",(1.." <> show (length branches) <> ")]"
522 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
523 <> " - unable to increase the threshold above 1" <> "\n"
527 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
528 traceMatchEnd groups =
529 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
530 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
533 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
534 traceTemporalMatching groups =
535 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
538 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
540 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m