2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
15 import Control.Lens hiding (Level)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group)
17 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
18 import Data.Set (Set, disjoint)
19 import Data.String (String)
20 import Data.Text (Text,unpack)
21 import Data.Vector (Vector, elemIndex)
22 import Debug.Trace (trace)
23 import Gargantext.Core.Viz.Phylo
24 import Gargantext.Prelude
25 import Prelude (floor,read)
27 import qualified Data.List as List
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30 import qualified Data.Text as Text
31 import qualified Data.Vector as Vector
37 -- | To print an important message as an IO()
38 printIOMsg :: String -> IO ()
43 <> "-- | " <> msg <> "\n" )
46 -- | To print a comment as an IO()
47 printIOComment :: String -> IO ()
49 putStrLn ( "\n" <> cmt <> "\n" )
56 -- truncate' :: Double -> Int -> Double
57 -- truncate' x n = (fromIntegral (floor (x * t))) / t
60 truncate' :: Double -> Int -> Double
61 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
67 getInMap :: Int -> Map Int Double -> Double
73 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
74 roundToStr = printf "%0.*f"
77 countSup :: Double -> [Double] -> Int
78 countSup s l = length $ filter (>s) l
81 dropByIdx :: Int -> [a] -> [a]
82 dropByIdx k l = take k l ++ drop (k+1) l
85 elemIndex' :: Eq a => a -> [a] -> Int
86 elemIndex' e l = case (List.elemIndex e l) of
87 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
91 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
92 commonPrefix lst lst' acc =
93 if (null lst || null lst')
95 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
96 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
100 ---------------------
101 -- | Foundations | --
102 ---------------------
105 -- | Is this Ngrams a Foundations Root ?
106 isRoots :: Ngrams -> Vector Ngrams -> Bool
107 isRoots n ns = Vector.elem n ns
109 -- | To transform a list of nrams into a list of foundation's index
110 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
111 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
113 -- | To transform a list of sources into a list of sources' index
114 sourcesToIdx :: [Text] -> Vector Text -> [Int]
115 sourcesToIdx ss ps = nub $ map (\s -> fromJust $ elemIndex s ps) ss
117 -- | To transform a list of Ngrams Indexes into a Label
118 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
119 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
121 idxToLabel :: [Int] -> String
122 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
124 idxToLabel' :: [Double] -> String
125 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
127 -- | To transform a list of Ngrams Indexes into a list of Text
128 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
129 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
136 -- | To transform a list of periods into a set of Dates
137 periodsToYears :: [(Date,Date)] -> Set Date
138 periodsToYears periods = (Set.fromList . sort . concat)
139 $ map (\(d,d') -> [d..d']) periods
142 findBounds :: [Date] -> (Date,Date)
144 let dates' = sort dates
145 in (head' "findBounds" dates', last' "findBounds" dates')
148 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
149 toPeriods dates p s =
150 let (start,end) = findBounds dates
151 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
152 $ chunkAlong p s [start .. end]
155 toFstDate :: [Text] -> Text
160 let d' = read (filter (\c -> c /= '-') $ unpack d)::Int
163 toLstDate :: [Text] -> Text
169 let d' = read (filter (\c -> c /= '-') $ unpack d)::Int
173 getTimeScale :: Phylo -> [Char]
174 getTimeScale p = case (timeUnit $ getConfig p) of
176 Month _ _ _ -> "month"
181 -- | Get a regular & ascendante timeScale from a given list of dates
182 toTimeScale :: [Date] -> Int -> [Date]
183 toTimeScale dates step =
184 let (start,end) = findBounds dates
185 in [start, (start + step) .. end]
188 getTimeStep :: TimeUnit -> Int
189 getTimeStep time = case time of
195 getTimePeriod :: TimeUnit -> Int
196 getTimePeriod time = case time of
202 getTimeFrame :: TimeUnit -> Int
203 getTimeFrame time = case time of
214 -- | To find if l' is nested in l
215 isNested :: Eq a => [a] -> [a] -> Bool
218 | length l' > length l = False
219 | (union l l') == l = True
223 -- | To filter Fis with small Support but by keeping non empty Periods
224 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
225 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
226 then keepFilled f (thr - 1) l
230 traceClique :: Map (Date, Date) [PhyloClique] -> String
231 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
233 --------------------------------------
235 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
236 --------------------------------------
239 traceSupport :: Map (Date, Date) [PhyloClique] -> String
240 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
242 --------------------------------------
244 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
245 --------------------------------------
248 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
249 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
250 <> "Support : " <> (traceSupport mFis) <> "\n"
251 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
259 getCliqueSupport :: Clique -> Int
260 getCliqueSupport unit = case unit of
264 getCliqueSize :: Clique -> Int
265 getCliqueSize unit = case unit of
274 listToCombi' :: [a] -> [(a,a)]
275 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
277 listToEqual' :: Eq a => [a] -> [(a,a)]
278 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
280 listToKeys :: Eq a => [a] -> [(a,a)]
281 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
283 listToMatrix :: [Int] -> Map (Int,Int) Double
284 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
286 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
287 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
289 listToSeq :: Eq a => [a] -> [(a,a)]
290 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
292 sumCooc :: Cooc -> Cooc -> Cooc
293 sumCooc cooc cooc' = unionWith (+) cooc cooc'
295 getTrace :: Cooc -> Double
296 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
298 coocToDiago :: Cooc -> Cooc
299 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
301 -- | To build the local cooc matrix of each phylogroup
302 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
303 ngramsToCooc ngrams coocs =
304 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
305 pairs = listToKeys ngrams
306 in filterWithKey (\k _ -> elem k pairs) cooc
313 getGroupId :: PhyloGroup -> PhyloGroupId
314 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
316 idToPrd :: PhyloGroupId -> PhyloPeriodId
317 idToPrd id = (fst . fst) id
319 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
320 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
322 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
323 getPeriodPointers fil g =
325 ToChilds -> g ^. phylo_groupPeriodChilds
326 ToParents -> g ^. phylo_groupPeriodParents
328 filterProximity :: Proximity -> Double -> Double -> Bool
329 filterProximity proximity thr local =
331 WeightedLogJaccard _ -> local >= thr
332 WeightedLogSim _ -> local >= thr
335 getProximityName :: Proximity -> String
336 getProximityName proximity =
338 WeightedLogJaccard _ -> "WLJaccard"
339 WeightedLogSim _ -> "WeightedLogSim"
346 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
347 addPointers fil pty pointers g =
349 TemporalPointer -> case fil of
350 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
351 ToParents -> g & phylo_groupPeriodParents .~ pointers
352 LevelPointer -> case fil of
353 ToChilds -> g & phylo_groupLevelChilds .~ pointers
354 ToParents -> g & phylo_groupLevelParents .~ pointers
357 getPeriodIds :: Phylo -> [(Date,Date)]
358 getPeriodIds phylo = sortOn fst
360 $ phylo ^. phylo_periods
362 getLevelParentId :: PhyloGroup -> PhyloGroupId
363 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
365 getLastLevel :: Phylo -> Level
366 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
368 getLevels :: Phylo -> [Level]
369 getLevels phylo = nub
371 $ keys $ view ( phylo_periods
373 . phylo_periodLevels ) phylo
375 getSeaElevation :: Phylo -> SeaElevation
376 getSeaElevation phylo = seaElevation (getConfig phylo)
379 getConfig :: Phylo -> Config
380 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
383 setConfig :: Config -> Phylo -> Phylo
384 setConfig config phylo = phylo
385 & phylo_param .~ (PhyloParam
386 ((phylo ^. phylo_param) ^. phyloParam_version)
387 ((phylo ^. phylo_param) ^. phyloParam_software)
390 -- & phylo_param & phyloParam_config & phyloParam_config .~ config
393 getRoots :: Phylo -> Vector Ngrams
394 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
396 getSources :: Phylo -> Vector Text
397 getSources phylo = _sources (phylo ^. phylo_sources)
399 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
400 phyloToLastBranches phylo = elems
402 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
403 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
405 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
406 getGroupsFromLevel lvl phylo =
407 elems $ view ( phylo_periods
411 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
412 . phylo_levelGroups ) phylo
415 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
416 getGroupsFromLevelPeriods lvl periods phylo =
417 elems $ view ( phylo_periods
419 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
422 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
423 . phylo_levelGroups ) phylo
426 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
427 getGroupsFromPeriods lvl periods =
428 elems $ view ( traverse
431 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
432 . phylo_levelGroups ) periods
435 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
436 updatePhyloGroups lvl m phylo =
441 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
445 let id = getGroupId g
451 updatePeriods :: Map (Date,Date) (Text,Text) -> Phylo -> Phylo
452 updatePeriods periods' phylo =
453 over (phylo_periods . traverse)
455 let prd' = periods' ! (prd ^. phylo_periodPeriod)
456 lvls = map (\lvl -> lvl & phylo_levelPeriod' .~ prd') $ prd ^. phylo_periodLevels
457 in prd & phylo_periodPeriod' .~ prd'
458 & phylo_periodLevels .~ lvls
462 traceToPhylo :: Level -> Phylo -> Phylo
463 traceToPhylo lvl phylo =
464 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
465 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
466 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
472 mergeBranchIds :: [[Int]] -> [Int]
473 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
475 -- | 2) find the most Up Left ids in the hierarchy of similarity
476 -- mostUpLeft :: [[Int]] -> [[Int]]
478 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
479 -- inf = (fst . minimum) groupIds
480 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
481 -- | 1) find the most frequent ids
482 mostFreq' :: [[Int]] -> [[Int]]
484 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
485 sup = (fst . maximum) groupIds
486 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
489 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
490 mergeMeta bId groups =
491 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
492 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
495 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
496 groupsToBranches groups =
497 {- run the related component algorithm -}
498 let egos = map (\g -> [getGroupId g]
499 ++ (map fst $ g ^. phylo_groupPeriodParents)
500 ++ (map fst $ g ^. phylo_groupPeriodChilds)
501 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
502 graph = relatedComponents egos
503 {- update each group's branch id -}
505 let groups' = elems $ restrictKeys groups (Set.fromList ids)
506 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
507 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
509 relatedComponents :: Ord a => [[a]] -> [[a]]
510 relatedComponents graph = foldl' (\acc groups ->
514 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
515 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
517 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
518 toRelatedComponents nodes edges =
519 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
520 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
521 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
524 traceSynchronyEnd :: Phylo -> Phylo
525 traceSynchronyEnd phylo =
526 trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
527 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
528 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
531 traceSynchronyStart :: Phylo -> Phylo
532 traceSynchronyStart phylo =
533 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
534 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
535 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
543 getSensibility :: Proximity -> Double
544 getSensibility proxi = case proxi of
545 WeightedLogJaccard s -> s
546 WeightedLogSim s -> s
553 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
554 intersectInit acc lst lst' =
555 if (null lst) || (null lst')
557 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
558 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
561 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
562 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
564 ngramsInBranches :: [[PhyloGroup]] -> [Int]
565 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
568 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
569 traceMatchSuccess thr qua qua' nextBranches =
570 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
571 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
572 <> ",(1.." <> show (length nextBranches) <> ")]"
573 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
574 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
575 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
578 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
579 traceMatchFailure thr qua qua' branches =
580 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
581 <> ",(1.." <> show (length branches) <> ")]"
582 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
583 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
587 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
588 traceMatchNoSplit branches =
589 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
590 <> ",(1.." <> show (length branches) <> ")]"
591 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
592 <> " - unable to split in smaller branches" <> "\n"
596 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
597 traceMatchLimit branches =
598 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
599 <> ",(1.." <> show (length branches) <> ")]"
600 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
601 <> " - unable to increase the threshold above 1" <> "\n"
605 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
606 traceMatchEnd groups =
607 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
608 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
611 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
612 traceTemporalMatching groups =
613 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
616 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
618 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m