2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group)
17 import Data.Set (Set, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
19 import Data.String (String)
20 import Data.Text (Text,unpack)
22 import Prelude (floor,read)
24 import Gargantext.Prelude
25 import Gargantext.Core.Viz.AdaptativePhylo
29 import Debug.Trace (trace)
30 import Control.Lens hiding (Level)
32 import qualified Data.Vector as Vector
33 import qualified Data.List as List
34 import qualified Data.Set as Set
35 import qualified Data.Map as Map
36 import qualified Data.Text as Text
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
48 <> "-- | " <> msg <> "\n" )
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
54 putStrLn ( "\n" <> cmt <> "\n" )
61 -- truncate' :: Double -> Int -> Double
62 -- truncate' x n = (fromIntegral (floor (x * t))) / t
65 truncate' :: Double -> Int -> Double
66 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
72 getInMap :: Int -> Map Int Double -> Double
78 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
79 roundToStr = printf "%0.*f"
82 countSup :: Double -> [Double] -> Int
83 countSup s l = length $ filter (>s) l
86 dropByIdx :: Int -> [a] -> [a]
87 dropByIdx k l = take k l ++ drop (k+1) l
90 elemIndex' :: Eq a => a -> [a] -> Int
91 elemIndex' e l = case (List.elemIndex e l) of
92 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
96 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
97 commonPrefix lst lst' acc =
98 if (null lst || null lst')
100 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
101 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
105 ---------------------
106 -- | Foundations | --
107 ---------------------
110 -- | Is this Ngrams a Foundations Root ?
111 isRoots :: Ngrams -> Vector Ngrams -> Bool
112 isRoots n ns = Vector.elem n ns
114 -- | To transform a list of nrams into a list of foundation's index
115 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
116 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
118 -- | To transform a list of sources into a list of sources' index
119 sourcesToIdx :: [Text] -> Vector Text -> [Int]
120 sourcesToIdx ss ps = nub $ map (\s -> fromJust $ elemIndex s ps) ss
122 -- | To transform a list of Ngrams Indexes into a Label
123 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
124 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
126 idxToLabel :: [Int] -> String
127 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
129 idxToLabel' :: [Double] -> String
130 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
132 -- | To transform a list of Ngrams Indexes into a list of Text
133 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
134 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
141 -- | To transform a list of periods into a set of Dates
142 periodsToYears :: [(Date,Date)] -> Set Date
143 periodsToYears periods = (Set.fromList . sort . concat)
144 $ map (\(d,d') -> [d..d']) periods
147 findBounds :: [Date] -> (Date,Date)
149 let dates' = sort dates
150 in (head' "findBounds" dates', last' "findBounds" dates')
153 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
154 toPeriods dates p s =
155 let (start,end) = findBounds dates
156 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
157 $ chunkAlong p s [start .. end]
160 toFstDate :: [Text] -> Text
165 let d' = read (filter (\c -> c /= '-') $ unpack d)::Int
168 toLstDate :: [Text] -> Text
174 let d' = read (filter (\c -> c /= '-') $ unpack d)::Int
178 getTimeScale :: Phylo -> [Char]
179 getTimeScale p = case (timeUnit $ getConfig p) of
181 Month _ _ _ -> "month"
186 -- | Get a regular & ascendante timeScale from a given list of dates
187 toTimeScale :: [Date] -> Int -> [Date]
188 toTimeScale dates step =
189 let (start,end) = findBounds dates
190 in [start, (start + step) .. end]
193 getTimeStep :: TimeUnit -> Int
194 getTimeStep time = case time of
200 getTimePeriod :: TimeUnit -> Int
201 getTimePeriod time = case time of
207 getTimeFrame :: TimeUnit -> Int
208 getTimeFrame time = case time of
219 -- | To find if l' is nested in l
220 isNested :: Eq a => [a] -> [a] -> Bool
223 | length l' > length l = False
224 | (union l l') == l = True
228 -- | To filter Fis with small Support but by keeping non empty Periods
229 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
230 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
231 then keepFilled f (thr - 1) l
235 traceClique :: Map (Date, Date) [PhyloClique] -> String
236 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
238 --------------------------------------
240 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
241 --------------------------------------
244 traceSupport :: Map (Date, Date) [PhyloClique] -> String
245 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
247 --------------------------------------
249 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
250 --------------------------------------
253 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
254 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
255 <> "Support : " <> (traceSupport mFis) <> "\n"
256 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
264 getCliqueSupport :: Clique -> Int
265 getCliqueSupport unit = case unit of
269 getCliqueSize :: Clique -> Int
270 getCliqueSize unit = case unit of
279 listToCombi' :: [a] -> [(a,a)]
280 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
282 listToEqual' :: Eq a => [a] -> [(a,a)]
283 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
285 listToKeys :: Eq a => [a] -> [(a,a)]
286 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
288 listToMatrix :: [Int] -> Map (Int,Int) Double
289 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
291 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
292 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
294 listToSeq :: Eq a => [a] -> [(a,a)]
295 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
297 sumCooc :: Cooc -> Cooc -> Cooc
298 sumCooc cooc cooc' = unionWith (+) cooc cooc'
300 getTrace :: Cooc -> Double
301 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
303 coocToDiago :: Cooc -> Cooc
304 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
306 -- | To build the local cooc matrix of each phylogroup
307 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
308 ngramsToCooc ngrams coocs =
309 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
310 pairs = listToKeys ngrams
311 in filterWithKey (\k _ -> elem k pairs) cooc
318 getGroupId :: PhyloGroup -> PhyloGroupId
319 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
321 idToPrd :: PhyloGroupId -> PhyloPeriodId
322 idToPrd id = (fst . fst) id
324 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
325 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
327 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
328 getPeriodPointers fil g =
330 ToChilds -> g ^. phylo_groupPeriodChilds
331 ToParents -> g ^. phylo_groupPeriodParents
333 filterProximity :: Proximity -> Double -> Double -> Bool
334 filterProximity proximity thr local =
336 WeightedLogJaccard _ -> local >= thr
337 WeightedLogSim _ -> local >= thr
340 getProximityName :: Proximity -> String
341 getProximityName proximity =
343 WeightedLogJaccard _ -> "WLJaccard"
344 WeightedLogSim _ -> "WeightedLogSim"
351 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
352 addPointers fil pty pointers g =
354 TemporalPointer -> case fil of
355 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
356 ToParents -> g & phylo_groupPeriodParents .~ pointers
357 LevelPointer -> case fil of
358 ToChilds -> g & phylo_groupLevelChilds .~ pointers
359 ToParents -> g & phylo_groupLevelParents .~ pointers
362 getPeriodIds :: Phylo -> [(Date,Date)]
363 getPeriodIds phylo = sortOn fst
365 $ phylo ^. phylo_periods
367 getLevelParentId :: PhyloGroup -> PhyloGroupId
368 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
370 getLastLevel :: Phylo -> Level
371 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
373 getLevels :: Phylo -> [Level]
374 getLevels phylo = nub
376 $ keys $ view ( phylo_periods
378 . phylo_periodLevels ) phylo
380 getSeaElevation :: Phylo -> SeaElevation
381 getSeaElevation phylo = seaElevation (getConfig phylo)
384 getConfig :: Phylo -> Config
385 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
388 setConfig :: Config -> Phylo -> Phylo
389 setConfig config phylo = phylo
390 & phylo_param .~ (PhyloParam
391 ((phylo ^. phylo_param) ^. phyloParam_version)
392 ((phylo ^. phylo_param) ^. phyloParam_software)
395 -- & phylo_param & phyloParam_config & phyloParam_config .~ config
398 getRoots :: Phylo -> Vector Ngrams
399 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
401 getSources :: Phylo -> Vector Text
402 getSources phylo = _sources (phylo ^. phylo_sources)
404 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
405 phyloToLastBranches phylo = elems
407 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
408 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
410 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
411 getGroupsFromLevel lvl phylo =
412 elems $ view ( phylo_periods
416 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
417 . phylo_levelGroups ) phylo
420 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
421 getGroupsFromLevelPeriods lvl periods phylo =
422 elems $ view ( phylo_periods
424 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
427 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
428 . phylo_levelGroups ) phylo
431 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
432 getGroupsFromPeriods lvl periods =
433 elems $ view ( traverse
436 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
437 . phylo_levelGroups ) periods
440 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
441 updatePhyloGroups lvl m phylo =
446 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
450 let id = getGroupId g
456 updatePeriods :: Map (Date,Date) (Text,Text) -> Phylo -> Phylo
457 updatePeriods periods' phylo =
458 over (phylo_periods . traverse)
460 let prd' = periods' ! (prd ^. phylo_periodPeriod)
461 lvls = map (\lvl -> lvl & phylo_levelPeriod' .~ prd') $ prd ^. phylo_periodLevels
462 in prd & phylo_periodPeriod' .~ prd'
463 & phylo_periodLevels .~ lvls
467 traceToPhylo :: Level -> Phylo -> Phylo
468 traceToPhylo lvl phylo =
469 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
470 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
471 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
477 mergeBranchIds :: [[Int]] -> [Int]
478 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
480 -- | 2) find the most Up Left ids in the hierarchy of similarity
481 -- mostUpLeft :: [[Int]] -> [[Int]]
483 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
484 -- inf = (fst . minimum) groupIds
485 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
486 -- | 1) find the most frequent ids
487 mostFreq' :: [[Int]] -> [[Int]]
489 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
490 sup = (fst . maximum) groupIds
491 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
494 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
495 mergeMeta bId groups =
496 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
497 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
500 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
501 groupsToBranches groups =
502 {- run the related component algorithm -}
503 let egos = map (\g -> [getGroupId g]
504 ++ (map fst $ g ^. phylo_groupPeriodParents)
505 ++ (map fst $ g ^. phylo_groupPeriodChilds)
506 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
507 graph = relatedComponents egos
508 {- update each group's branch id -}
510 let groups' = elems $ restrictKeys groups (Set.fromList ids)
511 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
512 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
514 relatedComponents :: Ord a => [[a]] -> [[a]]
515 relatedComponents graph = foldl' (\acc groups ->
519 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
520 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
522 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
523 toRelatedComponents nodes edges =
524 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
525 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
526 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
529 traceSynchronyEnd :: Phylo -> Phylo
530 traceSynchronyEnd phylo =
531 trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
532 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
533 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
536 traceSynchronyStart :: Phylo -> Phylo
537 traceSynchronyStart phylo =
538 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
539 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
540 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
548 getSensibility :: Proximity -> Double
549 getSensibility proxi = case proxi of
550 WeightedLogJaccard s -> s
551 WeightedLogSim s -> s
558 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
559 intersectInit acc lst lst' =
560 if (null lst) || (null lst')
562 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
563 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
566 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
567 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
569 ngramsInBranches :: [[PhyloGroup]] -> [Int]
570 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
573 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
574 traceMatchSuccess thr qua qua' nextBranches =
575 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
576 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
577 <> ",(1.." <> show (length nextBranches) <> ")]"
578 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
579 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
580 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
583 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
584 traceMatchFailure thr qua qua' branches =
585 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
586 <> ",(1.." <> show (length branches) <> ")]"
587 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
588 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
592 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
593 traceMatchNoSplit branches =
594 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
595 <> ",(1.." <> show (length branches) <> ")]"
596 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
597 <> " - unable to split in smaller branches" <> "\n"
601 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
602 traceMatchLimit branches =
603 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
604 <> ",(1.." <> show (length branches) <> ")]"
605 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
606 <> " - unable to increase the threshold above 1" <> "\n"
610 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
611 traceMatchEnd groups =
612 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
613 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
616 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
617 traceTemporalMatching groups =
618 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
621 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
623 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m