2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group)
17 import Data.Set (Set, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
19 import Data.String (String)
20 import Data.Text (Text)
22 import Prelude (floor)
24 import Gargantext.Prelude
25 import Gargantext.Core.Viz.AdaptativePhylo
29 import Debug.Trace (trace)
30 import Control.Lens hiding (Level)
32 import qualified Data.Vector as Vector
33 import qualified Data.List as List
34 import qualified Data.Set as Set
35 import qualified Data.Map as Map
36 import qualified Data.Text as Text
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
48 <> "-- | " <> msg <> "\n" )
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
54 putStrLn ( "\n" <> cmt <> "\n" )
61 -- truncate' :: Double -> Int -> Double
62 -- truncate' x n = (fromIntegral (floor (x * t))) / t
65 truncate' :: Double -> Int -> Double
66 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
72 getInMap :: Int -> Map Int Double -> Double
78 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
79 roundToStr = printf "%0.*f"
82 countSup :: Double -> [Double] -> Int
83 countSup s l = length $ filter (>s) l
86 dropByIdx :: Int -> [a] -> [a]
87 dropByIdx k l = take k l ++ drop (k+1) l
90 elemIndex' :: Eq a => a -> [a] -> Int
91 elemIndex' e l = case (List.elemIndex e l) of
92 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
96 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
97 commonPrefix lst lst' acc =
98 if (null lst || null lst')
100 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
101 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
105 ---------------------
106 -- | Foundations | --
107 ---------------------
110 -- | Is this Ngrams a Foundations Root ?
111 isRoots :: Ngrams -> Vector Ngrams -> Bool
112 isRoots n ns = Vector.elem n ns
114 -- | To transform a list of nrams into a list of foundation's index
115 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
116 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
118 -- | To transform a list of Ngrams Indexes into a Label
119 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
120 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
122 idxToLabel :: [Int] -> String
123 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
125 idxToLabel' :: [Double] -> String
126 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
128 -- | To transform a list of Ngrams Indexes into a list of Text
129 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
130 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
137 -- | To transform a list of periods into a set of Dates
138 periodsToYears :: [(Date,Date)] -> Set Date
139 periodsToYears periods = (Set.fromList . sort . concat)
140 $ map (\(d,d') -> [d..d']) periods
143 findBounds :: [Date] -> (Date,Date)
145 let dates' = sort dates
146 in (head' "findBounds" dates', last' "findBounds" dates')
149 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
150 toPeriods dates p s =
151 let (start,end) = findBounds dates
152 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
153 $ chunkAlong p s [start .. end]
156 -- | Get a regular & ascendante timeScale from a given list of dates
157 toTimeScale :: [Date] -> Int -> [Date]
158 toTimeScale dates step =
159 let (start,end) = findBounds dates
160 in [start, (start + step) .. end]
163 getTimeStep :: TimeUnit -> Int
164 getTimeStep time = case time of
167 getTimePeriod :: TimeUnit -> Int
168 getTimePeriod time = case time of
171 getTimeFrame :: TimeUnit -> Int
172 getTimeFrame time = case time of
180 -- | To find if l' is nested in l
181 isNested :: Eq a => [a] -> [a] -> Bool
184 | length l' > length l = False
185 | (union l l') == l = True
189 -- | To filter Fis with small Support but by keeping non empty Periods
190 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
191 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
192 then keepFilled f (thr - 1) l
196 traceClique :: Map (Date, Date) [PhyloClique] -> String
197 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
199 --------------------------------------
201 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
202 --------------------------------------
205 traceSupport :: Map (Date, Date) [PhyloClique] -> String
206 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
208 --------------------------------------
210 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
211 --------------------------------------
214 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
215 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
216 <> "Support : " <> (traceSupport mFis) <> "\n"
217 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
225 getCliqueSupport :: Clique -> Int
226 getCliqueSupport unit = case unit of
230 getCliqueSize :: Clique -> Int
231 getCliqueSize unit = case unit of
240 listToCombi' :: [a] -> [(a,a)]
241 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
243 listToEqual' :: Eq a => [a] -> [(a,a)]
244 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
246 listToKeys :: Eq a => [a] -> [(a,a)]
247 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
249 listToMatrix :: [Int] -> Map (Int,Int) Double
250 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
252 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
253 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
255 listToSeq :: Eq a => [a] -> [(a,a)]
256 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
258 sumCooc :: Cooc -> Cooc -> Cooc
259 sumCooc cooc cooc' = unionWith (+) cooc cooc'
261 getTrace :: Cooc -> Double
262 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
264 coocToDiago :: Cooc -> Cooc
265 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
267 -- | To build the local cooc matrix of each phylogroup
268 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
269 ngramsToCooc ngrams coocs =
270 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
271 pairs = listToKeys ngrams
272 in filterWithKey (\k _ -> elem k pairs) cooc
279 getGroupId :: PhyloGroup -> PhyloGroupId
280 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
282 idToPrd :: PhyloGroupId -> PhyloPeriodId
283 idToPrd id = (fst . fst) id
285 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
286 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
288 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
289 getPeriodPointers fil g =
291 ToChilds -> g ^. phylo_groupPeriodChilds
292 ToParents -> g ^. phylo_groupPeriodParents
294 filterProximity :: Proximity -> Double -> Double -> Bool
295 filterProximity proximity thr local =
297 WeightedLogJaccard _ -> local >= thr
298 WeightedLogSim _ -> local >= thr
301 getProximityName :: Proximity -> String
302 getProximityName proximity =
304 WeightedLogJaccard _ -> "WLJaccard"
305 WeightedLogSim _ -> "WeightedLogSim"
312 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
313 addPointers fil pty pointers g =
315 TemporalPointer -> case fil of
316 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
317 ToParents -> g & phylo_groupPeriodParents .~ pointers
318 LevelPointer -> case fil of
319 ToChilds -> g & phylo_groupLevelChilds .~ pointers
320 ToParents -> g & phylo_groupLevelParents .~ pointers
323 getPeriodIds :: Phylo -> [(Date,Date)]
324 getPeriodIds phylo = sortOn fst
326 $ phylo ^. phylo_periods
328 getLevelParentId :: PhyloGroup -> PhyloGroupId
329 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
331 getLastLevel :: Phylo -> Level
332 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
334 getLevels :: Phylo -> [Level]
335 getLevels phylo = nub
337 $ keys $ view ( phylo_periods
339 . phylo_periodLevels ) phylo
341 getSeaElevation :: Phylo -> SeaElevation
342 getSeaElevation phylo = seaElevation (getConfig phylo)
345 getConfig :: Phylo -> Config
346 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
349 getRoots :: Phylo -> Vector Ngrams
350 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
352 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
353 phyloToLastBranches phylo = elems
355 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
356 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
358 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
359 getGroupsFromLevel lvl phylo =
360 elems $ view ( phylo_periods
364 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
365 . phylo_levelGroups ) phylo
368 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
369 getGroupsFromLevelPeriods lvl periods phylo =
370 elems $ view ( phylo_periods
372 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
375 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
376 . phylo_levelGroups ) phylo
379 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
380 getGroupsFromPeriods lvl periods =
381 elems $ view ( traverse
384 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
385 . phylo_levelGroups ) periods
388 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
389 updatePhyloGroups lvl m phylo =
394 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
398 let id = getGroupId g
405 traceToPhylo :: Level -> Phylo -> Phylo
406 traceToPhylo lvl phylo =
407 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
408 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
409 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
415 mergeBranchIds :: [[Int]] -> [Int]
416 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
418 -- | 2) find the most Up Left ids in the hierarchy of similarity
419 -- mostUpLeft :: [[Int]] -> [[Int]]
421 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
422 -- inf = (fst . minimum) groupIds
423 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
424 -- | 1) find the most frequent ids
425 mostFreq' :: [[Int]] -> [[Int]]
427 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
428 sup = (fst . maximum) groupIds
429 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
432 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
433 mergeMeta bId groups =
434 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
435 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
438 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
439 groupsToBranches groups =
440 {- run the related component algorithm -}
441 let egos = map (\g -> [getGroupId g]
442 ++ (map fst $ g ^. phylo_groupPeriodParents)
443 ++ (map fst $ g ^. phylo_groupPeriodChilds)
444 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
445 graph = relatedComponents egos
446 {- update each group's branch id -}
448 let groups' = elems $ restrictKeys groups (Set.fromList ids)
449 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
450 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
452 relatedComponents :: Ord a => [[a]] -> [[a]]
453 relatedComponents graph = foldl' (\acc groups ->
457 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
458 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
460 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
461 toRelatedComponents nodes edges =
462 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
463 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
464 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
467 traceSynchronyEnd :: Phylo -> Phylo
468 traceSynchronyEnd phylo =
469 trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
470 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
471 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
474 traceSynchronyStart :: Phylo -> Phylo
475 traceSynchronyStart phylo =
476 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
477 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
478 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
486 getSensibility :: Proximity -> Double
487 getSensibility proxi = case proxi of
488 WeightedLogJaccard s -> s
489 WeightedLogSim s -> s
496 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
497 intersectInit acc lst lst' =
498 if (null lst) || (null lst')
500 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
501 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
504 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
505 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
507 ngramsInBranches :: [[PhyloGroup]] -> [Int]
508 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
511 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
512 traceMatchSuccess thr qua qua' nextBranches =
513 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
514 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
515 <> ",(1.." <> show (length nextBranches) <> ")]"
516 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
517 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
518 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
521 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
522 traceMatchFailure thr qua qua' branches =
523 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
524 <> ",(1.." <> show (length branches) <> ")]"
525 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
526 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
530 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
531 traceMatchNoSplit branches =
532 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
533 <> ",(1.." <> show (length branches) <> ")]"
534 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
535 <> " - unable to split in smaller branches" <> "\n"
539 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
540 traceMatchLimit branches =
541 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
542 <> ",(1.." <> show (length branches) <> ")]"
543 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
544 <> " - unable to increase the threshold above 1" <> "\n"
548 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
549 traceMatchEnd groups =
550 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
551 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
554 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
555 traceTemporalMatching groups =
556 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
559 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
561 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m