2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
15 import Control.Lens hiding (Level)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group, notElem)
17 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
18 import Data.Set (Set, disjoint)
19 import Data.String (String)
20 import Data.Text (Text,unpack)
21 import Data.Vector (Vector, elemIndex)
22 import Debug.Trace (trace)
23 import Gargantext.Core.Viz.Phylo
24 import Gargantext.Prelude
25 import Prelude (floor,read)
27 import qualified Data.List as List
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30 import qualified Data.Text as Text
31 import qualified Data.Vector as Vector
37 -- | To print an important message as an IO()
38 printIOMsg :: String -> IO ()
43 <> "-- | " <> msg <> "\n" )
46 -- | To print a comment as an IO()
47 printIOComment :: String -> IO ()
49 putStrLn ( "\n" <> cmt <> "\n" )
56 -- truncate' :: Double -> Int -> Double
57 -- truncate' x n = (fromIntegral (floor (x * t))) / t
60 truncate' :: Double -> Int -> Double
61 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
67 getInMap :: Int -> Map Int Double -> Double
73 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
74 roundToStr = printf "%0.*f"
77 countSup :: Double -> [Double] -> Int
78 countSup s l = length $ filter (>s) l
81 dropByIdx :: Int -> [a] -> [a]
82 dropByIdx k l = take k l ++ drop (k+1) l
85 elemIndex' :: Eq a => a -> [a] -> Int
86 elemIndex' e l = case (List.elemIndex e l) of
87 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
91 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
92 commonPrefix lst lst' acc =
93 if (null lst || null lst')
95 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
96 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
100 ---------------------
101 -- | Foundations | --
102 ---------------------
105 -- | Is this Ngrams a Foundations Root ?
106 isRoots :: Ngrams -> Vector Ngrams -> Bool
107 isRoots n ns = Vector.elem n ns
109 -- | To transform a list of nrams into a list of foundation's index
110 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
111 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
113 -- | To transform a list of sources into a list of sources' index
114 sourcesToIdx :: [Text] -> Vector Text -> [Int]
115 sourcesToIdx ss ps = nub $ map (\s -> fromJust $ elemIndex s ps) ss
117 -- | To transform a list of Ngrams Indexes into a Label
118 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
119 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
121 idxToLabel :: [Int] -> String
122 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
124 idxToLabel' :: [Double] -> String
125 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
127 -- | To transform a list of Ngrams Indexes into a list of Text
128 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
129 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
136 -- | To transform a list of periods into a set of Dates
137 periodsToYears :: [(Date,Date)] -> Set Date
138 periodsToYears periods = (Set.fromList . sort . concat)
139 $ map (\(d,d') -> [d..d']) periods
142 findBounds :: [Date] -> (Date,Date)
143 findBounds [] = panic "[G.C.V.P.PhyloTools] empty dates for find bounds"
145 let dates' = sort dates
146 in (head' "findBounds" dates', last' "findBounds" dates')
149 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
150 toPeriods dates p s =
151 let (start,end) = findBounds dates
152 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
153 $ chunkAlong p s [start .. end]
156 toFstDate :: [Text] -> Text
161 let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
164 toLstDate :: [Text] -> Text
170 let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
174 getTimeScale :: Phylo -> [Char]
175 getTimeScale p = case (timeUnit $ getConfig p) of
176 Epoch _ _ _ -> "epoch"
178 Month _ _ _ -> "month"
183 -- | Get a regular & ascendante timeScale from a given list of dates
184 toTimeScale :: [Date] -> Int -> [Date]
185 toTimeScale dates step =
186 let (start,end) = findBounds dates
187 in [start, (start + step) .. end]
190 getTimeStep :: TimeUnit -> Int
191 getTimeStep time = case time of
198 getTimePeriod :: TimeUnit -> Int
199 getTimePeriod time = case time of
206 getTimeFrame :: TimeUnit -> Int
207 getTimeFrame time = case time of
219 -- | To find if l' is nested in l
220 isNested :: Eq a => [a] -> [a] -> Bool
223 | length l' > length l = False
224 | (union l l') == l = True
228 -- | To filter Fis with small Support but by keeping non empty Periods
229 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
230 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
231 then keepFilled f (thr - 1) l
235 traceClique :: Map (Date, Date) [PhyloClique] -> String
236 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
238 --------------------------------------
240 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
241 --------------------------------------
244 traceSupport :: Map (Date, Date) [PhyloClique] -> String
245 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
247 --------------------------------------
249 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
250 --------------------------------------
253 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
254 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
255 <> "Support : " <> (traceSupport mFis) <> "\n"
256 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
264 getCliqueSupport :: Clique -> Int
265 getCliqueSupport unit = case unit of
269 getCliqueSize :: Clique -> Int
270 getCliqueSize unit = case unit of
279 listToCombi' :: [a] -> [(a,a)]
280 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
282 listToEqual' :: Eq a => [a] -> [(a,a)]
283 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
285 listToKeys :: Eq a => [a] -> [(a,a)]
286 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
288 listToMatrix :: [Int] -> Map (Int,Int) Double
289 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
291 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
292 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
294 listToSeq :: Eq a => [a] -> [(a,a)]
295 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
297 sumCooc :: Cooc -> Cooc -> Cooc
298 sumCooc cooc cooc' = unionWith (+) cooc cooc'
300 getTrace :: Cooc -> Double
301 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
303 coocToDiago :: Cooc -> Cooc
304 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
306 -- | To build the local cooc matrix of each phylogroup
307 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
308 ngramsToCooc ngrams coocs =
309 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
310 pairs = listToKeys ngrams
311 in filterWithKey (\k _ -> elem k pairs) cooc
318 getGroupId :: PhyloGroup -> PhyloGroupId
319 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
321 idToPrd :: PhyloGroupId -> PhyloPeriodId
322 idToPrd id = (fst . fst) id
324 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
325 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
327 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
328 getPeriodPointers fil g =
330 ToChilds -> g ^. phylo_groupPeriodChilds
331 ToParents -> g ^. phylo_groupPeriodParents
332 ToChildsMemory -> undefined
333 ToParentsMemory -> undefined
335 filterProximity :: Proximity -> Double -> Double -> Bool
336 filterProximity proximity thr local =
338 WeightedLogJaccard _ -> local >= thr
339 WeightedLogSim _ -> local >= thr
340 Hamming _ -> undefined
342 getProximityName :: Proximity -> String
343 getProximityName proximity =
345 WeightedLogJaccard _ -> "WLJaccard"
346 WeightedLogSim _ -> "WeightedLogSim"
347 Hamming _ -> "Hamming"
353 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
354 addPointers fil pty pointers g =
356 TemporalPointer -> case fil of
357 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
358 ToParents -> g & phylo_groupPeriodParents .~ pointers
359 ToChildsMemory -> undefined
360 ToParentsMemory -> undefined
361 LevelPointer -> case fil of
362 ToChilds -> g & phylo_groupLevelChilds .~ pointers
363 ToParents -> g & phylo_groupLevelParents .~ pointers
364 ToChildsMemory -> undefined
365 ToParentsMemory -> undefined
367 toPointer' :: Double -> Pointer -> Pointer'
368 toPointer' thr pt = (fst pt,(thr,snd pt))
371 addMemoryPointers :: Filiation -> PointerType -> Double -> [Pointer] -> PhyloGroup -> PhyloGroup
372 addMemoryPointers fil pty thr pointers g =
374 TemporalPointer -> case fil of
375 ToChilds -> undefined
376 ToParents -> undefined
377 ToChildsMemory -> g & phylo_groupPeriodMemoryChilds .~ (concat [(g ^. phylo_groupPeriodMemoryChilds),(map (\pt -> toPointer' thr pt) pointers)])
378 ToParentsMemory -> g & phylo_groupPeriodMemoryParents .~ (concat [(g ^. phylo_groupPeriodMemoryParents),(map (\pt -> toPointer' thr pt) pointers)])
379 LevelPointer -> undefined
382 getPeriodIds :: Phylo -> [(Date,Date)]
383 getPeriodIds phylo = sortOn fst
385 $ phylo ^. phylo_periods
387 getLevelParentId :: PhyloGroup -> PhyloGroupId
388 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
390 getLastLevel :: Phylo -> Level
391 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
393 getLevels :: Phylo -> [Level]
394 getLevels phylo = nub
396 $ keys $ view ( phylo_periods
398 . phylo_periodLevels ) phylo
400 getSeaElevation :: Phylo -> SeaElevation
401 getSeaElevation phylo = seaElevation (getConfig phylo)
404 getConfig :: Phylo -> PhyloConfig
405 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
408 setConfig :: PhyloConfig -> Phylo -> Phylo
409 setConfig config phylo = phylo
410 & phylo_param .~ (PhyloParam
411 ((phylo ^. phylo_param) ^. phyloParam_version)
412 ((phylo ^. phylo_param) ^. phyloParam_software)
415 -- & phylo_param & phyloParam_config & phyloParam_config .~ config
418 getRoots :: Phylo -> Vector Ngrams
419 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
421 getSources :: Phylo -> Vector Text
422 getSources phylo = _sources (phylo ^. phylo_sources)
424 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
425 phyloToLastBranches phylo = elems
427 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
428 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
430 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
431 getGroupsFromLevel lvl phylo =
432 elems $ view ( phylo_periods
436 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
437 . phylo_levelGroups ) phylo
440 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
441 getGroupsFromLevelPeriods lvl periods phylo =
442 elems $ view ( phylo_periods
444 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
447 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
448 . phylo_levelGroups ) phylo
451 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
452 getGroupsFromPeriods lvl periods =
453 elems $ view ( traverse
456 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
457 . phylo_levelGroups ) periods
460 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
461 updatePhyloGroups lvl m phylo =
466 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
470 let id = getGroupId g
476 updatePeriods :: Map (Date,Date) (Text,Text) -> Phylo -> Phylo
477 updatePeriods periods' phylo =
478 over (phylo_periods . traverse)
480 let prd' = periods' ! (prd ^. phylo_periodPeriod)
481 lvls = map (\lvl -> lvl & phylo_levelPeriod' .~ prd') $ prd ^. phylo_periodLevels
482 in prd & phylo_periodPeriod' .~ prd'
483 & phylo_periodLevels .~ lvls
487 traceToPhylo :: Level -> Phylo -> Phylo
488 traceToPhylo lvl phylo =
489 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
490 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
491 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
497 mergeBranchIds :: [[Int]] -> [Int]
498 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
500 -- | 2) find the most Up Left ids in the hierarchy of similarity
501 -- mostUpLeft :: [[Int]] -> [[Int]]
503 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
504 -- inf = (fst . minimum) groupIds
505 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
506 -- | 1) find the most frequent ids
507 mostFreq' :: [[Int]] -> [[Int]]
509 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
510 sup = (fst . maximum) groupIds
511 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
514 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
515 mergeMeta bId groups =
516 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
517 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
520 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
521 groupsToBranches groups =
522 {- run the related component algorithm -}
523 let egos = map (\g -> [getGroupId g]
524 ++ (map fst $ g ^. phylo_groupPeriodParents)
525 ++ (map fst $ g ^. phylo_groupPeriodChilds)
526 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
527 graph = relatedComponents egos
528 {- update each group's branch id -}
530 let groups' = elems $ restrictKeys groups (Set.fromList ids)
531 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
532 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
534 relatedComponents :: Ord a => [[a]] -> [[a]]
535 relatedComponents graph = foldl' (\acc groups ->
539 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
540 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
542 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
543 toRelatedComponents nodes edges =
544 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
545 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
546 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
549 traceSynchronyEnd :: Phylo -> Phylo
550 traceSynchronyEnd phylo =
551 trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
552 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
553 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
556 traceSynchronyStart :: Phylo -> Phylo
557 traceSynchronyStart phylo =
558 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
559 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
560 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
568 getSensibility :: Proximity -> Double
569 getSensibility proxi = case proxi of
570 WeightedLogJaccard s -> s
571 WeightedLogSim s -> s
572 Hamming _ -> undefined
578 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
579 intersectInit acc lst lst' =
580 if (null lst) || (null lst')
582 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
583 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
586 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
587 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
589 ngramsInBranches :: [[PhyloGroup]] -> [Int]
590 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
593 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
594 traceMatchSuccess thr qua qua' nextBranches =
595 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
596 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
597 <> ",(1.." <> show (length nextBranches) <> ")]"
598 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
599 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
600 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
603 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
604 traceMatchFailure thr qua qua' branches =
605 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
606 <> ",(1.." <> show (length branches) <> ")]"
607 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
608 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
612 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
613 traceMatchNoSplit branches =
614 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
615 <> ",(1.." <> show (length branches) <> ")]"
616 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
617 <> " - unable to split in smaller branches" <> "\n"
621 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
622 traceMatchLimit branches =
623 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
624 <> ",(1.." <> show (length branches) <> ")]"
625 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
626 <> " - unable to increase the threshold above 1" <> "\n"
630 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
631 traceMatchEnd groups =
632 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
633 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
636 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
637 traceTemporalMatching groups =
638 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
641 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
643 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m