2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group)
17 import Data.Set (Set, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
19 import Data.String (String)
20 import Data.Text (Text)
22 import Prelude (floor)
24 import Gargantext.Prelude
25 import Gargantext.Core.Viz.AdaptativePhylo
29 import Debug.Trace (trace)
30 import Control.Lens hiding (Level)
32 import qualified Data.Vector as Vector
33 import qualified Data.List as List
34 import qualified Data.Set as Set
35 import qualified Data.Map as Map
36 import qualified Data.Text as Text
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
48 <> "-- | " <> msg <> "\n" )
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
54 putStrLn ( "\n" <> cmt <> "\n" )
61 -- truncate' :: Double -> Int -> Double
62 -- truncate' x n = (fromIntegral (floor (x * t))) / t
65 truncate' :: Double -> Int -> Double
66 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
72 getInMap :: Int -> Map Int Double -> Double
-- | Render a floating value as a 'String' through the @printf@ format
-- @"%0.*f"@: the first argument supplies the precision (number of decimals),
-- the second one is the value to print.
roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
roundToStr digits value = printf "%0.*f" digits value
-- | Count the elements of @l@ that are strictly greater than the threshold @s@.
countSup :: Double -> [Double] -> Int
countSup s l = length [x | x <- l, x > s]
-- | Remove the element found at the (0-based) position @k@ of @l@.
-- The list is returned unchanged when @k@ does not designate any position
-- (negative, or past the end of the list).
dropByIdx :: Int -> [a] -> [a]
dropByIdx k l = [x | (i, x) <- zip [0 ..] l, i /= k]
90 elemIndex' :: Eq a => a -> [a] -> Int
91 elemIndex' e l = case (List.elemIndex e l) of
92 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
96 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
97 commonPrefix lst lst' acc =
98 if (null lst || null lst')
100 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
101 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
105 ---------------------
106 -- | Foundations | --
107 ---------------------
-- | Tell whether the ngrams @n@ belongs to the vector of foundation roots @ns@.
isRoots :: Ngrams -> Vector Ngrams -> Bool
isRoots n ns = n `Vector.elem` ns
-- | To transform a list of ngrams into the list of their indexes in the
-- foundations vector @fdt@.
--
-- The lookup result is unwrapped with 'fromJust', so an ngrams that is absent
-- from @fdt@ makes the evaluation of its index fail.
ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
ngramsToIdx ns fdt = [fromJust (elemIndex n fdt) | n <- ns]
-- | To build a label out of a list of ngrams indexes: every index is resolved
-- to its text with 'ngramsToText', a @"|"@ is put in front of each text, the
-- leading element is removed through @tail'@ and the rest is joined by
-- 'Text.unwords'.
ngramsToLabel :: Vector Ngrams -> [Int] -> Text
ngramsToLabel ngrams l =
  Text.unwords . tail' "ngramsToLabel" $ concat [["|", n] | n <- ngramsToText ngrams l]
-- | To build a printable label out of a list of indexes: each index is shown
-- and preceded by @"|"@, the leading element is removed through @tail'@ and
-- the rest is joined by 'List.unwords'.
idxToLabel :: [Int] -> String
idxToLabel l =
  List.unwords . tail' "idxToLabel" $ concat [["|", show n] | n <- l]
-- | Same as 'idxToLabel' but for a list of 'Double': each value is shown and
-- preceded by @"|"@, the leading element is removed through @tail'@ and the
-- rest is joined by 'List.unwords'.
--
-- NOTE(review): the first argument of @tail'@ looks like a caller tag used for
-- diagnostics (every call site in this module passes a function name); it now
-- names this function instead of the copy-pasted @"idxToLabel"@ — confirm
-- against the definition of @tail'@.
idxToLabel' :: [Double] -> String
idxToLabel' l =
  List.unwords . tail' "idxToLabel'" $ concat [["|", show n] | n <- l]
-- | To resolve a list of ngrams indexes into the corresponding texts, by
-- indexing the @ngrams@ vector with 'Vector.!' (an index outside the vector
-- is therefore an error).
ngramsToText :: Vector Ngrams -> [Int] -> [Text]
ngramsToText ngrams l = [ngrams Vector.! idx | idx <- l]
-- | To expand a list of periods into the set of all the dates they cover
-- (both bounds of each period included).
periodsToYears :: [(Date,Date)] -> Set Date
periodsToYears periods = Set.fromList (sort years)
  where
    -- every period (d,d') contributes the whole range [d..d']
    years = concat [[d .. d'] | (d, d') <- periods]
143 findBounds :: [Date] -> (Date,Date)
145 let dates' = sort dates
146 in (head' "findBounds" dates', last' "findBounds" dates')
-- | To cut the range of dates spanned by @dates@ (as given by 'findBounds')
-- into periods: the range is chunked with @chunkAlong p s@ and each chunk is
-- reduced to its (first, last) date.
toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
toPeriods dates p s =
  [ (head' "toPeriods" chunk, last' "toPeriods" chunk)
  | chunk <- chunkAlong p s [start .. end]
  ]
  where
    (start, end) = findBounds dates
-- | Get a regular and ascending time scale from a given list of dates: it
-- starts at the lower bound returned by 'findBounds' and progresses by @step@
-- up to the upper bound.
toTimeScale :: [Date] -> Int -> [Date]
toTimeScale dates step = [start, next .. end]
  where
    (start, end) = findBounds dates
    next         = start + step
163 getTimeStep :: TimeUnit -> Int
164 getTimeStep time = case time of
167 getTimePeriod :: TimeUnit -> Int
168 getTimePeriod time = case time of
171 getTimeFrame :: TimeUnit -> Int
172 getTimeFrame time = case time of
180 -- | To find if l' is nested in l
181 isNested :: Eq a => [a] -> [a] -> Bool
184 | length l' > length l = False
185 | (union l l') == l = True
189 -- | To filter out Fis with a small Support while keeping non-empty Periods
190 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
191 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
192 then keepFilled f (thr - 1) l
196 traceClique :: Map (Date, Date) [PhyloClique] -> String
197 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
199 --------------------------------------
201 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
202 --------------------------------------
205 traceSupport :: Map (Date, Date) [PhyloClique] -> String
206 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
208 --------------------------------------
210 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
211 --------------------------------------
-- | Debugging helper: emit (through 'trace') a short report about the cliques
-- of @mFis@ and return @mFis@ untouched. The report shows @msg@, the total
-- number of cliques over all the periods, and the summaries produced by
-- 'traceSupport' and 'traceClique'.
traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
traceFis msg mFis = trace report mFis
  where
    report = "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
          <> "Support : " <> (traceSupport mFis) <> "\n"
          <> "Nb Ngrams : " <> (traceClique mFis) <> "\n"
225 getCliqueSupport :: Clique -> Int
226 getCliqueSupport unit = case unit of
230 getCliqueSize :: Clique -> Int
231 getCliqueSize unit = case unit of
-- | All the pairs @(x,y)@ where @x@ occurs strictly before @y@ in the list,
-- enumerated in the order of the list.
listToCombi' :: [a] -> [(a,a)]
listToCombi' []         = []
listToCombi' (x : rest) = [(x, y) | y <- rest] ++ listToCombi' rest
-- | All the pairs @(x,y)@ of elements of the list such that @x == y@.
-- A value occurring several times yields one pair per couple of occurrences.
listToEqual' :: Eq a => [a] -> [(a,a)]
listToEqual' l = [(x, y) | x <- l, y <- filter (x ==) l]
-- | The keys of a co-occurrence matrix built on @lst@: first the pairs of
-- 'listToCombi'', then the "diagonal" pairs of 'listToEqual''.
listToKeys :: Eq a => [a] -> [(a,a)]
listToKeys lst = concat [listToCombi' lst, listToEqual' lst]
-- | Build a matrix whose keys are the 'listToKeys' of the sorted input and
-- whose values are all set to 1.
listToMatrix :: [Int] -> Map (Int,Int) Double
listToMatrix lst = fromList [(k, 1) | k <- listToKeys (sort lst)]
-- | Same as 'listToMatrix' but keyed by pairs of 'Ngrams' and with 'Int'
-- values: every key of 'listToKeys' (computed on the sorted input) maps to 1.
listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
listToMatrix' lst = fromList [(k, 1) | k <- listToKeys (sort lst)]
-- | Enumerate the pairs @(x,y)@ with @x@ occurring before @y@ in the list, then
-- keep only the first pair met for each distinct first component.
listToSeq :: Eq a => [a] -> [(a,a)]
listToSeq l = nubBy sameFirst pairs
  where
    pairs = [(x, y) | (x : rest) <- tails l, y <- rest]
    sameFirst (a, _) (b, _) = a == b
-- | Merge two co-occurrence matrices; the values of the keys present in both
-- are added.
sumCooc :: Cooc -> Cooc -> Cooc
sumCooc = unionWith (+)
-- | Sum of the values stored on the diagonal of the matrix, i.e. under the
-- keys whose two components are equal.
getTrace :: Cooc -> Double
getTrace cooc = sum [v | ((k, k'), v) <- Map.toList cooc, k == k']
-- | Keep only the diagonal of the matrix, i.e. the entries whose key has two
-- equal components.
coocToDiago :: Cooc -> Cooc
coocToDiago cooc = filterWithKey onDiagonal cooc
  where
    onDiagonal (k, k') _ = k == k'
-- | To build the local cooc matrix of each phylogroup.
--
-- All the matrices of @coocs@ are summed (values of shared keys are added),
-- then the result is restricted to the keys that 'listToKeys' derives from
-- @ngrams@.
ngramsToCooc :: [Int] -> [Cooc] -> Cooc
ngramsToCooc ngrams coocs = restrictKeys summed wanted
  where
    -- strict left-to-right union, same combination order as folding 'sumCooc'
    summed = Map.unionsWith (+) coocs
    -- a Set gives a logarithmic membership test instead of scanning the
    -- whole list of pairs for every key of the summed matrix
    wanted = Set.fromList (listToKeys ngrams)
-- | The identifier of a group: its (period, level) pair together with its
-- index.
getGroupId :: PhyloGroup -> PhyloGroupId
getGroupId g = ((period, level), index)
  where
    period = view phylo_groupPeriod g
    level  = view phylo_groupLevel g
    index  = view phylo_groupIndex g
-- | Extract the period from a group identifier (the first component of its
-- first component).
idToPrd :: PhyloGroupId -> PhyloPeriodId
idToPrd ((prd, _), _) = prd
-- | Index a list of groups by the value that @toField@ computes for each of
-- them; groups sharing the same value end up in the same list.
groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
groupByField toField groups = fromListWith (++) [(toField g, [g]) | g <- groups]
288 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
289 getPeriodPointers fil g =
291 ToChilds -> g ^. phylo_groupPeriodChilds
292 ToParents -> g ^. phylo_groupPeriodParents
294 filterProximity :: Proximity -> Double -> Double -> Bool
295 filterProximity proximity thr local =
297 WeightedLogJaccard _ -> local >= thr
300 getProximityName :: Proximity -> String
301 getProximityName proximity =
303 WeightedLogJaccard _ -> "WLJaccard"
310 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
311 addPointers fil pty pointers g =
313 TemporalPointer -> case fil of
314 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
315 ToParents -> g & phylo_groupPeriodParents .~ pointers
316 LevelPointer -> case fil of
317 ToChilds -> g & phylo_groupLevelChilds .~ pointers
318 ToParents -> g & phylo_groupLevelParents .~ pointers
321 getPeriodIds :: Phylo -> [(Date,Date)]
322 getPeriodIds phylo = sortOn fst
324 $ phylo ^. phylo_periods
-- | The identifier carried by the first level-parent pointer of the group
-- (the pointer list is read through @head'@).
getLevelParentId :: PhyloGroup -> PhyloGroupId
getLevelParentId g = fst (head' "getLevelParentId" parents)
  where
    parents = g ^. phylo_groupLevelParents
-- | The last of the levels returned by 'getLevels' for this phylo.
getLastLevel :: Phylo -> Level
getLastLevel = last' "lastLevel" . getLevels
332 getLevels :: Phylo -> [Level]
333 getLevels phylo = nub
335 $ keys $ view ( phylo_periods
337 . phylo_periodLevels ) phylo
-- | The @seaElevation@ setting read from the configuration of the phylo.
getSeaElevation :: Phylo -> SeaElevation
getSeaElevation = seaElevation . getConfig
-- | The configuration stored in the parameters of the phylo.
getConfig :: Phylo -> Config
getConfig phylo = phylo ^. phylo_param . phyloParam_config
-- | The vector of roots stored in the foundations of the phylo.
getRoots :: Phylo -> Vector Ngrams
getRoots phylo = phylo ^. phylo_foundations . foundations_roots
350 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
351 phyloToLastBranches phylo = elems
353 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
354 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
356 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
357 getGroupsFromLevel lvl phylo =
358 elems $ view ( phylo_periods
362 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
363 . phylo_levelGroups ) phylo
366 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
367 getGroupsFromLevelPeriods lvl periods phylo =
368 elems $ view ( phylo_periods
370 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
373 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
374 . phylo_levelGroups ) phylo
377 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
378 getGroupsFromPeriods lvl periods =
379 elems $ view ( traverse
382 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
383 . phylo_levelGroups ) periods
386 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
387 updatePhyloGroups lvl m phylo =
392 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
396 let id = getGroupId g
-- | Debugging helper: emit (through 'trace') the number of groups and of
-- distinct branch ids found at level @lvl@, then return the phylo untouched.
traceToPhylo :: Level -> Phylo -> Phylo
traceToPhylo lvl phylo = trace message phylo
  where
    groups   = getGroupsFromLevel lvl phylo
    branches = nub $ map _phylo_groupBranchId groups
    message  = "\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
            <> show (length groups) <> " groups and "
            <> show (length branches) <> " branches" <> "\n"
-- | Choose one branch id among several: the candidates kept by 'mostFreq''
-- are sorted and the smallest one is returned.
mergeBranchIds :: [[Int]] -> [Int]
mergeBranchIds ids = head' "mergeBranchIds" $ sort $ mostFreq' ids
416 -- | 2) find the most Up Left ids in the hierarchy of similarity
417 -- mostUpLeft :: [[Int]] -> [[Int]]
419 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
420 -- inf = (fst . minimum) groupIds
421 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
422 -- | 1) find the most frequent ids
423 mostFreq' :: [[Int]] -> [[Int]]
425 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
426 sup = (fst . maximum) groupIds
427 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
-- | Build the meta map of a merged branch: the @"breaks"@ and @"seaLevels"@
-- entries are copied from the first group of @groups@ whose branch id (second
-- component of its 'phylo_groupBranchId') equals @bId@.
mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
mergeMeta bId groups = fromList [(key, meta ! key) | key <- ["breaks", "seaLevels"]]
  where
    ego  = head' "mergeMeta" $ filter (\g -> snd (g ^. phylo_groupBranchId) == bId) groups
    meta = ego ^. phylo_groupMeta
436 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
437 groupsToBranches groups =
438 {- run the related component algorithm -}
439 let egos = map (\g -> [getGroupId g]
440 ++ (map fst $ g ^. phylo_groupPeriodParents)
441 ++ (map fst $ g ^. phylo_groupPeriodChilds)
442 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
443 graph = relatedComponents egos
444 {- update each group's branch id -}
446 let groups' = elems $ restrictKeys groups (Set.fromList ids)
447 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
448 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
450 relatedComponents :: Ord a => [[a]] -> [[a]]
451 relatedComponents graph = foldl' (\acc groups ->
455 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
456 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
-- | Cluster the groups of @nodes@ with 'relatedComponents', using the pairs of
-- @edges@ as links (the weights are ignored). Every node also enters the
-- computation as a singleton. The identifiers of each cluster are finally
-- mapped back to their groups.
toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
toRelatedComponents nodes edges = [[ref ! gId | gId <- cluster] | cluster <- clusters]
  where
    ref      = fromList [(getGroupId g, g) | g <- nodes]
    edgeIds  = [[getGroupId g, getGroupId g'] | ((g, g'), _) <- edges]
    nodeIds  = [[getGroupId g] | g <- nodes]
    clusters = relatedComponents (edgeIds ++ nodeIds)
465 traceSynchronyEnd :: Phylo -> Phylo
466 traceSynchronyEnd phylo =
467 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
468 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
469 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
472 traceSynchronyStart :: Phylo -> Phylo
473 traceSynchronyStart phylo =
474 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
475 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
476 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
484 getSensibility :: Proximity -> Double
485 getSensibility proxi = case proxi of
486 WeightedLogJaccard s -> s
493 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
494 intersectInit acc lst lst' =
495 if (null lst) || (null lst')
497 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
498 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
-- | A threshold derived from two branch ids: @thrInit@ plus one @thrStep@ per
-- element returned by @intersectInit []@ on the second components of the ids.
branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * fromIntegral shared
  where
    shared = length $ intersectInit [] (snd id) (snd id')
-- | The distinct ngrams indexes carried by the groups of all the branches, in
-- order of first appearance.
ngramsInBranches :: [[PhyloGroup]] -> [Int]
ngramsInBranches branches =
  nub [n | branch <- branches, g <- branch, n <- g ^. phylo_groupNgrams]
-- | Debugging helper: emit (through 'trace') a report about a successful split
-- and return @nextBranches@ untouched. The report is built from the branch id
-- of the very first group (minus its last two elements), the number of entries
-- of @nextBranches@, the total number of groups, the size of each entry, the
-- threshold and the two quality scores.
traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
traceMatchSuccess thr qua qua' nextBranches = trace message nextBranches
  where
    firstGroup = head' "trace" $ head' "trace" $ head' "trace" nextBranches
    parentPath = (init . init . snd) (firstGroup ^. phylo_groupBranchId)
    message =
         "\n" <> "-- local branches : " <> (init $ show parentPath)
      <> ",(1.." <> show (length nextBranches) <> ")]"
      <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
      <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
      <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n"
518 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
519 traceMatchFailure thr qua qua' branches =
520 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
521 <> ",(1.." <> show (length branches) <> ")]"
522 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
523 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
527 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
528 traceMatchNoSplit branches =
529 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
530 <> ",(1.." <> show (length branches) <> ")]"
531 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
532 <> " - unable to split in smaller branches" <> "\n"
536 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
537 traceMatchLimit branches =
538 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
539 <> ",(1.." <> show (length branches) <> ")]"
540 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
541 <> " - unable to increase the threshold above 1" <> "\n"
-- | Debugging helper: emit (through 'trace') the number of distinct branch ids
-- and the number of groups, then return the groups untouched.
traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
traceMatchEnd groups = trace message groups
  where
    nbBranches = length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups
    message    = "\n" <> "-- | End temporal matching with " <> show nbBranches
              <> " branches and " <> show (length groups) <> " groups" <> "\n"
-- | Debugging helper: emit (through 'trace') the number of groups about to be
-- matched, then return them untouched.
traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
traceTemporalMatching groups = trace message groups
  where
    message = "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n"
556 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
558 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m