2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group)
17 import Data.Set (Set, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
19 import Data.String (String)
20 import Data.Text (Text)
22 import Prelude (floor)
24 import Gargantext.Prelude
25 import Gargantext.Core.Viz.AdaptativePhylo
29 import Debug.Trace (trace)
30 import Control.Lens hiding (Level)
32 import qualified Data.Vector as Vector
33 import qualified Data.List as List
34 import qualified Data.Set as Set
35 import qualified Data.Map as Map
36 import qualified Data.Text as Text
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
48 <> "-- | " <> msg <> "\n" )
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
54 putStrLn ( "\n" <> cmt <> "\n" )
61 -- truncate' :: Double -> Int -> Double
62 -- truncate' x n = (fromIntegral (floor (x * t))) / t
65 truncate' :: Double -> Int -> Double
66 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
72 getInMap :: Int -> Map Int Double -> Double
78 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
79 roundToStr = printf "%0.*f"
82 countSup :: Double -> [Double] -> Int
83 countSup s l = length $ filter (>s) l
86 dropByIdx :: Int -> [a] -> [a]
87 dropByIdx k l = take k l ++ drop (k+1) l
90 elemIndex' :: Eq a => a -> [a] -> Int
91 elemIndex' e l = case (List.elemIndex e l) of
92 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
96 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
97 commonPrefix lst lst' acc =
98 if (null lst || null lst')
100 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
101 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
105 ---------------------
106 -- | Foundations | --
107 ---------------------
110 -- | Is this Ngrams a Foundations Root ?
111 isRoots :: Ngrams -> Vector Ngrams -> Bool
112 isRoots n ns = Vector.elem n ns
114 -- | To transform a list of nrams into a list of foundation's index
115 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
116 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
118 -- | To transform a list of Ngrams Indexes into a Label
119 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
120 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
122 idxToLabel :: [Int] -> String
123 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
125 idxToLabel' :: [Double] -> String
126 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
128 -- | To transform a list of Ngrams Indexes into a list of Text
129 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
130 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
137 -- | To transform a list of periods into a set of Dates
138 periodsToYears :: [(Date,Date)] -> Set Date
139 periodsToYears periods = (Set.fromList . sort . concat)
140 $ map (\(d,d') -> [d..d']) periods
143 findBounds :: [Date] -> (Date,Date)
145 let dates' = sort dates
146 in (head' "findBounds" dates', last' "findBounds" dates')
149 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
150 toPeriods dates p s =
151 let (start,end) = findBounds dates
152 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
153 $ chunkAlong p s [start .. end]
156 -- | Get a regular & ascendante timeScale from a given list of dates
157 toTimeScale :: [Date] -> Int -> [Date]
158 toTimeScale dates step =
159 let (start,end) = findBounds dates
160 in [start, (start + step) .. end]
163 getTimeStep :: TimeUnit -> Int
164 getTimeStep time = case time of
167 getTimePeriod :: TimeUnit -> Int
168 getTimePeriod time = case time of
171 getTimeFrame :: TimeUnit -> Int
172 getTimeFrame time = case time of
180 -- | To find if l' is nested in l
181 isNested :: Eq a => [a] -> [a] -> Bool
184 | length l' > length l = False
185 | (union l l') == l = True
189 -- | To filter Fis with small Support but by keeping non empty Periods
190 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
191 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
192 then keepFilled f (thr - 1) l
196 traceClique :: Map (Date, Date) [PhyloClique] -> String
197 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
199 --------------------------------------
201 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
202 --------------------------------------
205 traceSupport :: Map (Date, Date) [PhyloClique] -> String
206 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
208 --------------------------------------
210 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
211 --------------------------------------
214 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
215 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
216 <> "Support : " <> (traceSupport mFis) <> "\n"
217 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
225 getCliqueSupport :: Clique -> Int
226 getCliqueSupport unit = case unit of
230 getCliqueSize :: Clique -> Int
231 getCliqueSize unit = case unit of
240 listToCombi' :: [a] -> [(a,a)]
241 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
243 listToEqual' :: Eq a => [a] -> [(a,a)]
244 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
246 listToKeys :: Eq a => [a] -> [(a,a)]
247 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
249 listToMatrix :: [Int] -> Map (Int,Int) Double
250 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
252 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
253 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
255 listToSeq :: Eq a => [a] -> [(a,a)]
256 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
258 sumCooc :: Cooc -> Cooc -> Cooc
259 sumCooc cooc cooc' = unionWith (+) cooc cooc'
261 getTrace :: Cooc -> Double
262 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
264 coocToDiago :: Cooc -> Cooc
265 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
267 -- | To build the local cooc matrix of each phylogroup
268 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
269 ngramsToCooc ngrams coocs =
270 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
271 pairs = listToKeys ngrams
272 in filterWithKey (\k _ -> elem k pairs) cooc
279 getGroupId :: PhyloGroup -> PhyloGroupId
280 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
282 idToPrd :: PhyloGroupId -> PhyloPeriodId
283 idToPrd id = (fst . fst) id
285 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
286 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
288 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
289 getPeriodPointers fil g =
291 ToChilds -> g ^. phylo_groupPeriodChilds
292 ToParents -> g ^. phylo_groupPeriodParents
294 filterProximity :: Proximity -> Double -> Double -> Bool
295 filterProximity proximity thr local =
297 WeightedLogJaccard _ -> local >= thr
298 WeightedLogSim _ -> local >= thr
301 getProximityName :: Proximity -> String
302 getProximityName proximity =
304 WeightedLogJaccard _ -> "WLJaccard"
305 WeightedLogSim _ -> "WeightedLogSim"
312 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
313 addPointers fil pty pointers g =
315 TemporalPointer -> case fil of
316 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
317 ToParents -> g & phylo_groupPeriodParents .~ pointers
318 LevelPointer -> case fil of
319 ToChilds -> g & phylo_groupLevelChilds .~ pointers
320 ToParents -> g & phylo_groupLevelParents .~ pointers
323 getPeriodIds :: Phylo -> [(Date,Date)]
324 getPeriodIds phylo = sortOn fst
326 $ phylo ^. phylo_periods
328 getLevelParentId :: PhyloGroup -> PhyloGroupId
329 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
331 getLastLevel :: Phylo -> Level
332 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
334 getLevels :: Phylo -> [Level]
335 getLevels phylo = nub
337 $ keys $ view ( phylo_periods
339 . phylo_periodLevels ) phylo
341 getSeaElevation :: Phylo -> SeaElevation
342 getSeaElevation phylo = seaElevation (getConfig phylo)
345 getConfig :: Phylo -> Config
346 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
349 setConfig :: Config -> Phylo -> Phylo
350 setConfig config phylo = phylo
351 & phylo_param .~ (PhyloParam
352 ((phylo ^. phylo_param) ^. phyloParam_version)
353 ((phylo ^. phylo_param) ^. phyloParam_software)
356 -- & phylo_param & phyloParam_config & phyloParam_config .~ config
359 getRoots :: Phylo -> Vector Ngrams
360 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
362 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
363 phyloToLastBranches phylo = elems
365 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
366 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
368 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
369 getGroupsFromLevel lvl phylo =
370 elems $ view ( phylo_periods
374 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
375 . phylo_levelGroups ) phylo
378 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
379 getGroupsFromLevelPeriods lvl periods phylo =
380 elems $ view ( phylo_periods
382 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
385 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
386 . phylo_levelGroups ) phylo
389 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
390 getGroupsFromPeriods lvl periods =
391 elems $ view ( traverse
394 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
395 . phylo_levelGroups ) periods
398 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
399 updatePhyloGroups lvl m phylo =
404 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
408 let id = getGroupId g
415 traceToPhylo :: Level -> Phylo -> Phylo
416 traceToPhylo lvl phylo =
417 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
418 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
419 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
425 mergeBranchIds :: [[Int]] -> [Int]
426 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
428 -- | 2) find the most Up Left ids in the hierarchy of similarity
429 -- mostUpLeft :: [[Int]] -> [[Int]]
431 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
432 -- inf = (fst . minimum) groupIds
433 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
434 -- | 1) find the most frequent ids
435 mostFreq' :: [[Int]] -> [[Int]]
437 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
438 sup = (fst . maximum) groupIds
439 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
442 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
443 mergeMeta bId groups =
444 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
445 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
448 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
449 groupsToBranches groups =
450 {- run the related component algorithm -}
451 let egos = map (\g -> [getGroupId g]
452 ++ (map fst $ g ^. phylo_groupPeriodParents)
453 ++ (map fst $ g ^. phylo_groupPeriodChilds)
454 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
455 graph = relatedComponents egos
456 {- update each group's branch id -}
458 let groups' = elems $ restrictKeys groups (Set.fromList ids)
459 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
460 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
462 relatedComponents :: Ord a => [[a]] -> [[a]]
463 relatedComponents graph = foldl' (\acc groups ->
467 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
468 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
470 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
471 toRelatedComponents nodes edges =
472 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
473 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
474 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
477 traceSynchronyEnd :: Phylo -> Phylo
478 traceSynchronyEnd phylo =
479 trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
480 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
481 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
484 traceSynchronyStart :: Phylo -> Phylo
485 traceSynchronyStart phylo =
486 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
487 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
488 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
496 getSensibility :: Proximity -> Double
497 getSensibility proxi = case proxi of
498 WeightedLogJaccard s -> s
499 WeightedLogSim s -> s
506 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
507 intersectInit acc lst lst' =
508 if (null lst) || (null lst')
510 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
511 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
514 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
515 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
517 ngramsInBranches :: [[PhyloGroup]] -> [Int]
518 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
521 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
522 traceMatchSuccess thr qua qua' nextBranches =
523 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
524 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
525 <> ",(1.." <> show (length nextBranches) <> ")]"
526 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
527 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
528 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
531 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
532 traceMatchFailure thr qua qua' branches =
533 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
534 <> ",(1.." <> show (length branches) <> ")]"
535 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
536 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
540 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
541 traceMatchNoSplit branches =
542 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
543 <> ",(1.." <> show (length branches) <> ")]"
544 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
545 <> " - unable to split in smaller branches" <> "\n"
549 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
550 traceMatchLimit branches =
551 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
552 <> ",(1.." <> show (length branches) <> ")]"
553 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
554 <> " - unable to increase the threshold above 1" <> "\n"
558 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
559 traceMatchEnd groups =
560 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
561 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
564 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
565 traceTemporalMatching groups =
566 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
569 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
571 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m