2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
15 import Control.Lens hiding (Level)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group, notElem)
17 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
18 import Data.Set (Set, disjoint)
19 import Data.String (String)
20 import Data.Text (Text,unpack)
21 import Data.Vector (Vector, elemIndex)
22 import Debug.Trace (trace)
23 import Gargantext.Core.Viz.Phylo
24 import Gargantext.Prelude
25 import Prelude (floor,read)
27 import qualified Data.List as List
28 import qualified Data.Map as Map
29 import qualified Data.Set as Set
30 import qualified Data.Text as Text
31 import qualified Data.Vector as Vector
37 -- | To print an important message as an IO()
38 printIOMsg :: String -> IO ()
43 <> "-- | " <> msg <> "\n" )
46 -- | To print a comment as an IO()
47 printIOComment :: String -> IO ()
49 putStrLn ( "\n" <> cmt <> "\n" )
56 -- truncate' :: Double -> Int -> Double
57 -- truncate' x n = (fromIntegral (floor (x * t))) / t
60 truncate' :: Double -> Int -> Double
61 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
67 getInMap :: Int -> Map Int Double -> Double
73 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
74 roundToStr = printf "%0.*f"
77 countSup :: Double -> [Double] -> Int
78 countSup s l = length $ filter (>s) l
81 dropByIdx :: Int -> [a] -> [a]
82 dropByIdx k l = take k l ++ drop (k+1) l
85 elemIndex' :: Eq a => a -> [a] -> Int
86 elemIndex' e l = case (List.elemIndex e l) of
87 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
91 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
92 commonPrefix lst lst' acc =
93 if (null lst || null lst')
95 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
96 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
100 ---------------------
101 -- | Foundations | --
102 ---------------------
105 -- | Is this Ngrams a Foundations Root ?
106 isRoots :: Ngrams -> Vector Ngrams -> Bool
107 isRoots n ns = Vector.elem n ns
109 -- | To transform a list of nrams into a list of foundation's index
110 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
111 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
113 -- | To transform a list of sources into a list of sources' index
114 sourcesToIdx :: [Text] -> Vector Text -> [Int]
115 sourcesToIdx ss ps = nub $ map (\s -> fromJust $ elemIndex s ps) ss
117 -- | To transform a list of Ngrams Indexes into a Label
118 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
119 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
121 idxToLabel :: [Int] -> String
122 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
124 idxToLabel' :: [Double] -> String
125 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
127 -- | To transform a list of Ngrams Indexes into a list of Text
128 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
129 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
136 -- | To transform a list of periods into a set of Dates
137 periodsToYears :: [(Date,Date)] -> Set Date
138 periodsToYears periods = (Set.fromList . sort . concat)
139 $ map (\(d,d') -> [d..d']) periods
142 findBounds :: [Date] -> (Date,Date)
144 let dates' = sort dates
145 in (head' "findBounds" dates', last' "findBounds" dates')
148 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
149 toPeriods dates p s =
150 let (start,end) = findBounds dates
151 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
152 $ chunkAlong p s [start .. end]
155 toFstDate :: [Text] -> Text
160 let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
163 toLstDate :: [Text] -> Text
169 let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
173 getTimeScale :: Phylo -> [Char]
174 getTimeScale p = case (timeUnit $ getConfig p) of
175 Epoch _ _ _ -> "epoch"
177 Month _ _ _ -> "month"
182 -- | Get a regular & ascendante timeScale from a given list of dates
183 toTimeScale :: [Date] -> Int -> [Date]
184 toTimeScale dates step =
185 let (start,end) = findBounds dates
186 in [start, (start + step) .. end]
189 getTimeStep :: TimeUnit -> Int
190 getTimeStep time = case time of
197 getTimePeriod :: TimeUnit -> Int
198 getTimePeriod time = case time of
205 getTimeFrame :: TimeUnit -> Int
206 getTimeFrame time = case time of
218 -- | To find if l' is nested in l
219 isNested :: Eq a => [a] -> [a] -> Bool
222 | length l' > length l = False
223 | (union l l') == l = True
227 -- | To filter Fis with small Support but by keeping non empty Periods
228 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
229 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
230 then keepFilled f (thr - 1) l
234 traceClique :: Map (Date, Date) [PhyloClique] -> String
235 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
237 --------------------------------------
239 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
240 --------------------------------------
243 traceSupport :: Map (Date, Date) [PhyloClique] -> String
244 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
246 --------------------------------------
248 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
249 --------------------------------------
252 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
253 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
254 <> "Support : " <> (traceSupport mFis) <> "\n"
255 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
263 getCliqueSupport :: Clique -> Int
264 getCliqueSupport unit = case unit of
268 getCliqueSize :: Clique -> Int
269 getCliqueSize unit = case unit of
278 listToCombi' :: [a] -> [(a,a)]
279 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
281 listToEqual' :: Eq a => [a] -> [(a,a)]
282 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
284 listToKeys :: Eq a => [a] -> [(a,a)]
285 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
287 listToMatrix :: [Int] -> Map (Int,Int) Double
288 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
290 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
291 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
293 listToSeq :: Eq a => [a] -> [(a,a)]
294 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
296 sumCooc :: Cooc -> Cooc -> Cooc
297 sumCooc cooc cooc' = unionWith (+) cooc cooc'
299 getTrace :: Cooc -> Double
300 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
302 coocToDiago :: Cooc -> Cooc
303 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
305 -- | To build the local cooc matrix of each phylogroup
306 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
307 ngramsToCooc ngrams coocs =
308 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
309 pairs = listToKeys ngrams
310 in filterWithKey (\k _ -> elem k pairs) cooc
317 getGroupId :: PhyloGroup -> PhyloGroupId
318 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
320 idToPrd :: PhyloGroupId -> PhyloPeriodId
321 idToPrd id = (fst . fst) id
323 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
324 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
326 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
327 getPeriodPointers fil g =
329 ToChilds -> g ^. phylo_groupPeriodChilds
330 ToParents -> g ^. phylo_groupPeriodParents
331 ToChildsMemory -> undefined
332 ToParentsMemory -> undefined
334 filterProximity :: Proximity -> Double -> Double -> Bool
335 filterProximity proximity thr local =
337 WeightedLogJaccard _ -> local >= thr
338 WeightedLogSim _ -> local >= thr
339 Hamming _ -> undefined
341 getProximityName :: Proximity -> String
342 getProximityName proximity =
344 WeightedLogJaccard _ -> "WLJaccard"
345 WeightedLogSim _ -> "WeightedLogSim"
346 Hamming _ -> "Hamming"
352 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
353 addPointers fil pty pointers g =
355 TemporalPointer -> case fil of
356 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
357 ToParents -> g & phylo_groupPeriodParents .~ pointers
358 ToChildsMemory -> undefined
359 ToParentsMemory -> undefined
360 LevelPointer -> case fil of
361 ToChilds -> g & phylo_groupLevelChilds .~ pointers
362 ToParents -> g & phylo_groupLevelParents .~ pointers
363 ToChildsMemory -> undefined
364 ToParentsMemory -> undefined
366 toPointer' :: Double -> Pointer -> Pointer'
367 toPointer' thr pt = (fst pt,(thr,snd pt))
370 addMemoryPointers :: Filiation -> PointerType -> Double -> [Pointer] -> PhyloGroup -> PhyloGroup
371 addMemoryPointers fil pty thr pointers g =
373 TemporalPointer -> case fil of
374 ToChilds -> undefined
375 ToParents -> undefined
376 ToChildsMemory -> g & phylo_groupPeriodMemoryChilds .~ (concat [(g ^. phylo_groupPeriodMemoryChilds),(map (\pt -> toPointer' thr pt) pointers)])
377 ToParentsMemory -> g & phylo_groupPeriodMemoryParents .~ (concat [(g ^. phylo_groupPeriodMemoryParents),(map (\pt -> toPointer' thr pt) pointers)])
378 LevelPointer -> undefined
381 getPeriodIds :: Phylo -> [(Date,Date)]
382 getPeriodIds phylo = sortOn fst
384 $ phylo ^. phylo_periods
386 getLevelParentId :: PhyloGroup -> PhyloGroupId
387 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
389 getLastLevel :: Phylo -> Level
390 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
392 getLevels :: Phylo -> [Level]
393 getLevels phylo = nub
395 $ keys $ view ( phylo_periods
397 . phylo_periodLevels ) phylo
399 getSeaElevation :: Phylo -> SeaElevation
400 getSeaElevation phylo = seaElevation (getConfig phylo)
403 getConfig :: Phylo -> Config
404 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
407 setConfig :: Config -> Phylo -> Phylo
408 setConfig config phylo = phylo
409 & phylo_param .~ (PhyloParam
410 ((phylo ^. phylo_param) ^. phyloParam_version)
411 ((phylo ^. phylo_param) ^. phyloParam_software)
414 -- & phylo_param & phyloParam_config & phyloParam_config .~ config
417 getRoots :: Phylo -> Vector Ngrams
418 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
420 getSources :: Phylo -> Vector Text
421 getSources phylo = _sources (phylo ^. phylo_sources)
423 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
424 phyloToLastBranches phylo = elems
426 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
427 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
429 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
430 getGroupsFromLevel lvl phylo =
431 elems $ view ( phylo_periods
435 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
436 . phylo_levelGroups ) phylo
439 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
440 getGroupsFromLevelPeriods lvl periods phylo =
441 elems $ view ( phylo_periods
443 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
446 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
447 . phylo_levelGroups ) phylo
450 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
451 getGroupsFromPeriods lvl periods =
452 elems $ view ( traverse
455 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
456 . phylo_levelGroups ) periods
459 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
460 updatePhyloGroups lvl m phylo =
465 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
469 let id = getGroupId g
475 updatePeriods :: Map (Date,Date) (Text,Text) -> Phylo -> Phylo
476 updatePeriods periods' phylo =
477 over (phylo_periods . traverse)
479 let prd' = periods' ! (prd ^. phylo_periodPeriod)
480 lvls = map (\lvl -> lvl & phylo_levelPeriod' .~ prd') $ prd ^. phylo_periodLevels
481 in prd & phylo_periodPeriod' .~ prd'
482 & phylo_periodLevels .~ lvls
486 traceToPhylo :: Level -> Phylo -> Phylo
487 traceToPhylo lvl phylo =
488 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
489 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
490 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
496 mergeBranchIds :: [[Int]] -> [Int]
497 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
499 -- | 2) find the most Up Left ids in the hierarchy of similarity
500 -- mostUpLeft :: [[Int]] -> [[Int]]
502 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
503 -- inf = (fst . minimum) groupIds
504 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
505 -- | 1) find the most frequent ids
506 mostFreq' :: [[Int]] -> [[Int]]
508 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
509 sup = (fst . maximum) groupIds
510 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
513 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
514 mergeMeta bId groups =
515 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
516 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
519 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
520 groupsToBranches groups =
521 {- run the related component algorithm -}
522 let egos = map (\g -> [getGroupId g]
523 ++ (map fst $ g ^. phylo_groupPeriodParents)
524 ++ (map fst $ g ^. phylo_groupPeriodChilds)
525 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
526 graph = relatedComponents egos
527 {- update each group's branch id -}
529 let groups' = elems $ restrictKeys groups (Set.fromList ids)
530 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
531 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
533 relatedComponents :: Ord a => [[a]] -> [[a]]
534 relatedComponents graph = foldl' (\acc groups ->
538 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
539 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
541 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
542 toRelatedComponents nodes edges =
543 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
544 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
545 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
548 traceSynchronyEnd :: Phylo -> Phylo
549 traceSynchronyEnd phylo =
550 trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
551 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
552 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
555 traceSynchronyStart :: Phylo -> Phylo
556 traceSynchronyStart phylo =
557 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
558 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
559 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
567 getSensibility :: Proximity -> Double
568 getSensibility proxi = case proxi of
569 WeightedLogJaccard s -> s
570 WeightedLogSim s -> s
571 Hamming _ -> undefined
577 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
578 intersectInit acc lst lst' =
579 if (null lst) || (null lst')
581 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
582 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
585 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
586 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
588 ngramsInBranches :: [[PhyloGroup]] -> [Int]
589 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
592 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
593 traceMatchSuccess thr qua qua' nextBranches =
594 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
595 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
596 <> ",(1.." <> show (length nextBranches) <> ")]"
597 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
598 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
599 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
602 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
603 traceMatchFailure thr qua qua' branches =
604 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
605 <> ",(1.." <> show (length branches) <> ")]"
606 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
607 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
611 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
612 traceMatchNoSplit branches =
613 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
614 <> ",(1.." <> show (length branches) <> ")]"
615 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
616 <> " - unable to split in smaller branches" <> "\n"
620 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
621 traceMatchLimit branches =
622 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
623 <> ",(1.." <> show (length branches) <> ")]"
624 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
625 <> " - unable to increase the threshold above 1" <> "\n"
629 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
630 traceMatchEnd groups =
631 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
632 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
635 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
636 traceTemporalMatching groups =
637 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
640 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
642 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m