2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.PhyloTools where
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
48 <> "-- | " <> msg <> "\n" )
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
54 putStrLn ( "\n" <> cmt <> "\n" )
62 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
63 roundToStr = printf "%0.*f"
66 countSup :: Double -> [Double] -> Int
67 countSup s l = length $ filter (>s) l
69 dropByIdx :: Int -> [a] -> [a]
70 dropByIdx k l = take k l ++ drop (k+1) l
73 elemIndex' :: Eq a => a -> [a] -> Int
74 elemIndex' e l = case (List.elemIndex e l) of
75 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
84 -- | Is this Ngrams a Foundations Root ?
85 isRoots :: Ngrams -> Vector Ngrams -> Bool
86 isRoots n ns = Vector.elem n ns
88 -- | To transform a list of nrams into a list of foundation's index
89 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
90 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
92 -- | To transform a list of Ngrams Indexes into a Label
93 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
94 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
97 -- | To transform a list of Ngrams Indexes into a list of Text
98 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
99 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
106 -- | To transform a list of periods into a set of Dates
107 periodsToYears :: [(Date,Date)] -> Set Date
108 periodsToYears periods = (Set.fromList . sort . concat)
109 $ map (\(d,d') -> [d..d']) periods
112 findBounds :: [Date] -> (Date,Date)
114 let dates' = sort dates
115 in (head' "findBounds" dates', last' "findBounds" dates')
118 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
119 toPeriods dates p s =
120 let (start,end) = findBounds dates
121 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
122 $ chunkAlong p s [start .. end]
125 -- | Get a regular & ascendante timeScale from a given list of dates
126 toTimeScale :: [Date] -> Int -> [Date]
127 toTimeScale dates step =
128 let (start,end) = findBounds dates
129 in [start, (start + step) .. end]
132 getTimeStep :: TimeUnit -> Int
133 getTimeStep time = case time of
136 getTimePeriod :: TimeUnit -> Int
137 getTimePeriod time = case time of
140 getTimeFrame :: TimeUnit -> Int
141 getTimeFrame time = case time of
149 -- | To find if l' is nested in l
150 isNested :: Eq a => [a] -> [a] -> Bool
153 | length l' > length l = False
154 | (union l l') == l = True
158 -- | To filter Fis with small Support but by keeping non empty Periods
159 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
160 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
161 then keepFilled f (thr - 1) l
165 traceClique :: Map (Date, Date) [PhyloClique] -> String
166 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
168 --------------------------------------
170 cliques = sort $ map (fromIntegral . size . _phyloClique_nodes) $ concat $ elems mFis
171 --------------------------------------
174 traceSupport :: Map (Date, Date) [PhyloClique] -> String
175 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
177 --------------------------------------
179 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
180 --------------------------------------
183 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
184 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
185 <> "Support : " <> (traceSupport mFis) <> "\n"
186 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
194 getCliqueSupport :: Clique -> Int
195 getCliqueSupport unit = case unit of
199 getCliqueSize :: Clique -> Int
200 getCliqueSize unit = case unit of
209 listToCombi' :: [a] -> [(a,a)]
210 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
212 listToEqual' :: Eq a => [a] -> [(a,a)]
213 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
215 listToKeys :: Eq a => [a] -> [(a,a)]
216 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
218 listToMatrix :: [Int] -> Map (Int,Int) Double
219 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
221 listToSeq :: Eq a => [a] -> [(a,a)]
222 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
224 sumCooc :: Cooc -> Cooc -> Cooc
225 sumCooc cooc cooc' = unionWith (+) cooc cooc'
227 getTrace :: Cooc -> Double
228 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
231 -- | To build the local cooc matrix of each phylogroup
232 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
233 ngramsToCooc ngrams coocs =
234 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
235 pairs = listToKeys ngrams
236 in filterWithKey (\k _ -> elem k pairs) cooc
243 getGroupId :: PhyloGroup -> PhyloGroupId
244 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
246 getGroupThr :: PhyloGroup -> Double
247 getGroupThr group = head' "getGroupThr" ((group ^. phylo_groupMeta) ! "thr")
249 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
250 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
252 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
253 getPeriodPointers fil group =
255 ToChilds -> group ^. phylo_groupPeriodChilds
256 ToParents -> group ^. phylo_groupPeriodParents
258 filterProximity :: Proximity -> Double -> Double -> Bool
259 filterProximity proximity thr local =
261 WeightedLogJaccard _ _ _ -> local >= thr
264 getProximityName :: Proximity -> String
265 getProximityName proximity =
267 WeightedLogJaccard _ _ _ -> "WLJaccard"
270 getProximityInit :: Proximity -> Double
271 getProximityInit proximity =
273 WeightedLogJaccard _ i _ -> i
277 getProximityStep :: Proximity -> Double
278 getProximityStep proximity =
280 WeightedLogJaccard _ _ s -> s
287 addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
288 addPointers group fil pty pointers =
290 TemporalPointer -> case fil of
291 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
292 ToParents -> group & phylo_groupPeriodParents .~ pointers
293 LevelPointer -> case fil of
294 ToChilds -> group & phylo_groupLevelChilds .~ pointers
295 ToParents -> group & phylo_groupLevelParents .~ pointers
298 getPeriodIds :: Phylo -> [(Date,Date)]
299 getPeriodIds phylo = sortOn fst
301 $ phylo ^. phylo_periods
303 getLevelParentId :: PhyloGroup -> PhyloGroupId
304 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
306 getLastLevel :: Phylo -> Level
307 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
309 getLevels :: Phylo -> [Level]
310 getLevels phylo = nub
312 $ keys $ view ( phylo_periods
314 . phylo_periodLevels ) phylo
317 getPhyloThresholdInit :: Phylo -> Double
318 getPhyloThresholdInit phylo = getThresholdInit (phyloProximity (getConfig phylo))
321 getPhyloThresholdStep :: Phylo -> Double
322 getPhyloThresholdStep phylo = getThresholdStep (phyloProximity (getConfig phylo))
325 getConfig :: Phylo -> Config
326 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
329 getRoots :: Phylo -> Vector Ngrams
330 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
332 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
333 phyloToLastBranches phylo = elems
335 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
336 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
338 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
339 getGroupsFromLevel lvl phylo =
340 elems $ view ( phylo_periods
344 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
345 . phylo_levelGroups ) phylo
348 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
349 updatePhyloGroups lvl m phylo =
354 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
358 let id = getGroupId group
365 traceToPhylo :: Level -> Phylo -> Phylo
366 traceToPhylo lvl phylo =
367 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
368 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
369 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
375 relatedComponents :: Ord a => [[a]] -> [[a]]
376 relatedComponents graph = foldl' (\acc groups ->
380 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
381 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
384 traceSynchronyEnd :: Phylo -> Phylo
385 traceSynchronyEnd phylo =
386 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
387 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
388 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
391 traceSynchronyStart :: Phylo -> Phylo
392 traceSynchronyStart phylo =
393 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
394 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
395 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
403 getSensibility :: Proximity -> Double
404 getSensibility proxi = case proxi of
405 WeightedLogJaccard s _ _ -> s
408 getThresholdInit :: Proximity -> Double
409 getThresholdInit proxi = case proxi of
410 WeightedLogJaccard _ t _ -> t
413 getThresholdStep :: Proximity -> Double
414 getThresholdStep proxi = case proxi of
415 WeightedLogJaccard _ _ s -> s
419 traceBranchMatching :: Proximity -> Double -> [PhyloGroup] -> [PhyloGroup]
420 traceBranchMatching proxi thr groups = case proxi of
421 WeightedLogJaccard _ i s -> trace (
422 roundToStr 2 thr <> " "
423 <> foldl (\acc _ -> acc <> ".") "." [(10*i),(10*i + 10*s)..(10*thr)]
424 <> " " <> show(length groups) <> " groups"
432 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
433 intersectInit acc lst lst' =
434 if (null lst) || (null lst')
436 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
437 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
440 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
441 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
443 ngramsInBranches :: [[PhyloGroup]] -> [Int]
444 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
447 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
448 traceMatchSuccess thr qua qua' nextBranches =
449 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
450 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
451 <> ",(1.." <> show (length nextBranches) <> ")]"
452 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
453 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
454 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
457 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
458 traceMatchFailure thr qua qua' branches =
459 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
460 <> ",(1.." <> show (length branches) <> ")]"
461 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
462 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
466 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
467 traceMatchNoSplit branches =
468 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
469 <> ",(1.." <> show (length branches) <> ")]"
470 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
471 <> " - unable to split in smaller branches" <> "\n"
475 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
476 traceMatchLimit branches =
477 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
478 <> ",(1.." <> show (length branches) <> ")]"
479 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
480 <> " - unable to increase the threshold above 1" <> "\n"
484 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
485 traceMatchEnd groups =
486 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
487 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
490 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
491 traceTemporalMatching groups =
492 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups