2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE ViewPatterns #-}
13 module Gargantext.Viz.Phylo.PhyloTools where
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
17 import Data.Set (Set, size, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
19 import Data.String (String)
20 import Data.Text (Text, unwords)
22 import Gargantext.Prelude
23 import Gargantext.Viz.AdaptativePhylo
27 import Debug.Trace (trace)
28 import Control.Lens hiding (Level)
30 import qualified Data.Vector as Vector
31 import qualified Data.List as List
32 import qualified Data.Set as Set
33 import qualified Data.Map as Map
39 -- | To print an important message as an IO()
40 printIOMsg :: String -> IO ()
45 <> "-- | " <> msg <> "\n" )
48 -- | To print a comment as an IO()
49 printIOComment :: String -> IO ()
51 putStrLn ( "\n" <> cmt <> "\n" )
59 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
60 roundToStr = printf "%0.*f"
63 countSup :: Double -> [Double] -> Int
64 countSup s l = length $ filter (>s) l
67 dropByIdx :: Int -> [a] -> [a]
68 dropByIdx k l = take k l ++ drop (k+1) l
71 elemIndex' :: Eq a => a -> [a] -> Int
72 elemIndex' e l = case (List.elemIndex e l) of
73 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
77 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
78 commonPrefix lst lst' acc =
79 if (null lst || null lst')
81 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
82 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
91 -- | Is this Ngrams a Foundations Root ?
92 isRoots :: Ngrams -> Vector Ngrams -> Bool
93 isRoots n ns = Vector.elem n ns
95 -- | To transform a list of nrams into a list of foundation's index
96 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
97 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
99 -- | To transform a list of Ngrams Indexes into a Label
100 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
101 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
104 -- | To transform a list of Ngrams Indexes into a list of Text
105 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
106 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
113 -- | To transform a list of periods into a set of Dates
114 periodsToYears :: [(Date,Date)] -> Set Date
115 periodsToYears periods = (Set.fromList . sort . concat)
116 $ map (\(d,d') -> [d..d']) periods
119 findBounds :: [Date] -> (Date,Date)
121 let dates' = sort dates
122 in (head' "findBounds" dates', last' "findBounds" dates')
125 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
126 toPeriods dates p s =
127 let (start,end) = findBounds dates
128 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
129 $ chunkAlong p s [start .. end]
132 -- | Get a regular & ascendante timeScale from a given list of dates
133 toTimeScale :: [Date] -> Int -> [Date]
134 toTimeScale dates step =
135 let (start,end) = findBounds dates
136 in [start, (start + step) .. end]
139 getTimeStep :: TimeUnit -> Int
140 getTimeStep time = case time of
143 getTimePeriod :: TimeUnit -> Int
144 getTimePeriod time = case time of
147 getTimeFrame :: TimeUnit -> Int
148 getTimeFrame time = case time of
156 -- | To find if l' is nested in l
157 isNested :: Eq a => [a] -> [a] -> Bool
160 | length l' > length l = False
161 | (union l l') == l = True
165 -- | To filter Fis with small Support but by keeping non empty Periods
166 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
167 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
168 then keepFilled f (thr - 1) l
172 traceClique :: Map (Date, Date) [PhyloClique] -> String
173 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
175 --------------------------------------
177 cliques = sort $ map (fromIntegral . size . _phyloClique_nodes) $ concat $ elems mFis
178 --------------------------------------
181 traceSupport :: Map (Date, Date) [PhyloClique] -> String
182 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
184 --------------------------------------
186 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
187 --------------------------------------
190 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
191 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
192 <> "Support : " <> (traceSupport mFis) <> "\n"
193 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
201 getCliqueSupport :: Clique -> Int
202 getCliqueSupport unit = case unit of
206 getCliqueSize :: Clique -> Int
207 getCliqueSize unit = case unit of
216 listToCombi' :: [a] -> [(a,a)]
217 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
219 listToEqual' :: Eq a => [a] -> [(a,a)]
220 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
222 listToKeys :: Eq a => [a] -> [(a,a)]
223 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
225 listToMatrix :: [Int] -> Map (Int,Int) Double
226 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
228 listToSeq :: Eq a => [a] -> [(a,a)]
229 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
231 sumCooc :: Cooc -> Cooc -> Cooc
232 sumCooc cooc cooc' = unionWith (+) cooc cooc'
234 getTrace :: Cooc -> Double
235 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
237 coocToDiago :: Cooc -> Cooc
238 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
240 -- | To build the local cooc matrix of each phylogroup
241 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
242 ngramsToCooc ngrams coocs =
243 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
244 pairs = listToKeys ngrams
245 in filterWithKey (\k _ -> elem k pairs) cooc
252 getGroupId :: PhyloGroup -> PhyloGroupId
253 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
255 idToPrd :: PhyloGroupId -> PhyloPeriodId
256 idToPrd id = (fst . fst) id
258 getGroupThr :: PhyloGroup -> Double
259 getGroupThr group = last' "getGroupThr" ((group ^. phylo_groupMeta) ! "breaks")
261 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
262 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
264 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
265 getPeriodPointers fil group =
267 ToChilds -> group ^. phylo_groupPeriodChilds
268 ToParents -> group ^. phylo_groupPeriodParents
270 filterProximity :: Proximity -> Double -> Double -> Bool
271 filterProximity proximity thr local =
273 WeightedLogJaccard _ -> local >= thr
276 getProximityName :: Proximity -> String
277 getProximityName proximity =
279 WeightedLogJaccard _ -> "WLJaccard"
286 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
287 addPointers fil pty pointers group =
289 TemporalPointer -> case fil of
290 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
291 ToParents -> group & phylo_groupPeriodParents .~ pointers
292 LevelPointer -> case fil of
293 ToChilds -> group & phylo_groupLevelChilds .~ pointers
294 ToParents -> group & phylo_groupLevelParents .~ pointers
297 getPeriodIds :: Phylo -> [(Date,Date)]
298 getPeriodIds phylo = sortOn fst
300 $ phylo ^. phylo_periods
302 getLevelParentId :: PhyloGroup -> PhyloGroupId
303 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
305 getLastLevel :: Phylo -> Level
306 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
308 getLevels :: Phylo -> [Level]
309 getLevels phylo = nub
311 $ keys $ view ( phylo_periods
313 . phylo_periodLevels ) phylo
315 getSeaElevation :: Phylo -> SeaElevation
316 getSeaElevation phylo = seaElevation (getConfig phylo)
319 getConfig :: Phylo -> Config
320 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
323 getRoots :: Phylo -> Vector Ngrams
324 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
326 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
327 phyloToLastBranches phylo = elems
329 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
330 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
332 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
333 getGroupsFromLevel lvl phylo =
334 elems $ view ( phylo_periods
338 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
339 . phylo_levelGroups ) phylo
342 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
343 getGroupsFromLevelPeriods lvl periods phylo =
344 elems $ view ( phylo_periods
346 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
349 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
350 . phylo_levelGroups ) phylo
353 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
354 getGroupsFromPeriods lvl periods =
355 elems $ view ( traverse
358 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
359 . phylo_levelGroups ) periods
362 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
363 updatePhyloGroups lvl m phylo =
368 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
372 let id = getGroupId group
379 traceToPhylo :: Level -> Phylo -> Phylo
380 traceToPhylo lvl phylo =
381 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
382 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
383 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
389 relatedComponents :: Ord a => [[a]] -> [[a]]
390 relatedComponents graph = foldl' (\acc groups ->
394 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
395 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
398 traceSynchronyEnd :: Phylo -> Phylo
399 traceSynchronyEnd phylo =
400 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
401 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
402 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
405 traceSynchronyStart :: Phylo -> Phylo
406 traceSynchronyStart phylo =
407 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
408 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
409 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
417 getSensibility :: Proximity -> Double
418 getSensibility proxi = case proxi of
419 WeightedLogJaccard s -> s
426 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
427 intersectInit acc lst lst' =
428 if (null lst) || (null lst')
430 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
431 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
434 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
435 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
437 ngramsInBranches :: [[PhyloGroup]] -> [Int]
438 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
441 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
442 traceMatchSuccess thr qua qua' nextBranches =
443 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
444 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
445 <> ",(1.." <> show (length nextBranches) <> ")]"
446 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
447 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
448 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
451 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
452 traceMatchFailure thr qua qua' branches =
453 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
454 <> ",(1.." <> show (length branches) <> ")]"
455 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
456 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
460 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
461 traceMatchNoSplit branches =
462 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
463 <> ",(1.." <> show (length branches) <> ")]"
464 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
465 <> " - unable to split in smaller branches" <> "\n"
469 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
470 traceMatchLimit branches =
471 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
472 <> ",(1.." <> show (length branches) <> ")]"
473 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
474 <> " - unable to increase the threshold above 1" <> "\n"
478 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
479 traceMatchEnd groups =
480 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
481 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
484 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
485 traceTemporalMatching groups =
486 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
489 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
491 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m