2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.PhyloTools where
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
48 <> "-- | " <> msg <> "\n" )
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
54 putStrLn ( "\n" <> cmt <> "\n" )
62 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
63 roundToStr = printf "%0.*f"
66 countSup :: Double -> [Double] -> Int
67 countSup s l = length $ filter (>s) l
69 dropByIdx :: Int -> [a] -> [a]
70 dropByIdx k l = take k l ++ drop (k+1) l
73 elemIndex' :: Eq a => a -> [a] -> Int
74 elemIndex' e l = case (List.elemIndex e l) of
75 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
84 -- | Is this Ngrams a Foundations Root ?
85 isRoots :: Ngrams -> Vector Ngrams -> Bool
86 isRoots n ns = Vector.elem n ns
88 -- | To transform a list of nrams into a list of foundation's index
89 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
90 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
92 -- | To transform a list of Ngrams Indexes into a Label
93 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
94 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
97 -- | To transform a list of Ngrams Indexes into a list of Text
98 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
99 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
106 -- | To transform a list of periods into a set of Dates
107 periodsToYears :: [(Date,Date)] -> Set Date
108 periodsToYears periods = (Set.fromList . sort . concat)
109 $ map (\(d,d') -> [d..d']) periods
112 findBounds :: [Date] -> (Date,Date)
114 let dates' = sort dates
115 in (head' "findBounds" dates', last' "findBounds" dates')
118 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
119 toPeriods dates p s =
120 let (start,end) = findBounds dates
121 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
122 $ chunkAlong p s [start .. end]
125 -- | Get a regular & ascendante timeScale from a given list of dates
126 toTimeScale :: [Date] -> Int -> [Date]
127 toTimeScale dates step =
128 let (start,end) = findBounds dates
129 in [start, (start + step) .. end]
132 getTimeStep :: TimeUnit -> Int
133 getTimeStep time = case time of
136 getTimePeriod :: TimeUnit -> Int
137 getTimePeriod time = case time of
140 getTimeFrame :: TimeUnit -> Int
141 getTimeFrame time = case time of
149 -- | To find if l' is nested in l
150 isNested :: Eq a => [a] -> [a] -> Bool
153 | length l' > length l = False
154 | (union l l') == l = True
158 -- | To filter Fis with small Support but by keeping non empty Periods
159 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
160 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
161 then keepFilled f (thr - 1) l
165 traceClique :: Map (Date, Date) [PhyloFis] -> String
166 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
168 --------------------------------------
170 cliques = sort $ map (fromIntegral . size . _phyloFis_clique) $ concat $ elems mFis
171 --------------------------------------
174 traceSupport :: Map (Date, Date) [PhyloFis] -> String
175 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
177 --------------------------------------
179 supports = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems mFis
180 --------------------------------------
183 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
184 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
185 <> "Support : " <> (traceSupport mFis) <> "\n"
186 <> "Clique : " <> (traceClique mFis) <> "\n" ) mFis
189 -------------------------
190 -- | Contextual unit | --
191 -------------------------
194 getFisSupport :: ContextualUnit -> Int
195 getFisSupport unit = case unit of
197 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a support")
199 getFisSize :: ContextualUnit -> Int
200 getFisSize unit = case unit of
202 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a clique size")
209 listToCombi' :: [a] -> [(a,a)]
210 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
212 listToEqual' :: Eq a => [a] -> [(a,a)]
213 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
215 listToKeys :: Eq a => [a] -> [(a,a)]
216 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
218 listToMatrix :: [Int] -> Map (Int,Int) Double
219 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
221 listToSeq :: Eq a => [a] -> [(a,a)]
222 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
224 sumCooc :: Cooc -> Cooc -> Cooc
225 sumCooc cooc cooc' = unionWith (+) cooc cooc'
227 getTrace :: Cooc -> Double
228 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
231 -- | To build the local cooc matrix of each phylogroup
232 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
233 ngramsToCooc ngrams coocs =
234 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
235 pairs = listToKeys ngrams
236 in filterWithKey (\k _ -> elem k pairs) cooc
243 getGroupId :: PhyloGroup -> PhyloGroupId
244 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
246 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
247 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
249 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
250 getPeriodPointers fil group =
252 ToChilds -> group ^. phylo_groupPeriodChilds
253 ToParents -> group ^. phylo_groupPeriodParents
255 filterProximity :: Proximity -> Double -> Double -> Bool
256 filterProximity proximity thr local =
258 WeightedLogJaccard _ _ _ -> local >= thr
261 getProximityName :: Proximity -> String
262 getProximityName proximity =
264 WeightedLogJaccard _ _ _ -> "WLJaccard"
267 getProximityInit :: Proximity -> Double
268 getProximityInit proximity =
270 WeightedLogJaccard _ i _ -> i
274 getProximityStep :: Proximity -> Double
275 getProximityStep proximity =
277 WeightedLogJaccard _ _ s -> s
284 addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
285 addPointers group fil pty pointers =
287 TemporalPointer -> case fil of
288 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
289 ToParents -> group & phylo_groupPeriodParents .~ pointers
290 LevelPointer -> case fil of
291 ToChilds -> group & phylo_groupLevelChilds .~ pointers
292 ToParents -> group & phylo_groupLevelParents .~ pointers
295 getPeriodIds :: Phylo -> [(Date,Date)]
296 getPeriodIds phylo = sortOn fst
298 $ phylo ^. phylo_periods
300 getLevelParentId :: PhyloGroup -> PhyloGroupId
301 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
303 getLastLevel :: Phylo -> Level
304 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
306 getLevels :: Phylo -> [Level]
307 getLevels phylo = nub
309 $ keys $ view ( phylo_periods
311 . phylo_periodLevels ) phylo
314 getConfig :: Phylo -> Config
315 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
318 getRoots :: Phylo -> Vector Ngrams
319 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
321 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
322 phyloToLastBranches phylo = elems
324 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
325 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
327 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
328 getGroupsFromLevel lvl phylo =
329 elems $ view ( phylo_periods
333 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
334 . phylo_levelGroups ) phylo
337 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
338 updatePhyloGroups lvl m phylo =
343 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
347 let id = getGroupId group
354 traceToPhylo :: Level -> Phylo -> Phylo
355 traceToPhylo lvl phylo =
356 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
357 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
358 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
364 relatedComponents :: Ord a => [[a]] -> [[a]]
365 relatedComponents graph = foldl' (\acc groups ->
369 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
370 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
373 traceSynchronyEnd :: Phylo -> Phylo
374 traceSynchronyEnd phylo =
375 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
376 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
377 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
380 traceSynchronyStart :: Phylo -> Phylo
381 traceSynchronyStart phylo =
382 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
383 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
384 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
392 getSensibility :: Proximity -> Double
393 getSensibility proxi = case proxi of
394 WeightedLogJaccard s _ _ -> s
397 getThresholdInit :: Proximity -> Double
398 getThresholdInit proxi = case proxi of
399 WeightedLogJaccard _ t _ -> t
402 getThresholdStep :: Proximity -> Double
403 getThresholdStep proxi = case proxi of
404 WeightedLogJaccard _ _ s -> s
408 traceBranchMatching :: Proximity -> Double -> [PhyloGroup] -> [PhyloGroup]
409 traceBranchMatching proxi thr groups = case proxi of
410 WeightedLogJaccard _ i s -> trace (
411 roundToStr 2 thr <> " "
412 <> foldl (\acc _ -> acc <> ".") "." [(10*i),(10*i + 10*s)..(10*thr)]
413 <> " " <> show(length groups) <> " groups"
421 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
422 intersectInit acc lst lst' =
423 if (null lst) || (null lst')
425 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
426 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
429 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
430 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
432 ngramsInBranches :: [[PhyloGroup]] -> [Int]
433 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
436 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
437 traceMatchSuccess thr qua qua' nextBranches =
438 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
439 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
440 <> ",(1.." <> show (length nextBranches) <> ")]"
441 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
442 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
443 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
446 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
447 traceMatchFailure thr qua qua' branches =
448 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
449 <> ",(1.." <> show (length branches) <> ")]"
450 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
451 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
455 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
456 traceMatchNoSplit branches =
457 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
458 <> ",(1.." <> show (length branches) <> ")]"
459 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
460 <> " - unable to split in smaller branches" <> "\n"
464 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
465 traceMatchLimit branches =
466 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
467 <> ",(1.." <> show (length branches) <> ")]"
468 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
469 <> " - unable to increase the threshold above 1" <> "\n"
473 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
474 traceMatchEnd groups =
475 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
476 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
479 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
480 traceTemporalMatching groups =
481 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups