2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
17 module Gargantext.Viz.Phylo.PhyloTools where
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
48 <> "-- | " <> msg <> "\n" )
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
54 putStrLn ( "\n" <> cmt <> "\n" )
62 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
63 roundToStr = printf "%0.*f"
66 countSup :: Double -> [Double] -> Int
67 countSup s l = length $ filter (>s) l
70 elemIndex' :: Eq a => a -> [a] -> Int
71 elemIndex' e l = case (List.elemIndex e l) of
72 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
81 -- | Is this Ngrams a Foundations Root ?
82 isRoots :: Ngrams -> Vector Ngrams -> Bool
83 isRoots n ns = Vector.elem n ns
85 -- | To transform a list of nrams into a list of foundation's index
86 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
87 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
89 -- | To transform a list of Ngrams Indexes into a Label
90 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
91 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
94 -- | To transform a list of Ngrams Indexes into a list of Text
95 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
96 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
103 -- | To transform a list of periods into a set of Dates
104 periodsToYears :: [(Date,Date)] -> Set Date
105 periodsToYears periods = (Set.fromList . sort . concat)
106 $ map (\(d,d') -> [d..d']) periods
109 findBounds :: [Date] -> (Date,Date)
111 let dates' = sort dates
112 in (head' "findBounds" dates', last' "findBounds" dates')
115 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
116 toPeriods dates p s =
117 let (start,end) = findBounds dates
118 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
119 $ chunkAlong p s [start .. end]
122 -- | Get a regular & ascendante timeScale from a given list of dates
123 toTimeScale :: [Date] -> Int -> [Date]
124 toTimeScale dates step =
125 let (start,end) = findBounds dates
126 in [start, (start + step) .. end]
129 getTimeStep :: TimeUnit -> Int
130 getTimeStep time = case time of
133 getTimePeriod :: TimeUnit -> Int
134 getTimePeriod time = case time of
137 getTimeFrame :: TimeUnit -> Int
138 getTimeFrame time = case time of
146 -- | To find if l' is nested in l
147 isNested :: Eq a => [a] -> [a] -> Bool
150 | length l' > length l = False
151 | (union l l') == l = True
155 -- | To filter Fis with small Support but by keeping non empty Periods
156 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
157 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
158 then keepFilled f (thr - 1) l
162 traceClique :: Map (Date, Date) [PhyloFis] -> String
163 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
165 --------------------------------------
167 cliques = sort $ map (fromIntegral . size . _phyloFis_clique) $ concat $ elems mFis
168 --------------------------------------
171 traceSupport :: Map (Date, Date) [PhyloFis] -> String
172 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
174 --------------------------------------
176 supports = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems mFis
177 --------------------------------------
180 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
181 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
182 <> "Support : " <> (traceSupport mFis) <> "\n"
183 <> "Clique : " <> (traceClique mFis) <> "\n" ) mFis
186 -------------------------
187 -- | Contextual unit | --
188 -------------------------
191 getFisSupport :: ContextualUnit -> Int
192 getFisSupport unit = case unit of
194 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a support")
196 getFisSize :: ContextualUnit -> Int
197 getFisSize unit = case unit of
199 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a clique size")
206 listToCombi' :: [a] -> [(a,a)]
207 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
209 listToEqual' :: Eq a => [a] -> [(a,a)]
210 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
212 listToKeys :: Eq a => [a] -> [(a,a)]
213 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
215 listToMatrix :: [Int] -> Map (Int,Int) Double
216 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
218 listToSeq :: Eq a => [a] -> [(a,a)]
219 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
221 sumCooc :: Cooc -> Cooc -> Cooc
222 sumCooc cooc cooc' = unionWith (+) cooc cooc'
224 getTrace :: Cooc -> Double
225 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
228 -- | To build the local cooc matrix of each phylogroup
229 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
230 ngramsToCooc ngrams coocs =
231 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
232 pairs = listToKeys ngrams
233 in filterWithKey (\k _ -> elem k pairs) cooc
240 getGroupId :: PhyloGroup -> PhyloGroupId
241 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
243 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
244 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
246 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
247 getPeriodPointers fil group =
249 ToChilds -> group ^. phylo_groupPeriodChilds
250 ToParents -> group ^. phylo_groupPeriodParents
252 filterProximity :: Proximity -> Double -> Double -> Bool
253 filterProximity proximity thr local =
255 WeightedLogJaccard _ _ _ -> local >= thr
263 addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
264 addPointers group fil pty pointers =
266 TemporalPointer -> case fil of
267 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
268 ToParents -> group & phylo_groupPeriodParents .~ pointers
269 LevelPointer -> case fil of
270 ToChilds -> group & phylo_groupLevelChilds .~ pointers
271 ToParents -> group & phylo_groupLevelParents .~ pointers
274 getPeriodIds :: Phylo -> [(Date,Date)]
275 getPeriodIds phylo = sortOn fst
277 $ phylo ^. phylo_periods
279 getLevelParentId :: PhyloGroup -> PhyloGroupId
280 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
282 getLastLevel :: Phylo -> Level
283 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
285 getLevels :: Phylo -> [Level]
286 getLevels phylo = nub
288 $ keys $ view ( phylo_periods
290 . phylo_periodLevels ) phylo
293 getConfig :: Phylo -> Config
294 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
297 getRoots :: Phylo -> Vector Ngrams
298 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
300 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
301 phyloToLastBranches phylo = elems
303 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
304 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
306 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
307 getGroupsFromLevel lvl phylo =
308 elems $ view ( phylo_periods
312 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
313 . phylo_levelGroups ) phylo
316 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
317 updatePhyloGroups lvl m phylo =
322 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
326 let id = getGroupId group
333 traceToPhylo :: Level -> Phylo -> Phylo
334 traceToPhylo lvl phylo =
335 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
336 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
337 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
343 relatedComponents :: Ord a => [[a]] -> [[a]]
344 relatedComponents graph = foldl' (\acc groups ->
348 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
349 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
352 traceSynchronyEnd :: Phylo -> Phylo
353 traceSynchronyEnd phylo =
354 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
355 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
356 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
359 traceSynchronyStart :: Phylo -> Phylo
360 traceSynchronyStart phylo =
361 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
362 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
363 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
371 getSensibility :: Proximity -> Double
372 getSensibility proxi = case proxi of
373 WeightedLogJaccard s _ _ -> s
376 getThresholdInit :: Proximity -> Double
377 getThresholdInit proxi = case proxi of
378 WeightedLogJaccard _ t _ -> t
381 getThresholdStep :: Proximity -> Double
382 getThresholdStep proxi = case proxi of
383 WeightedLogJaccard _ _ s -> s
387 traceBranchMatching :: Proximity -> Double -> [PhyloGroup] -> [PhyloGroup]
388 traceBranchMatching proxi thr groups = case proxi of
389 WeightedLogJaccard _ i s -> trace (
390 roundToStr 2 thr <> " "
391 <> foldl (\acc _ -> acc <> ".") "." [(10*i),(10*i + 10*s)..(10*thr)]
392 <> " " <> show(length groups) <> " groups"
400 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
401 intersectInit acc lst lst' =
402 if (null lst) || (null lst')
404 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
405 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
408 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
409 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
411 ngramsInBranches :: [[PhyloGroup]] -> [Int]
412 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
415 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
416 traceMatchSuccess thr qua qua' nextBranches =
417 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
418 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
419 <> ",(1.." <> show (length nextBranches) <> ")]"
420 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
421 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
422 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
425 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
426 traceMatchFailure thr qua qua' branches =
427 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
428 <> ",(1.." <> show (length branches) <> ")]"
429 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
430 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
434 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
435 traceMatchNoSplit branches =
436 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
437 <> ",(1.." <> show (length branches) <> ")]"
438 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
439 <> " - unable to split in smaller branches" <> "\n"
443 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
444 traceMatchLimit branches =
445 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
446 <> ",(1.." <> show (length branches) <> ")]"
447 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
448 <> " - unable to increase the threshold above 1" <> "\n"
452 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
453 traceMatchEnd groups =
454 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
455 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
458 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
459 traceTemporalMatching groups =
460 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups