]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloTools.hs
bug fix
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.PhyloTools where
18
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
25
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
28 import Text.Printf
29
30
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37 import qualified Data.Map as Map
38
39 ------------
40 -- | Io | --
41 ------------
42
43 -- | To print an important message as an IO()
44 printIOMsg :: String -> IO ()
45 printIOMsg msg =
46 putStrLn ( "\n"
47 <> "------------"
48 <> "\n"
49 <> "-- | " <> msg <> "\n" )
50
51
52 -- | To print a comment as an IO()
53 printIOComment :: String -> IO ()
54 printIOComment cmt =
55 putStrLn ( "\n" <> cmt <> "\n" )
56
57
58 --------------
59 -- | Misc | --
60 --------------
61
62
63 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
64 roundToStr = printf "%0.*f"
65
66
67 countSup :: Double -> [Double] -> Int
68 countSup s l = length $ filter (>s) l
69
70
71 dropByIdx :: Int -> [a] -> [a]
72 dropByIdx k l = take k l ++ drop (k+1) l
73
74
75 elemIndex' :: Eq a => a -> [a] -> Int
76 elemIndex' e l = case (List.elemIndex e l) of
77 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
78 Just i -> i
79
80
81 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
82 commonPrefix lst lst' acc =
83 if (null lst || null lst')
84 then acc
85 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
86 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
87 else acc
88
89
90 ---------------------
91 -- | Foundations | --
92 ---------------------
93
94
95 -- | Is this Ngrams a Foundations Root ?
96 isRoots :: Ngrams -> Vector Ngrams -> Bool
97 isRoots n ns = Vector.elem n ns
98
99 -- | To transform a list of nrams into a list of foundation's index
100 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
101 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
102
103 -- | To transform a list of Ngrams Indexes into a Label
104 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
105 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
106
107
108 -- | To transform a list of Ngrams Indexes into a list of Text
109 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
110 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
111
112
113 --------------
114 -- | Time | --
115 --------------
116
117 -- | To transform a list of periods into a set of Dates
118 periodsToYears :: [(Date,Date)] -> Set Date
119 periodsToYears periods = (Set.fromList . sort . concat)
120 $ map (\(d,d') -> [d..d']) periods
121
122
123 findBounds :: [Date] -> (Date,Date)
124 findBounds dates =
125 let dates' = sort dates
126 in (head' "findBounds" dates', last' "findBounds" dates')
127
128
129 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
130 toPeriods dates p s =
131 let (start,end) = findBounds dates
132 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
133 $ chunkAlong p s [start .. end]
134
135
136 -- | Get a regular & ascendante timeScale from a given list of dates
137 toTimeScale :: [Date] -> Int -> [Date]
138 toTimeScale dates step =
139 let (start,end) = findBounds dates
140 in [start, (start + step) .. end]
141
142
143 getTimeStep :: TimeUnit -> Int
144 getTimeStep time = case time of
145 Year _ s _ -> s
146
147 getTimePeriod :: TimeUnit -> Int
148 getTimePeriod time = case time of
149 Year p _ _ -> p
150
151 getTimeFrame :: TimeUnit -> Int
152 getTimeFrame time = case time of
153 Year _ _ f -> f
154
155 -------------
156 -- | Fis | --
157 -------------
158
159
160 -- | To find if l' is nested in l
161 isNested :: Eq a => [a] -> [a] -> Bool
162 isNested l l'
163 | null l' = True
164 | length l' > length l = False
165 | (union l l') == l = True
166 | otherwise = False
167
168
169 -- | To filter Fis with small Support but by keeping non empty Periods
170 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
171 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
172 then keepFilled f (thr - 1) l
173 else f thr l
174
175
176 traceClique :: Map (Date, Date) [PhyloClique] -> String
177 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
178 where
179 --------------------------------------
180 cliques :: [Double]
181 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
182 --------------------------------------
183
184
185 traceSupport :: Map (Date, Date) [PhyloClique] -> String
186 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
187 where
188 --------------------------------------
189 supports :: [Double]
190 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
191 --------------------------------------
192
193
194 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
195 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
196 <> "Support : " <> (traceSupport mFis) <> "\n"
197 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
198
199
200 ---------------
201 -- | Clique| --
202 ---------------
203
204
205 getCliqueSupport :: Clique -> Int
206 getCliqueSupport unit = case unit of
207 Fis s _ -> s
208 MaxClique _ -> 0
209
210 getCliqueSize :: Clique -> Int
211 getCliqueSize unit = case unit of
212 Fis _ s -> s
213 MaxClique s -> s
214
215
216 --------------
217 -- | Cooc | --
218 --------------
219
220 listToCombi' :: [a] -> [(a,a)]
221 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
222
223 listToEqual' :: Eq a => [a] -> [(a,a)]
224 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
225
226 listToKeys :: Eq a => [a] -> [(a,a)]
227 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
228
229 listToMatrix :: [Int] -> Map (Int,Int) Double
230 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
231
232 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
233 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
234
235 listToSeq :: Eq a => [a] -> [(a,a)]
236 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
237
238 sumCooc :: Cooc -> Cooc -> Cooc
239 sumCooc cooc cooc' = unionWith (+) cooc cooc'
240
241 getTrace :: Cooc -> Double
242 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
243
244 coocToDiago :: Cooc -> Cooc
245 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
246
247 -- | To build the local cooc matrix of each phylogroup
248 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
249 ngramsToCooc ngrams coocs =
250 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
251 pairs = listToKeys ngrams
252 in filterWithKey (\k _ -> elem k pairs) cooc
253
254
255 --------------------
256 -- | PhyloGroup | --
257 --------------------
258
259 getGroupId :: PhyloGroup -> PhyloGroupId
260 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
261
262 idToPrd :: PhyloGroupId -> PhyloPeriodId
263 idToPrd id = (fst . fst) id
264
265 getGroupThr :: PhyloGroup -> Double
266 getGroupThr group = last' "getGroupThr" ((group ^. phylo_groupMeta) ! "breaks")
267
268 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
269 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
270
271 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
272 getPeriodPointers fil group =
273 case fil of
274 ToChilds -> group ^. phylo_groupPeriodChilds
275 ToParents -> group ^. phylo_groupPeriodParents
276
277 filterProximity :: Proximity -> Double -> Double -> Bool
278 filterProximity proximity thr local =
279 case proximity of
280 WeightedLogJaccard _ -> local >= thr
281 Hamming -> undefined
282
283 getProximityName :: Proximity -> String
284 getProximityName proximity =
285 case proximity of
286 WeightedLogJaccard _ -> "WLJaccard"
287 Hamming -> "Hamming"
288
289 ---------------
290 -- | Phylo | --
291 ---------------
292
293 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
294 addPointers fil pty pointers group =
295 case pty of
296 TemporalPointer -> case fil of
297 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
298 ToParents -> group & phylo_groupPeriodParents .~ pointers
299 LevelPointer -> case fil of
300 ToChilds -> group & phylo_groupLevelChilds .~ pointers
301 ToParents -> group & phylo_groupLevelParents .~ pointers
302
303
304 getPeriodIds :: Phylo -> [(Date,Date)]
305 getPeriodIds phylo = sortOn fst
306 $ keys
307 $ phylo ^. phylo_periods
308
309 getLevelParentId :: PhyloGroup -> PhyloGroupId
310 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
311
312 getLastLevel :: Phylo -> Level
313 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
314
315 getLevels :: Phylo -> [Level]
316 getLevels phylo = nub
317 $ map snd
318 $ keys $ view ( phylo_periods
319 . traverse
320 . phylo_periodLevels ) phylo
321
322 getSeaElevation :: Phylo -> SeaElevation
323 getSeaElevation phylo = seaElevation (getConfig phylo)
324
325
326 getConfig :: Phylo -> Config
327 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
328
329
330 getRoots :: Phylo -> Vector Ngrams
331 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
332
333 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
334 phyloToLastBranches phylo = elems
335 $ fromListWith (++)
336 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
337 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
338
339 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
340 getGroupsFromLevel lvl phylo =
341 elems $ view ( phylo_periods
342 . traverse
343 . phylo_periodLevels
344 . traverse
345 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
346 . phylo_levelGroups ) phylo
347
348
349 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
350 getGroupsFromLevelPeriods lvl periods phylo =
351 elems $ view ( phylo_periods
352 . traverse
353 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
354 . phylo_periodLevels
355 . traverse
356 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
357 . phylo_levelGroups ) phylo
358
359
360 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
361 getGroupsFromPeriods lvl periods =
362 elems $ view ( traverse
363 . phylo_periodLevels
364 . traverse
365 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
366 . phylo_levelGroups ) periods
367
368
369 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
370 updatePhyloGroups lvl m phylo =
371 over ( phylo_periods
372 . traverse
373 . phylo_periodLevels
374 . traverse
375 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
376 . phylo_levelGroups
377 . traverse
378 ) (\group ->
379 let id = getGroupId group
380 in
381 if member id m
382 then m ! id
383 else group ) phylo
384
385
386 traceToPhylo :: Level -> Phylo -> Phylo
387 traceToPhylo lvl phylo =
388 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
389 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
390 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
391
392 --------------------
393 -- | Clustering | --
394 --------------------
395
396 relatedComponents :: Ord a => [[a]] -> [[a]]
397 relatedComponents graph = foldl' (\acc groups ->
398 if (null acc)
399 then acc ++ [groups]
400 else
401 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
402 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
403
404 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
405 toRelatedComponents nodes edges =
406 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
407 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
408 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
409
410
411 traceSynchronyEnd :: Phylo -> Phylo
412 traceSynchronyEnd phylo =
413 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
414 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
415 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
416 <> "\n" ) phylo
417
418 traceSynchronyStart :: Phylo -> Phylo
419 traceSynchronyStart phylo =
420 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
421 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
422 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
423 <> "\n" ) phylo
424
425
426 -------------------
427 -- | Proximity | --
428 -------------------
429
430 getSensibility :: Proximity -> Double
431 getSensibility proxi = case proxi of
432 WeightedLogJaccard s -> s
433 Hamming -> undefined
434
435 ----------------
436 -- | Branch | --
437 ----------------
438
439 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
440 intersectInit acc lst lst' =
441 if (null lst) || (null lst')
442 then acc
443 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
444 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
445 else acc
446
447 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
448 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
449
450 ngramsInBranches :: [[PhyloGroup]] -> [Int]
451 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
452
453
454 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
455 traceMatchSuccess thr qua qua' nextBranches =
456 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
457 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
458 <> ",(1.." <> show (length nextBranches) <> ")]"
459 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
460 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
461 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
462
463
464 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
465 traceMatchFailure thr qua qua' branches =
466 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
467 <> ",(1.." <> show (length branches) <> ")]"
468 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
469 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
470 ) branches
471
472
473 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
474 traceMatchNoSplit branches =
475 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
476 <> ",(1.." <> show (length branches) <> ")]"
477 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
478 <> " - unable to split in smaller branches" <> "\n"
479 ) branches
480
481
482 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
483 traceMatchLimit branches =
484 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
485 <> ",(1.." <> show (length branches) <> ")]"
486 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
487 <> " - unable to increase the threshold above 1" <> "\n"
488 ) branches
489
490
491 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
492 traceMatchEnd groups =
493 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
494 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
495
496
497 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
498 traceTemporalMatching groups =
499 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
500
501
502 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
503 traceGroupsProxi m =
504 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m