]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloTools.hs
[Phylo] Clean
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE ViewPatterns #-}
12
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
14
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, group)
17 import Data.Set (Set, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
19 import Data.String (String)
20 import Data.Text (Text)
21
22 import Gargantext.Prelude
23 import Gargantext.Core.Viz.AdaptativePhylo
24 import Text.Printf
25
26
27 import Debug.Trace (trace)
28 import Control.Lens hiding (Level)
29
30 import qualified Data.Vector as Vector
31 import qualified Data.List as List
32 import qualified Data.Set as Set
33 import qualified Data.Map as Map
34 import qualified Data.Text as Text
35
36 ------------
37 -- | Io | --
38 ------------
39
40 -- | To print an important message as an IO()
41 printIOMsg :: String -> IO ()
42 printIOMsg msg =
43 putStrLn ( "\n"
44 <> "------------"
45 <> "\n"
46 <> "-- | " <> msg <> "\n" )
47
48
49 -- | To print a comment as an IO()
50 printIOComment :: String -> IO ()
51 printIOComment cmt =
52 putStrLn ( "\n" <> cmt <> "\n" )
53
54
55 --------------
56 -- | Misc | --
57 --------------
58
59
60 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
61 roundToStr = printf "%0.*f"
62
63
64 countSup :: Double -> [Double] -> Int
65 countSup s l = length $ filter (>s) l
66
67
68 dropByIdx :: Int -> [a] -> [a]
69 dropByIdx k l = take k l ++ drop (k+1) l
70
71
72 elemIndex' :: Eq a => a -> [a] -> Int
73 elemIndex' e l = case (List.elemIndex e l) of
74 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
75 Just i -> i
76
77
78 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
79 commonPrefix lst lst' acc =
80 if (null lst || null lst')
81 then acc
82 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
83 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
84 else acc
85
86
87 ---------------------
88 -- | Foundations | --
89 ---------------------
90
91
92 -- | Is this Ngrams a Foundations Root ?
93 isRoots :: Ngrams -> Vector Ngrams -> Bool
94 isRoots n ns = Vector.elem n ns
95
96 -- | To transform a list of nrams into a list of foundation's index
97 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
98 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
99
100 -- | To transform a list of Ngrams Indexes into a Label
101 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
102 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
103
104 idxToLabel :: [Int] -> String
105 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
106
107 idxToLabel' :: [Double] -> String
108 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
109
110 -- | To transform a list of Ngrams Indexes into a list of Text
111 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
112 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
113
114
115 --------------
116 -- | Time | --
117 --------------
118
119 -- | To transform a list of periods into a set of Dates
120 periodsToYears :: [(Date,Date)] -> Set Date
121 periodsToYears periods = (Set.fromList . sort . concat)
122 $ map (\(d,d') -> [d..d']) periods
123
124
125 findBounds :: [Date] -> (Date,Date)
126 findBounds dates =
127 let dates' = sort dates
128 in (head' "findBounds" dates', last' "findBounds" dates')
129
130
131 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
132 toPeriods dates p s =
133 let (start,end) = findBounds dates
134 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
135 $ chunkAlong p s [start .. end]
136
137
138 -- | Get a regular & ascendante timeScale from a given list of dates
139 toTimeScale :: [Date] -> Int -> [Date]
140 toTimeScale dates step =
141 let (start,end) = findBounds dates
142 in [start, (start + step) .. end]
143
144
145 getTimeStep :: TimeUnit -> Int
146 getTimeStep time = case time of
147 Year _ s _ -> s
148
149 getTimePeriod :: TimeUnit -> Int
150 getTimePeriod time = case time of
151 Year p _ _ -> p
152
153 getTimeFrame :: TimeUnit -> Int
154 getTimeFrame time = case time of
155 Year _ _ f -> f
156
157 -------------
158 -- | Fis | --
159 -------------
160
161
162 -- | To find if l' is nested in l
163 isNested :: Eq a => [a] -> [a] -> Bool
164 isNested l l'
165 | null l' = True
166 | length l' > length l = False
167 | (union l l') == l = True
168 | otherwise = False
169
170
171 -- | To filter Fis with small Support but by keeping non empty Periods
172 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
173 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
174 then keepFilled f (thr - 1) l
175 else f thr l
176
177
178 traceClique :: Map (Date, Date) [PhyloClique] -> String
179 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
180 where
181 --------------------------------------
182 cliques :: [Double]
183 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
184 --------------------------------------
185
186
187 traceSupport :: Map (Date, Date) [PhyloClique] -> String
188 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
189 where
190 --------------------------------------
191 supports :: [Double]
192 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
193 --------------------------------------
194
195
196 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
197 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
198 <> "Support : " <> (traceSupport mFis) <> "\n"
199 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
200
201
202 ---------------
203 -- | Clique| --
204 ---------------
205
206
207 getCliqueSupport :: Clique -> Int
208 getCliqueSupport unit = case unit of
209 Fis s _ -> s
210 MaxClique _ -> 0
211
212 getCliqueSize :: Clique -> Int
213 getCliqueSize unit = case unit of
214 Fis _ s -> s
215 MaxClique s -> s
216
217
218 --------------
219 -- | Cooc | --
220 --------------
221
222 listToCombi' :: [a] -> [(a,a)]
223 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
224
225 listToEqual' :: Eq a => [a] -> [(a,a)]
226 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
227
228 listToKeys :: Eq a => [a] -> [(a,a)]
229 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
230
231 listToMatrix :: [Int] -> Map (Int,Int) Double
232 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
233
234 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
235 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
236
237 listToSeq :: Eq a => [a] -> [(a,a)]
238 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
239
240 sumCooc :: Cooc -> Cooc -> Cooc
241 sumCooc cooc cooc' = unionWith (+) cooc cooc'
242
243 getTrace :: Cooc -> Double
244 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
245
246 coocToDiago :: Cooc -> Cooc
247 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
248
249 -- | To build the local cooc matrix of each phylogroup
250 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
251 ngramsToCooc ngrams coocs =
252 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
253 pairs = listToKeys ngrams
254 in filterWithKey (\k _ -> elem k pairs) cooc
255
256
257 --------------------
258 -- | PhyloGroup | --
259 --------------------
260
261 getGroupId :: PhyloGroup -> PhyloGroupId
262 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
263
264 idToPrd :: PhyloGroupId -> PhyloPeriodId
265 idToPrd id = (fst . fst) id
266
267 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
268 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
269
270 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
271 getPeriodPointers fil g =
272 case fil of
273 ToChilds -> g ^. phylo_groupPeriodChilds
274 ToParents -> g ^. phylo_groupPeriodParents
275
276 filterProximity :: Proximity -> Double -> Double -> Bool
277 filterProximity proximity thr local =
278 case proximity of
279 WeightedLogJaccard _ -> local >= thr
280 Hamming -> undefined
281
282 getProximityName :: Proximity -> String
283 getProximityName proximity =
284 case proximity of
285 WeightedLogJaccard _ -> "WLJaccard"
286 Hamming -> "Hamming"
287
288 ---------------
289 -- | Phylo | --
290 ---------------
291
292 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
293 addPointers fil pty pointers g =
294 case pty of
295 TemporalPointer -> case fil of
296 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
297 ToParents -> g & phylo_groupPeriodParents .~ pointers
298 LevelPointer -> case fil of
299 ToChilds -> g & phylo_groupLevelChilds .~ pointers
300 ToParents -> g & phylo_groupLevelParents .~ pointers
301
302
303 getPeriodIds :: Phylo -> [(Date,Date)]
304 getPeriodIds phylo = sortOn fst
305 $ keys
306 $ phylo ^. phylo_periods
307
308 getLevelParentId :: PhyloGroup -> PhyloGroupId
309 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
310
311 getLastLevel :: Phylo -> Level
312 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
313
314 getLevels :: Phylo -> [Level]
315 getLevels phylo = nub
316 $ map snd
317 $ keys $ view ( phylo_periods
318 . traverse
319 . phylo_periodLevels ) phylo
320
321 getSeaElevation :: Phylo -> SeaElevation
322 getSeaElevation phylo = seaElevation (getConfig phylo)
323
324
325 getConfig :: Phylo -> Config
326 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
327
328
329 getRoots :: Phylo -> Vector Ngrams
330 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
331
332 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
333 phyloToLastBranches phylo = elems
334 $ fromListWith (++)
335 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
336 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
337
338 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
339 getGroupsFromLevel lvl phylo =
340 elems $ view ( phylo_periods
341 . traverse
342 . phylo_periodLevels
343 . traverse
344 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
345 . phylo_levelGroups ) phylo
346
347
348 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
349 getGroupsFromLevelPeriods lvl periods phylo =
350 elems $ view ( phylo_periods
351 . traverse
352 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
353 . phylo_periodLevels
354 . traverse
355 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
356 . phylo_levelGroups ) phylo
357
358
359 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
360 getGroupsFromPeriods lvl periods =
361 elems $ view ( traverse
362 . phylo_periodLevels
363 . traverse
364 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
365 . phylo_levelGroups ) periods
366
367
368 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
369 updatePhyloGroups lvl m phylo =
370 over ( phylo_periods
371 . traverse
372 . phylo_periodLevels
373 . traverse
374 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
375 . phylo_levelGroups
376 . traverse
377 ) (\g ->
378 let id = getGroupId g
379 in
380 if member id m
381 then m ! id
382 else g ) phylo
383
384
385 traceToPhylo :: Level -> Phylo -> Phylo
386 traceToPhylo lvl phylo =
387 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
388 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
389 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
390
391 --------------------
392 -- | Clustering | --
393 --------------------
394
395 mergeBranchIds :: [[Int]] -> [Int]
396 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
397 where
398 -- | 2) find the most Up Left ids in the hierarchy of similarity
399 -- mostUpLeft :: [[Int]] -> [[Int]]
400 -- mostUpLeft ids' =
401 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
402 -- inf = (fst . minimum) groupIds
403 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
404 -- | 1) find the most frequent ids
405 mostFreq' :: [[Int]] -> [[Int]]
406 mostFreq' ids' =
407 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
408 sup = (fst . maximum) groupIds
409 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
410
411
412 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
413 mergeMeta bId groups =
414 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
415 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
416
417
418 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
419 groupsToBranches groups =
420 {- run the related component algorithm -}
421 let egos = map (\g -> [getGroupId g]
422 ++ (map fst $ g ^. phylo_groupPeriodParents)
423 ++ (map fst $ g ^. phylo_groupPeriodChilds)
424 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
425 graph = relatedComponents egos
426 {- update each group's branch id -}
427 in map (\ids ->
428 let groups' = elems $ restrictKeys groups (Set.fromList ids)
429 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
430 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
431
432 relatedComponents :: Ord a => [[a]] -> [[a]]
433 relatedComponents graph = foldl' (\acc groups ->
434 if (null acc)
435 then acc ++ [groups]
436 else
437 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
438 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
439
440 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
441 toRelatedComponents nodes edges =
442 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
443 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
444 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
445
446
447 traceSynchronyEnd :: Phylo -> Phylo
448 traceSynchronyEnd phylo =
449 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
450 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
451 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
452 <> "\n" ) phylo
453
454 traceSynchronyStart :: Phylo -> Phylo
455 traceSynchronyStart phylo =
456 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
457 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
458 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
459 <> "\n" ) phylo
460
461
462 -------------------
463 -- | Proximity | --
464 -------------------
465
466 getSensibility :: Proximity -> Double
467 getSensibility proxi = case proxi of
468 WeightedLogJaccard s -> s
469 Hamming -> undefined
470
471 ----------------
472 -- | Branch | --
473 ----------------
474
475 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
476 intersectInit acc lst lst' =
477 if (null lst) || (null lst')
478 then acc
479 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
480 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
481 else acc
482
483 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
484 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
485
486 ngramsInBranches :: [[PhyloGroup]] -> [Int]
487 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
488
489
490 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
491 traceMatchSuccess thr qua qua' nextBranches =
492 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
493 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
494 <> ",(1.." <> show (length nextBranches) <> ")]"
495 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
496 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
497 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
498
499
500 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
501 traceMatchFailure thr qua qua' branches =
502 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
503 <> ",(1.." <> show (length branches) <> ")]"
504 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
505 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
506 ) branches
507
508
509 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
510 traceMatchNoSplit branches =
511 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
512 <> ",(1.." <> show (length branches) <> ")]"
513 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
514 <> " - unable to split in smaller branches" <> "\n"
515 ) branches
516
517
518 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
519 traceMatchLimit branches =
520 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
521 <> ",(1.." <> show (length branches) <> ")]"
522 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
523 <> " - unable to increase the threshold above 1" <> "\n"
524 ) branches
525
526
527 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
528 traceMatchEnd groups =
529 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
530 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
531
532
533 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
534 traceTemporalMatching groups =
535 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
536
537
538 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
539 traceGroupsProxi m =
540 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m