]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloTools.hs
Merge branch 'dev-textflow' into dev
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE ViewPatterns #-}
12
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
14
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy, maximum, group)
17 import Data.Set (Set, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
19 import Data.String (String)
20 import Data.Text (Text, unwords)
21
22 import Gargantext.Prelude
23 import Gargantext.Core.Viz.AdaptativePhylo
24 import Text.Printf
25
26
27 import Debug.Trace (trace)
28 import Control.Lens hiding (Level)
29
30 import qualified Data.Vector as Vector
31 import qualified Data.List as List
32 import qualified Data.Set as Set
33 import qualified Data.Map as Map
34
35 ------------
36 -- | Io | --
37 ------------
38
39 -- | To print an important message as an IO()
40 printIOMsg :: String -> IO ()
41 printIOMsg msg =
42 putStrLn ( "\n"
43 <> "------------"
44 <> "\n"
45 <> "-- | " <> msg <> "\n" )
46
47
48 -- | To print a comment as an IO()
49 printIOComment :: String -> IO ()
50 printIOComment cmt =
51 putStrLn ( "\n" <> cmt <> "\n" )
52
53
54 --------------
55 -- | Misc | --
56 --------------
57
58
59 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
60 roundToStr = printf "%0.*f"
61
62
63 countSup :: Double -> [Double] -> Int
64 countSup s l = length $ filter (>s) l
65
66
67 dropByIdx :: Int -> [a] -> [a]
68 dropByIdx k l = take k l ++ drop (k+1) l
69
70
71 elemIndex' :: Eq a => a -> [a] -> Int
72 elemIndex' e l = case (List.elemIndex e l) of
73 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
74 Just i -> i
75
76
77 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
78 commonPrefix lst lst' acc =
79 if (null lst || null lst')
80 then acc
81 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
82 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
83 else acc
84
85
86 ---------------------
87 -- | Foundations | --
88 ---------------------
89
90
91 -- | Is this Ngrams a Foundations Root ?
92 isRoots :: Ngrams -> Vector Ngrams -> Bool
93 isRoots n ns = Vector.elem n ns
94
95 -- | To transform a list of nrams into a list of foundation's index
96 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
97 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
98
99 -- | To transform a list of Ngrams Indexes into a Label
100 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
101 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
102
103
104 -- | To transform a list of Ngrams Indexes into a list of Text
105 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
106 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
107
108
109 --------------
110 -- | Time | --
111 --------------
112
113 -- | To transform a list of periods into a set of Dates
114 periodsToYears :: [(Date,Date)] -> Set Date
115 periodsToYears periods = (Set.fromList . sort . concat)
116 $ map (\(d,d') -> [d..d']) periods
117
118
119 findBounds :: [Date] -> (Date,Date)
120 findBounds dates =
121 let dates' = sort dates
122 in (head' "findBounds" dates', last' "findBounds" dates')
123
124
125 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
126 toPeriods dates p s =
127 let (start,end) = findBounds dates
128 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
129 $ chunkAlong p s [start .. end]
130
131
132 -- | Get a regular & ascendante timeScale from a given list of dates
133 toTimeScale :: [Date] -> Int -> [Date]
134 toTimeScale dates step =
135 let (start,end) = findBounds dates
136 in [start, (start + step) .. end]
137
138
139 getTimeStep :: TimeUnit -> Int
140 getTimeStep time = case time of
141 Year _ s _ -> s
142
143 getTimePeriod :: TimeUnit -> Int
144 getTimePeriod time = case time of
145 Year p _ _ -> p
146
147 getTimeFrame :: TimeUnit -> Int
148 getTimeFrame time = case time of
149 Year _ _ f -> f
150
151 -------------
152 -- | Fis | --
153 -------------
154
155
156 -- | To find if l' is nested in l
157 isNested :: Eq a => [a] -> [a] -> Bool
158 isNested l l'
159 | null l' = True
160 | length l' > length l = False
161 | (union l l') == l = True
162 | otherwise = False
163
164
165 -- | To filter Fis with small Support but by keeping non empty Periods
166 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
167 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
168 then keepFilled f (thr - 1) l
169 else f thr l
170
171
172 traceClique :: Map (Date, Date) [PhyloClique] -> String
173 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
174 where
175 --------------------------------------
176 cliques :: [Double]
177 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
178 --------------------------------------
179
180
181 traceSupport :: Map (Date, Date) [PhyloClique] -> String
182 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
183 where
184 --------------------------------------
185 supports :: [Double]
186 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
187 --------------------------------------
188
189
190 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
191 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
192 <> "Support : " <> (traceSupport mFis) <> "\n"
193 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
194
195
196 ---------------
197 -- | Clique| --
198 ---------------
199
200
201 getCliqueSupport :: Clique -> Int
202 getCliqueSupport unit = case unit of
203 Fis s _ -> s
204 MaxClique _ -> 0
205
206 getCliqueSize :: Clique -> Int
207 getCliqueSize unit = case unit of
208 Fis _ s -> s
209 MaxClique s -> s
210
211
212 --------------
213 -- | Cooc | --
214 --------------
215
216 listToCombi' :: [a] -> [(a,a)]
217 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
218
219 listToEqual' :: Eq a => [a] -> [(a,a)]
220 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
221
222 listToKeys :: Eq a => [a] -> [(a,a)]
223 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
224
225 listToMatrix :: [Int] -> Map (Int,Int) Double
226 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
227
228 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
229 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
230
231 listToSeq :: Eq a => [a] -> [(a,a)]
232 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
233
234 sumCooc :: Cooc -> Cooc -> Cooc
235 sumCooc cooc cooc' = unionWith (+) cooc cooc'
236
237 getTrace :: Cooc -> Double
238 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
239
240 coocToDiago :: Cooc -> Cooc
241 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
242
243 -- | To build the local cooc matrix of each phylogroup
244 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
245 ngramsToCooc ngrams coocs =
246 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
247 pairs = listToKeys ngrams
248 in filterWithKey (\k _ -> elem k pairs) cooc
249
250
251 --------------------
252 -- | PhyloGroup | --
253 --------------------
254
255 getGroupId :: PhyloGroup -> PhyloGroupId
256 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupLevel), g ^. phylo_groupIndex)
257
258 idToPrd :: PhyloGroupId -> PhyloPeriodId
259 idToPrd id = (fst . fst) id
260
261 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
262 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
263
264 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
265 getPeriodPointers fil g =
266 case fil of
267 ToChilds -> g ^. phylo_groupPeriodChilds
268 ToParents -> g ^. phylo_groupPeriodParents
269
270 filterProximity :: Proximity -> Double -> Double -> Bool
271 filterProximity proximity thr local =
272 case proximity of
273 WeightedLogJaccard _ -> local >= thr
274 Hamming -> undefined
275
276 getProximityName :: Proximity -> String
277 getProximityName proximity =
278 case proximity of
279 WeightedLogJaccard _ -> "WLJaccard"
280 Hamming -> "Hamming"
281
282 ---------------
283 -- | Phylo | --
284 ---------------
285
286 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
287 addPointers fil pty pointers g =
288 case pty of
289 TemporalPointer -> case fil of
290 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
291 ToParents -> g & phylo_groupPeriodParents .~ pointers
292 LevelPointer -> case fil of
293 ToChilds -> g & phylo_groupLevelChilds .~ pointers
294 ToParents -> g & phylo_groupLevelParents .~ pointers
295
296
297 getPeriodIds :: Phylo -> [(Date,Date)]
298 getPeriodIds phylo = sortOn fst
299 $ keys
300 $ phylo ^. phylo_periods
301
302 getLevelParentId :: PhyloGroup -> PhyloGroupId
303 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
304
305 getLastLevel :: Phylo -> Level
306 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
307
308 getLevels :: Phylo -> [Level]
309 getLevels phylo = nub
310 $ map snd
311 $ keys $ view ( phylo_periods
312 . traverse
313 . phylo_periodLevels ) phylo
314
315 getSeaElevation :: Phylo -> SeaElevation
316 getSeaElevation phylo = seaElevation (getConfig phylo)
317
318
319 getConfig :: Phylo -> Config
320 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
321
322
323 getRoots :: Phylo -> Vector Ngrams
324 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
325
326 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
327 phyloToLastBranches phylo = elems
328 $ fromListWith (++)
329 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
330 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
331
332 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
333 getGroupsFromLevel lvl phylo =
334 elems $ view ( phylo_periods
335 . traverse
336 . phylo_periodLevels
337 . traverse
338 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
339 . phylo_levelGroups ) phylo
340
341
342 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
343 getGroupsFromLevelPeriods lvl periods phylo =
344 elems $ view ( phylo_periods
345 . traverse
346 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
347 . phylo_periodLevels
348 . traverse
349 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
350 . phylo_levelGroups ) phylo
351
352
353 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
354 getGroupsFromPeriods lvl periods =
355 elems $ view ( traverse
356 . phylo_periodLevels
357 . traverse
358 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
359 . phylo_levelGroups ) periods
360
361
362 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
363 updatePhyloGroups lvl m phylo =
364 over ( phylo_periods
365 . traverse
366 . phylo_periodLevels
367 . traverse
368 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
369 . phylo_levelGroups
370 . traverse
371 ) (\g ->
372 let id = getGroupId g
373 in
374 if member id m
375 then m ! id
376 else g ) phylo
377
378
379 traceToPhylo :: Level -> Phylo -> Phylo
380 traceToPhylo lvl phylo =
381 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
382 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
383 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
384
385 --------------------
386 -- | Clustering | --
387 --------------------
388
389 mergeBranchIds :: [[Int]] -> [Int]
390 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
391 where
392 -- | 2) find the most Up Left ids in the hierarchy of similarity
393 -- mostUpLeft :: [[Int]] -> [[Int]]
394 -- mostUpLeft ids' =
395 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
396 -- inf = (fst . minimum) groupIds
397 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
398 -- | 1) find the most frequent ids
399 mostFreq' :: [[Int]] -> [[Int]]
400 mostFreq' ids' =
401 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
402 sup = (fst . maximum) groupIds
403 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
404
405
406 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
407 mergeMeta bId groups =
408 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
409 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
410
411
412 groupsToBranches :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
413 groupsToBranches groups =
414 -- | run the related component algorithm
415 let egos = map (\g -> [getGroupId g]
416 ++ (map fst $ g ^. phylo_groupPeriodParents)
417 ++ (map fst $ g ^. phylo_groupPeriodChilds)
418 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
419 graph = relatedComponents egos
420 -- | update each group's branch id
421 in map (\ids ->
422 let groups' = elems $ restrictKeys groups (Set.fromList ids)
423 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
424 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
425
426 relatedComponents :: Ord a => [[a]] -> [[a]]
427 relatedComponents graph = foldl' (\acc groups ->
428 if (null acc)
429 then acc ++ [groups]
430 else
431 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
432 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
433
434 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
435 toRelatedComponents nodes edges =
436 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
437 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
438 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
439
440
441 traceSynchronyEnd :: Phylo -> Phylo
442 traceSynchronyEnd phylo =
443 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
444 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
445 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
446 <> "\n" ) phylo
447
448 traceSynchronyStart :: Phylo -> Phylo
449 traceSynchronyStart phylo =
450 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
451 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
452 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
453 <> "\n" ) phylo
454
455
456 -------------------
457 -- | Proximity | --
458 -------------------
459
460 getSensibility :: Proximity -> Double
461 getSensibility proxi = case proxi of
462 WeightedLogJaccard s -> s
463 Hamming -> undefined
464
465 ----------------
466 -- | Branch | --
467 ----------------
468
469 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
470 intersectInit acc lst lst' =
471 if (null lst) || (null lst')
472 then acc
473 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
474 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
475 else acc
476
477 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
478 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
479
480 ngramsInBranches :: [[PhyloGroup]] -> [Int]
481 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
482
483
484 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
485 traceMatchSuccess thr qua qua' nextBranches =
486 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
487 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
488 <> ",(1.." <> show (length nextBranches) <> ")]"
489 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
490 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
491 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
492
493
494 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
495 traceMatchFailure thr qua qua' branches =
496 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
497 <> ",(1.." <> show (length branches) <> ")]"
498 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
499 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
500 ) branches
501
502
503 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
504 traceMatchNoSplit branches =
505 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
506 <> ",(1.." <> show (length branches) <> ")]"
507 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
508 <> " - unable to split in smaller branches" <> "\n"
509 ) branches
510
511
512 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
513 traceMatchLimit branches =
514 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
515 <> ",(1.." <> show (length branches) <> ")]"
516 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
517 <> " - unable to increase the threshold above 1" <> "\n"
518 ) branches
519
520
521 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
522 traceMatchEnd groups =
523 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
524 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
525
526
527 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
528 traceTemporalMatching groups =
529 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
530
531
532 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
533 traceGroupsProxi m =
534 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m