]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloTools.hs
Merge branch 'dev-phylo' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell-gargantex...
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.PhyloTools where
18
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
25
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
28 import Text.Printf
29
30
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37 import qualified Data.Map as Map
38
39 ------------
40 -- | Io | --
41 ------------
42
43 -- | To print an important message as an IO()
44 printIOMsg :: String -> IO ()
45 printIOMsg msg =
46 putStrLn ( "\n"
47 <> "------------"
48 <> "\n"
49 <> "-- | " <> msg <> "\n" )
50
51
52 -- | To print a comment as an IO()
53 printIOComment :: String -> IO ()
54 printIOComment cmt =
55 putStrLn ( "\n" <> cmt <> "\n" )
56
57
58 --------------
59 -- | Misc | --
60 --------------
61
62
63 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
64 roundToStr = printf "%0.*f"
65
66
67 countSup :: Double -> [Double] -> Int
68 countSup s l = length $ filter (>s) l
69
70
71 dropByIdx :: Int -> [a] -> [a]
72 dropByIdx k l = take k l ++ drop (k+1) l
73
74
75 elemIndex' :: Eq a => a -> [a] -> Int
76 elemIndex' e l = case (List.elemIndex e l) of
77 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
78 Just i -> i
79
80
81 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
82 commonPrefix lst lst' acc =
83 if (null lst || null lst')
84 then acc
85 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
86 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
87 else acc
88
89
90 ---------------------
91 -- | Foundations | --
92 ---------------------
93
94
95 -- | Is this Ngrams a Foundations Root ?
96 isRoots :: Ngrams -> Vector Ngrams -> Bool
97 isRoots n ns = Vector.elem n ns
98
99 -- | To transform a list of nrams into a list of foundation's index
100 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
101 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
102
103 -- | To transform a list of Ngrams Indexes into a Label
104 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
105 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
106
107
108 -- | To transform a list of Ngrams Indexes into a list of Text
109 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
110 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
111
112
113 --------------
114 -- | Time | --
115 --------------
116
117 -- | To transform a list of periods into a set of Dates
118 periodsToYears :: [(Date,Date)] -> Set Date
119 periodsToYears periods = (Set.fromList . sort . concat)
120 $ map (\(d,d') -> [d..d']) periods
121
122
123 findBounds :: [Date] -> (Date,Date)
124 findBounds dates =
125 let dates' = sort dates
126 in (head' "findBounds" dates', last' "findBounds" dates')
127
128
129 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
130 toPeriods dates p s =
131 let (start,end) = findBounds dates
132 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
133 $ chunkAlong p s [start .. end]
134
135
136 -- | Get a regular & ascendante timeScale from a given list of dates
137 toTimeScale :: [Date] -> Int -> [Date]
138 toTimeScale dates step =
139 let (start,end) = findBounds dates
140 in [start, (start + step) .. end]
141
142
143 getTimeStep :: TimeUnit -> Int
144 getTimeStep time = case time of
145 Year _ s _ -> s
146
147 getTimePeriod :: TimeUnit -> Int
148 getTimePeriod time = case time of
149 Year p _ _ -> p
150
151 getTimeFrame :: TimeUnit -> Int
152 getTimeFrame time = case time of
153 Year _ _ f -> f
154
155 -------------
156 -- | Fis | --
157 -------------
158
159
160 -- | To find if l' is nested in l
161 isNested :: Eq a => [a] -> [a] -> Bool
162 isNested l l'
163 | null l' = True
164 | length l' > length l = False
165 | (union l l') == l = True
166 | otherwise = False
167
168
169 -- | To filter Fis with small Support but by keeping non empty Periods
170 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
171 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
172 then keepFilled f (thr - 1) l
173 else f thr l
174
175
176 traceClique :: Map (Date, Date) [PhyloClique] -> String
177 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
178 where
179 --------------------------------------
180 cliques :: [Double]
181 cliques = sort $ map (fromIntegral . size . _phyloClique_nodes) $ concat $ elems mFis
182 --------------------------------------
183
184
185 traceSupport :: Map (Date, Date) [PhyloClique] -> String
186 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
187 where
188 --------------------------------------
189 supports :: [Double]
190 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
191 --------------------------------------
192
193
194 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
195 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
196 <> "Support : " <> (traceSupport mFis) <> "\n"
197 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
198
199
200 ---------------
201 -- | Clique| --
202 ---------------
203
204
205 getCliqueSupport :: Clique -> Int
206 getCliqueSupport unit = case unit of
207 Fis s _ -> s
208 MaxClique _ -> 0
209
210 getCliqueSize :: Clique -> Int
211 getCliqueSize unit = case unit of
212 Fis _ s -> s
213 MaxClique s -> s
214
215
216 --------------
217 -- | Cooc | --
218 --------------
219
220 listToCombi' :: [a] -> [(a,a)]
221 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
222
223 listToEqual' :: Eq a => [a] -> [(a,a)]
224 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
225
226 listToKeys :: Eq a => [a] -> [(a,a)]
227 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
228
229 listToMatrix :: [Int] -> Map (Int,Int) Double
230 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
231
232 listToSeq :: Eq a => [a] -> [(a,a)]
233 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
234
235 sumCooc :: Cooc -> Cooc -> Cooc
236 sumCooc cooc cooc' = unionWith (+) cooc cooc'
237
238 getTrace :: Cooc -> Double
239 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
240
241 coocToDiago :: Cooc -> Cooc
242 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
243
244 -- | To build the local cooc matrix of each phylogroup
245 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
246 ngramsToCooc ngrams coocs =
247 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
248 pairs = listToKeys ngrams
249 in filterWithKey (\k _ -> elem k pairs) cooc
250
251
252 --------------------
253 -- | PhyloGroup | --
254 --------------------
255
256 getGroupId :: PhyloGroup -> PhyloGroupId
257 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
258
259 idToPrd :: PhyloGroupId -> PhyloPeriodId
260 idToPrd id = (fst . fst) id
261
262 getGroupThr :: PhyloGroup -> Double
263 getGroupThr group = last' "getGroupThr" ((group ^. phylo_groupMeta) ! "breaks")
264
265 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
266 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
267
268 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
269 getPeriodPointers fil group =
270 case fil of
271 ToChilds -> group ^. phylo_groupPeriodChilds
272 ToParents -> group ^. phylo_groupPeriodParents
273
274 filterProximity :: Proximity -> Double -> Double -> Bool
275 filterProximity proximity thr local =
276 case proximity of
277 WeightedLogJaccard _ -> local >= thr
278 Hamming -> undefined
279
280 getProximityName :: Proximity -> String
281 getProximityName proximity =
282 case proximity of
283 WeightedLogJaccard _ -> "WLJaccard"
284 Hamming -> "Hamming"
285
286 ---------------
287 -- | Phylo | --
288 ---------------
289
290 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
291 addPointers fil pty pointers group =
292 case pty of
293 TemporalPointer -> case fil of
294 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
295 ToParents -> group & phylo_groupPeriodParents .~ pointers
296 LevelPointer -> case fil of
297 ToChilds -> group & phylo_groupLevelChilds .~ pointers
298 ToParents -> group & phylo_groupLevelParents .~ pointers
299
300
301 getPeriodIds :: Phylo -> [(Date,Date)]
302 getPeriodIds phylo = sortOn fst
303 $ keys
304 $ phylo ^. phylo_periods
305
306 getLevelParentId :: PhyloGroup -> PhyloGroupId
307 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
308
309 getLastLevel :: Phylo -> Level
310 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
311
312 getLevels :: Phylo -> [Level]
313 getLevels phylo = nub
314 $ map snd
315 $ keys $ view ( phylo_periods
316 . traverse
317 . phylo_periodLevels ) phylo
318
319 getSeaElevation :: Phylo -> SeaElevation
320 getSeaElevation phylo = seaElevation (getConfig phylo)
321
322
323 getConfig :: Phylo -> Config
324 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
325
326
327 getRoots :: Phylo -> Vector Ngrams
328 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
329
330 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
331 phyloToLastBranches phylo = elems
332 $ fromListWith (++)
333 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
334 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
335
336 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
337 getGroupsFromLevel lvl phylo =
338 elems $ view ( phylo_periods
339 . traverse
340 . phylo_periodLevels
341 . traverse
342 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
343 . phylo_levelGroups ) phylo
344
345
346 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
347 getGroupsFromLevelPeriods lvl periods phylo =
348 elems $ view ( phylo_periods
349 . traverse
350 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
351 . phylo_periodLevels
352 . traverse
353 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
354 . phylo_levelGroups ) phylo
355
356
357 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
358 getGroupsFromPeriods lvl periods =
359 elems $ view ( traverse
360 . phylo_periodLevels
361 . traverse
362 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
363 . phylo_levelGroups ) periods
364
365
366 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
367 updatePhyloGroups lvl m phylo =
368 over ( phylo_periods
369 . traverse
370 . phylo_periodLevels
371 . traverse
372 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
373 . phylo_levelGroups
374 . traverse
375 ) (\group ->
376 let id = getGroupId group
377 in
378 if member id m
379 then m ! id
380 else group ) phylo
381
382
383 traceToPhylo :: Level -> Phylo -> Phylo
384 traceToPhylo lvl phylo =
385 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
386 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
387 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
388
389 --------------------
390 -- | Clustering | --
391 --------------------
392
393 relatedComponents :: Ord a => [[a]] -> [[a]]
394 relatedComponents graph = foldl' (\acc groups ->
395 if (null acc)
396 then acc ++ [groups]
397 else
398 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
399 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
400
401
402 traceSynchronyEnd :: Phylo -> Phylo
403 traceSynchronyEnd phylo =
404 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
405 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
406 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
407 <> "\n" ) phylo
408
409 traceSynchronyStart :: Phylo -> Phylo
410 traceSynchronyStart phylo =
411 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
412 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
413 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
414 <> "\n" ) phylo
415
416
417 -------------------
418 -- | Proximity | --
419 -------------------
420
421 getSensibility :: Proximity -> Double
422 getSensibility proxi = case proxi of
423 WeightedLogJaccard s -> s
424 Hamming -> undefined
425
426 ----------------
427 -- | Branch | --
428 ----------------
429
430 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
431 intersectInit acc lst lst' =
432 if (null lst) || (null lst')
433 then acc
434 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
435 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
436 else acc
437
438 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
439 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
440
441 ngramsInBranches :: [[PhyloGroup]] -> [Int]
442 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
443
444
445 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
446 traceMatchSuccess thr qua qua' nextBranches =
447 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
448 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
449 <> ",(1.." <> show (length nextBranches) <> ")]"
450 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
451 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
452 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
453
454
455 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
456 traceMatchFailure thr qua qua' branches =
457 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
458 <> ",(1.." <> show (length branches) <> ")]"
459 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
460 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
461 ) branches
462
463
464 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
465 traceMatchNoSplit branches =
466 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
467 <> ",(1.." <> show (length branches) <> ")]"
468 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
469 <> " - unable to split in smaller branches" <> "\n"
470 ) branches
471
472
473 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
474 traceMatchLimit branches =
475 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
476 <> ",(1.." <> show (length branches) <> ")]"
477 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
478 <> " - unable to increase the threshold above 1" <> "\n"
479 ) branches
480
481
482 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
483 traceMatchEnd groups =
484 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
485 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
486
487
488 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
489 traceTemporalMatching groups =
490 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
491
492
493 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
494 traceGroupsProxi m =
495 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m