]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloTools.hs
Merge branch 'dev' into dev-doc-table-optimization
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE ViewPatterns #-}
12
13 module Gargantext.Viz.Phylo.PhyloTools where
14
15 import Data.Vector (Vector, elemIndex)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
17 import Data.Set (Set, disjoint)
18 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
19 import Data.String (String)
20 import Data.Text (Text, unwords)
21
22 import Gargantext.Prelude
23 import Gargantext.Viz.AdaptativePhylo
24 import Text.Printf
25
26
27 import Debug.Trace (trace)
28 import Control.Lens hiding (Level)
29
30 import qualified Data.Vector as Vector
31 import qualified Data.List as List
32 import qualified Data.Set as Set
33 import qualified Data.Map as Map
34
35 ------------
36 -- | Io | --
37 ------------
38
39 -- | To print an important message as an IO()
40 printIOMsg :: String -> IO ()
41 printIOMsg msg =
42 putStrLn ( "\n"
43 <> "------------"
44 <> "\n"
45 <> "-- | " <> msg <> "\n" )
46
47
48 -- | To print a comment as an IO()
49 printIOComment :: String -> IO ()
50 printIOComment cmt =
51 putStrLn ( "\n" <> cmt <> "\n" )
52
53
54 --------------
55 -- | Misc | --
56 --------------
57
58
59 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
60 roundToStr = printf "%0.*f"
61
62
63 countSup :: Double -> [Double] -> Int
64 countSup s l = length $ filter (>s) l
65
66
67 dropByIdx :: Int -> [a] -> [a]
68 dropByIdx k l = take k l ++ drop (k+1) l
69
70
71 elemIndex' :: Eq a => a -> [a] -> Int
72 elemIndex' e l = case (List.elemIndex e l) of
73 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
74 Just i -> i
75
76
77 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
78 commonPrefix lst lst' acc =
79 if (null lst || null lst')
80 then acc
81 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
82 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
83 else acc
84
85
86 ---------------------
87 -- | Foundations | --
88 ---------------------
89
90
91 -- | Is this Ngrams a Foundations Root ?
92 isRoots :: Ngrams -> Vector Ngrams -> Bool
93 isRoots n ns = Vector.elem n ns
94
95 -- | To transform a list of nrams into a list of foundation's index
96 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
97 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
98
99 -- | To transform a list of Ngrams Indexes into a Label
100 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
101 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
102
103
104 -- | To transform a list of Ngrams Indexes into a list of Text
105 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
106 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
107
108
109 --------------
110 -- | Time | --
111 --------------
112
113 -- | To transform a list of periods into a set of Dates
114 periodsToYears :: [(Date,Date)] -> Set Date
115 periodsToYears periods = (Set.fromList . sort . concat)
116 $ map (\(d,d') -> [d..d']) periods
117
118
119 findBounds :: [Date] -> (Date,Date)
120 findBounds dates =
121 let dates' = sort dates
122 in (head' "findBounds" dates', last' "findBounds" dates')
123
124
125 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
126 toPeriods dates p s =
127 let (start,end) = findBounds dates
128 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
129 $ chunkAlong p s [start .. end]
130
131
132 -- | Get a regular & ascendante timeScale from a given list of dates
133 toTimeScale :: [Date] -> Int -> [Date]
134 toTimeScale dates step =
135 let (start,end) = findBounds dates
136 in [start, (start + step) .. end]
137
138
139 getTimeStep :: TimeUnit -> Int
140 getTimeStep time = case time of
141 Year _ s _ -> s
142
143 getTimePeriod :: TimeUnit -> Int
144 getTimePeriod time = case time of
145 Year p _ _ -> p
146
147 getTimeFrame :: TimeUnit -> Int
148 getTimeFrame time = case time of
149 Year _ _ f -> f
150
151 -------------
152 -- | Fis | --
153 -------------
154
155
156 -- | To find if l' is nested in l
157 isNested :: Eq a => [a] -> [a] -> Bool
158 isNested l l'
159 | null l' = True
160 | length l' > length l = False
161 | (union l l') == l = True
162 | otherwise = False
163
164
165 -- | To filter Fis with small Support but by keeping non empty Periods
166 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
167 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
168 then keepFilled f (thr - 1) l
169 else f thr l
170
171
172 traceClique :: Map (Date, Date) [PhyloClique] -> String
173 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
174 where
175 --------------------------------------
176 cliques :: [Double]
177 cliques = sort $ map (fromIntegral . length . _phyloClique_nodes) $ concat $ elems mFis
178 --------------------------------------
179
180
181 traceSupport :: Map (Date, Date) [PhyloClique] -> String
182 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
183 where
184 --------------------------------------
185 supports :: [Double]
186 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
187 --------------------------------------
188
189
190 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
191 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
192 <> "Support : " <> (traceSupport mFis) <> "\n"
193 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
194
195
196 ---------------
197 -- | Clique| --
198 ---------------
199
200
201 getCliqueSupport :: Clique -> Int
202 getCliqueSupport unit = case unit of
203 Fis s _ -> s
204 MaxClique _ -> 0
205
206 getCliqueSize :: Clique -> Int
207 getCliqueSize unit = case unit of
208 Fis _ s -> s
209 MaxClique s -> s
210
211
212 --------------
213 -- | Cooc | --
214 --------------
215
216 listToCombi' :: [a] -> [(a,a)]
217 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
218
219 listToEqual' :: Eq a => [a] -> [(a,a)]
220 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
221
222 listToKeys :: Eq a => [a] -> [(a,a)]
223 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
224
225 listToMatrix :: [Int] -> Map (Int,Int) Double
226 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
227
228 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
229 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
230
231 listToSeq :: Eq a => [a] -> [(a,a)]
232 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
233
234 sumCooc :: Cooc -> Cooc -> Cooc
235 sumCooc cooc cooc' = unionWith (+) cooc cooc'
236
237 getTrace :: Cooc -> Double
238 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
239
240 coocToDiago :: Cooc -> Cooc
241 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
242
243 -- | To build the local cooc matrix of each phylogroup
244 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
245 ngramsToCooc ngrams coocs =
246 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
247 pairs = listToKeys ngrams
248 in filterWithKey (\k _ -> elem k pairs) cooc
249
250
251 --------------------
252 -- | PhyloGroup | --
253 --------------------
254
255 getGroupId :: PhyloGroup -> PhyloGroupId
256 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
257
258 idToPrd :: PhyloGroupId -> PhyloPeriodId
259 idToPrd id = (fst . fst) id
260
261 getGroupThr :: PhyloGroup -> Double
262 getGroupThr group = last' "getGroupThr" ((group ^. phylo_groupMeta) ! "breaks")
263
264 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
265 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
266
267 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
268 getPeriodPointers fil group =
269 case fil of
270 ToChilds -> group ^. phylo_groupPeriodChilds
271 ToParents -> group ^. phylo_groupPeriodParents
272
273 filterProximity :: Proximity -> Double -> Double -> Bool
274 filterProximity proximity thr local =
275 case proximity of
276 WeightedLogJaccard _ -> local >= thr
277 Hamming -> undefined
278
279 getProximityName :: Proximity -> String
280 getProximityName proximity =
281 case proximity of
282 WeightedLogJaccard _ -> "WLJaccard"
283 Hamming -> "Hamming"
284
285 ---------------
286 -- | Phylo | --
287 ---------------
288
289 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
290 addPointers fil pty pointers group =
291 case pty of
292 TemporalPointer -> case fil of
293 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
294 ToParents -> group & phylo_groupPeriodParents .~ pointers
295 LevelPointer -> case fil of
296 ToChilds -> group & phylo_groupLevelChilds .~ pointers
297 ToParents -> group & phylo_groupLevelParents .~ pointers
298
299
300 getPeriodIds :: Phylo -> [(Date,Date)]
301 getPeriodIds phylo = sortOn fst
302 $ keys
303 $ phylo ^. phylo_periods
304
305 getLevelParentId :: PhyloGroup -> PhyloGroupId
306 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
307
308 getLastLevel :: Phylo -> Level
309 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
310
311 getLevels :: Phylo -> [Level]
312 getLevels phylo = nub
313 $ map snd
314 $ keys $ view ( phylo_periods
315 . traverse
316 . phylo_periodLevels ) phylo
317
318 getSeaElevation :: Phylo -> SeaElevation
319 getSeaElevation phylo = seaElevation (getConfig phylo)
320
321
322 getConfig :: Phylo -> Config
323 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
324
325
326 getRoots :: Phylo -> Vector Ngrams
327 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
328
329 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
330 phyloToLastBranches phylo = elems
331 $ fromListWith (++)
332 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
333 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
334
335 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
336 getGroupsFromLevel lvl phylo =
337 elems $ view ( phylo_periods
338 . traverse
339 . phylo_periodLevels
340 . traverse
341 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
342 . phylo_levelGroups ) phylo
343
344
345 getGroupsFromLevelPeriods :: Level -> [PhyloPeriodId] -> Phylo -> [PhyloGroup]
346 getGroupsFromLevelPeriods lvl periods phylo =
347 elems $ view ( phylo_periods
348 . traverse
349 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
350 . phylo_periodLevels
351 . traverse
352 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
353 . phylo_levelGroups ) phylo
354
355
356 getGroupsFromPeriods :: Level -> Map PhyloPeriodId PhyloPeriod -> [PhyloGroup]
357 getGroupsFromPeriods lvl periods =
358 elems $ view ( traverse
359 . phylo_periodLevels
360 . traverse
361 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
362 . phylo_levelGroups ) periods
363
364
365 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
366 updatePhyloGroups lvl m phylo =
367 over ( phylo_periods
368 . traverse
369 . phylo_periodLevels
370 . traverse
371 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
372 . phylo_levelGroups
373 . traverse
374 ) (\group ->
375 let id = getGroupId group
376 in
377 if member id m
378 then m ! id
379 else group ) phylo
380
381
382 traceToPhylo :: Level -> Phylo -> Phylo
383 traceToPhylo lvl phylo =
384 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
385 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
386 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
387
388 --------------------
389 -- | Clustering | --
390 --------------------
391
392 relatedComponents :: Ord a => [[a]] -> [[a]]
393 relatedComponents graph = foldl' (\acc groups ->
394 if (null acc)
395 then acc ++ [groups]
396 else
397 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
398 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
399
400 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
401 toRelatedComponents nodes edges =
402 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
403 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
404 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
405
406
407 traceSynchronyEnd :: Phylo -> Phylo
408 traceSynchronyEnd phylo =
409 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
410 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
411 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
412 <> "\n" ) phylo
413
414 traceSynchronyStart :: Phylo -> Phylo
415 traceSynchronyStart phylo =
416 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
417 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
418 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
419 <> "\n" ) phylo
420
421
422 -------------------
423 -- | Proximity | --
424 -------------------
425
426 getSensibility :: Proximity -> Double
427 getSensibility proxi = case proxi of
428 WeightedLogJaccard s -> s
429 Hamming -> undefined
430
431 ----------------
432 -- | Branch | --
433 ----------------
434
435 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
436 intersectInit acc lst lst' =
437 if (null lst) || (null lst')
438 then acc
439 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
440 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
441 else acc
442
443 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
444 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
445
446 ngramsInBranches :: [[PhyloGroup]] -> [Int]
447 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
448
449
450 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
451 traceMatchSuccess thr qua qua' nextBranches =
452 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
453 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
454 <> ",(1.." <> show (length nextBranches) <> ")]"
455 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
456 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
457 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
458
459
460 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
461 traceMatchFailure thr qua qua' branches =
462 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
463 <> ",(1.." <> show (length branches) <> ")]"
464 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
465 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
466 ) branches
467
468
469 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
470 traceMatchNoSplit branches =
471 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
472 <> ",(1.." <> show (length branches) <> ")]"
473 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
474 <> " - unable to split in smaller branches" <> "\n"
475 ) branches
476
477
478 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
479 traceMatchLimit branches =
480 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
481 <> ",(1.." <> show (length branches) <> ")]"
482 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
483 <> " - unable to increase the threshold above 1" <> "\n"
484 ) branches
485
486
487 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
488 traceMatchEnd groups =
489 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
490 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
491
492
493 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
494 traceTemporalMatching groups =
495 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
496
497
498 traceGroupsProxi :: Map (PhyloGroupId,PhyloGroupId) Double -> Map (PhyloGroupId,PhyloGroupId) Double
499 traceGroupsProxi m =
500 trace ( "\n" <> "-- | " <> show(Map.size m) <> " computed pairs of groups proximity" <> "\n") m