]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloTools.hs
add adjustable clustering by threshold
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.PhyloTools where
18
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
25
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
28 import Text.Printf
29
30
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37
38 ------------
39 -- | Io | --
40 ------------
41
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
44 printIOMsg msg =
45 putStrLn ( "\n"
46 <> "------------"
47 <> "\n"
48 <> "-- | " <> msg <> "\n" )
49
50
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
53 printIOComment cmt =
54 putStrLn ( "\n" <> cmt <> "\n" )
55
56
57 --------------
58 -- | Misc | --
59 --------------
60
61
62 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
63 roundToStr = printf "%0.*f"
64
65
66 countSup :: Double -> [Double] -> Int
67 countSup s l = length $ filter (>s) l
68
69 dropByIdx :: Int -> [a] -> [a]
70 dropByIdx k l = take k l ++ drop (k+1) l
71
72
73 elemIndex' :: Eq a => a -> [a] -> Int
74 elemIndex' e l = case (List.elemIndex e l) of
75 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
76 Just i -> i
77
78
79 ---------------------
80 -- | Foundations | --
81 ---------------------
82
83
84 -- | Is this Ngrams a Foundations Root ?
85 isRoots :: Ngrams -> Vector Ngrams -> Bool
86 isRoots n ns = Vector.elem n ns
87
88 -- | To transform a list of nrams into a list of foundation's index
89 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
90 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
91
92 -- | To transform a list of Ngrams Indexes into a Label
93 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
94 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
95
96
97 -- | To transform a list of Ngrams Indexes into a list of Text
98 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
99 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
100
101
102 --------------
103 -- | Time | --
104 --------------
105
106 -- | To transform a list of periods into a set of Dates
107 periodsToYears :: [(Date,Date)] -> Set Date
108 periodsToYears periods = (Set.fromList . sort . concat)
109 $ map (\(d,d') -> [d..d']) periods
110
111
112 findBounds :: [Date] -> (Date,Date)
113 findBounds dates =
114 let dates' = sort dates
115 in (head' "findBounds" dates', last' "findBounds" dates')
116
117
118 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
119 toPeriods dates p s =
120 let (start,end) = findBounds dates
121 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
122 $ chunkAlong p s [start .. end]
123
124
125 -- | Get a regular & ascendante timeScale from a given list of dates
126 toTimeScale :: [Date] -> Int -> [Date]
127 toTimeScale dates step =
128 let (start,end) = findBounds dates
129 in [start, (start + step) .. end]
130
131
132 getTimeStep :: TimeUnit -> Int
133 getTimeStep time = case time of
134 Year _ s _ -> s
135
136 getTimePeriod :: TimeUnit -> Int
137 getTimePeriod time = case time of
138 Year p _ _ -> p
139
140 getTimeFrame :: TimeUnit -> Int
141 getTimeFrame time = case time of
142 Year _ _ f -> f
143
144 -------------
145 -- | Fis | --
146 -------------
147
148
149 -- | To find if l' is nested in l
150 isNested :: Eq a => [a] -> [a] -> Bool
151 isNested l l'
152 | null l' = True
153 | length l' > length l = False
154 | (union l l') == l = True
155 | otherwise = False
156
157
158 -- | To filter Fis with small Support but by keeping non empty Periods
159 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
160 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
161 then keepFilled f (thr - 1) l
162 else f thr l
163
164
165 traceClique :: Map (Date, Date) [PhyloClique] -> String
166 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
167 where
168 --------------------------------------
169 cliques :: [Double]
170 cliques = sort $ map (fromIntegral . size . _phyloClique_nodes) $ concat $ elems mFis
171 --------------------------------------
172
173
174 traceSupport :: Map (Date, Date) [PhyloClique] -> String
175 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
176 where
177 --------------------------------------
178 supports :: [Double]
179 supports = sort $ map (fromIntegral . _phyloClique_support) $ concat $ elems mFis
180 --------------------------------------
181
182
183 traceFis :: [Char] -> Map (Date, Date) [PhyloClique] -> Map (Date, Date) [PhyloClique]
184 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
185 <> "Support : " <> (traceSupport mFis) <> "\n"
186 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
187
188
189 ---------------
190 -- | Clique| --
191 ---------------
192
193
194 getCliqueSupport :: Clique -> Int
195 getCliqueSupport unit = case unit of
196 Fis s _ -> s
197 MaxClique _ -> 0
198
199 getCliqueSize :: Clique -> Int
200 getCliqueSize unit = case unit of
201 Fis _ s -> s
202 MaxClique s -> s
203
204
205 --------------
206 -- | Cooc | --
207 --------------
208
209 listToCombi' :: [a] -> [(a,a)]
210 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
211
212 listToEqual' :: Eq a => [a] -> [(a,a)]
213 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
214
215 listToKeys :: Eq a => [a] -> [(a,a)]
216 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
217
218 listToMatrix :: [Int] -> Map (Int,Int) Double
219 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
220
221 listToSeq :: Eq a => [a] -> [(a,a)]
222 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
223
224 sumCooc :: Cooc -> Cooc -> Cooc
225 sumCooc cooc cooc' = unionWith (+) cooc cooc'
226
227 getTrace :: Cooc -> Double
228 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
229
230
231 -- | To build the local cooc matrix of each phylogroup
232 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
233 ngramsToCooc ngrams coocs =
234 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
235 pairs = listToKeys ngrams
236 in filterWithKey (\k _ -> elem k pairs) cooc
237
238
239 --------------------
240 -- | PhyloGroup | --
241 --------------------
242
243 getGroupId :: PhyloGroup -> PhyloGroupId
244 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
245
246 getGroupThr :: PhyloGroup -> Double
247 getGroupThr group = head' "getGroupThr" ((group ^. phylo_groupMeta) ! "thr")
248
249 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
250 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
251
252 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
253 getPeriodPointers fil group =
254 case fil of
255 ToChilds -> group ^. phylo_groupPeriodChilds
256 ToParents -> group ^. phylo_groupPeriodParents
257
258 filterProximity :: Proximity -> Double -> Double -> Bool
259 filterProximity proximity thr local =
260 case proximity of
261 WeightedLogJaccard _ _ _ -> local >= thr
262 Hamming -> undefined
263
264 getProximityName :: Proximity -> String
265 getProximityName proximity =
266 case proximity of
267 WeightedLogJaccard _ _ _ -> "WLJaccard"
268 Hamming -> "Hamming"
269
270 getProximityInit :: Proximity -> Double
271 getProximityInit proximity =
272 case proximity of
273 WeightedLogJaccard _ i _ -> i
274 Hamming -> undefined
275
276
277 getProximityStep :: Proximity -> Double
278 getProximityStep proximity =
279 case proximity of
280 WeightedLogJaccard _ _ s -> s
281 Hamming -> undefined
282
283 ---------------
284 -- | Phylo | --
285 ---------------
286
287 addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
288 addPointers group fil pty pointers =
289 case pty of
290 TemporalPointer -> case fil of
291 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
292 ToParents -> group & phylo_groupPeriodParents .~ pointers
293 LevelPointer -> case fil of
294 ToChilds -> group & phylo_groupLevelChilds .~ pointers
295 ToParents -> group & phylo_groupLevelParents .~ pointers
296
297
298 getPeriodIds :: Phylo -> [(Date,Date)]
299 getPeriodIds phylo = sortOn fst
300 $ keys
301 $ phylo ^. phylo_periods
302
303 getLevelParentId :: PhyloGroup -> PhyloGroupId
304 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
305
306 getLastLevel :: Phylo -> Level
307 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
308
309 getLevels :: Phylo -> [Level]
310 getLevels phylo = nub
311 $ map snd
312 $ keys $ view ( phylo_periods
313 . traverse
314 . phylo_periodLevels ) phylo
315
316
317 getPhyloThresholdInit :: Phylo -> Double
318 getPhyloThresholdInit phylo = getThresholdInit (phyloProximity (getConfig phylo))
319
320
321 getPhyloThresholdStep :: Phylo -> Double
322 getPhyloThresholdStep phylo = getThresholdStep (phyloProximity (getConfig phylo))
323
324
325 getConfig :: Phylo -> Config
326 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
327
328
329 getRoots :: Phylo -> Vector Ngrams
330 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
331
332 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
333 phyloToLastBranches phylo = elems
334 $ fromListWith (++)
335 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
336 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
337
338 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
339 getGroupsFromLevel lvl phylo =
340 elems $ view ( phylo_periods
341 . traverse
342 . phylo_periodLevels
343 . traverse
344 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
345 . phylo_levelGroups ) phylo
346
347
348 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
349 updatePhyloGroups lvl m phylo =
350 over ( phylo_periods
351 . traverse
352 . phylo_periodLevels
353 . traverse
354 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
355 . phylo_levelGroups
356 . traverse
357 ) (\group ->
358 let id = getGroupId group
359 in
360 if member id m
361 then m ! id
362 else group ) phylo
363
364
365 traceToPhylo :: Level -> Phylo -> Phylo
366 traceToPhylo lvl phylo =
367 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
368 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
369 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
370
371 --------------------
372 -- | Clustering | --
373 --------------------
374
375 relatedComponents :: Ord a => [[a]] -> [[a]]
376 relatedComponents graph = foldl' (\acc groups ->
377 if (null acc)
378 then acc ++ [groups]
379 else
380 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
381 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
382
383
384 traceSynchronyEnd :: Phylo -> Phylo
385 traceSynchronyEnd phylo =
386 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
387 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
388 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
389 <> "\n" ) phylo
390
391 traceSynchronyStart :: Phylo -> Phylo
392 traceSynchronyStart phylo =
393 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
394 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
395 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
396 <> "\n" ) phylo
397
398
399 -------------------
400 -- | Proximity | --
401 -------------------
402
403 getSensibility :: Proximity -> Double
404 getSensibility proxi = case proxi of
405 WeightedLogJaccard s _ _ -> s
406 Hamming -> undefined
407
408 getThresholdInit :: Proximity -> Double
409 getThresholdInit proxi = case proxi of
410 WeightedLogJaccard _ t _ -> t
411 Hamming -> undefined
412
413 getThresholdStep :: Proximity -> Double
414 getThresholdStep proxi = case proxi of
415 WeightedLogJaccard _ _ s -> s
416 Hamming -> undefined
417
418
419 traceBranchMatching :: Proximity -> Double -> [PhyloGroup] -> [PhyloGroup]
420 traceBranchMatching proxi thr groups = case proxi of
421 WeightedLogJaccard _ i s -> trace (
422 roundToStr 2 thr <> " "
423 <> foldl (\acc _ -> acc <> ".") "." [(10*i),(10*i + 10*s)..(10*thr)]
424 <> " " <> show(length groups) <> " groups"
425 ) groups
426 Hamming -> undefined
427
428 ----------------
429 -- | Branch | --
430 ----------------
431
432 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
433 intersectInit acc lst lst' =
434 if (null lst) || (null lst')
435 then acc
436 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
437 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
438 else acc
439
440 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
441 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
442
443 ngramsInBranches :: [[PhyloGroup]] -> [Int]
444 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
445
446
447 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
448 traceMatchSuccess thr qua qua' nextBranches =
449 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
450 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
451 <> ",(1.." <> show (length nextBranches) <> ")]"
452 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
453 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
454 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
455
456
457 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
458 traceMatchFailure thr qua qua' branches =
459 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
460 <> ",(1.." <> show (length branches) <> ")]"
461 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
462 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
463 ) branches
464
465
466 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
467 traceMatchNoSplit branches =
468 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
469 <> ",(1.." <> show (length branches) <> ")]"
470 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
471 <> " - unable to split in smaller branches" <> "\n"
472 ) branches
473
474
475 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
476 traceMatchLimit branches =
477 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
478 <> ",(1.." <> show (length branches) <> ")]"
479 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
480 <> " - unable to increase the threshold above 1" <> "\n"
481 ) branches
482
483
484 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
485 traceMatchEnd groups =
486 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
487 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
488
489
490 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
491 traceTemporalMatching groups =
492 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups