]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloTools.hs
add new synchronic clustering
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.PhyloTools where
18
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
25
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
28 import Text.Printf
29
30
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37
38 ------------
39 -- | Io | --
40 ------------
41
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
44 printIOMsg msg =
45 putStrLn ( "\n"
46 <> "------------"
47 <> "\n"
48 <> "-- | " <> msg <> "\n" )
49
50
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
53 printIOComment cmt =
54 putStrLn ( "\n" <> cmt <> "\n" )
55
56
57 --------------
58 -- | Misc | --
59 --------------
60
61
62 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
63 roundToStr = printf "%0.*f"
64
65
66 countSup :: Double -> [Double] -> Int
67 countSup s l = length $ filter (>s) l
68
69
70 elemIndex' :: Eq a => a -> [a] -> Int
71 elemIndex' e l = case (List.elemIndex e l) of
72 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
73 Just i -> i
74
75
76 ---------------------
77 -- | Foundations | --
78 ---------------------
79
80
81 -- | Is this Ngrams a Foundations Root ?
82 isRoots :: Ngrams -> Vector Ngrams -> Bool
83 isRoots n ns = Vector.elem n ns
84
85 -- | To transform a list of nrams into a list of foundation's index
86 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
87 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
88
89 -- | To transform a list of Ngrams Indexes into a Label
90 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
91 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
92
93
94 -- | To transform a list of Ngrams Indexes into a list of Text
95 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
96 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
97
98
99 --------------
100 -- | Time | --
101 --------------
102
103 -- | To transform a list of periods into a set of Dates
104 periodsToYears :: [(Date,Date)] -> Set Date
105 periodsToYears periods = (Set.fromList . sort . concat)
106 $ map (\(d,d') -> [d..d']) periods
107
108
109 findBounds :: [Date] -> (Date,Date)
110 findBounds dates =
111 let dates' = sort dates
112 in (head' "findBounds" dates', last' "findBounds" dates')
113
114
115 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
116 toPeriods dates p s =
117 let (start,end) = findBounds dates
118 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
119 $ chunkAlong p s [start .. end]
120
121
122 -- | Get a regular & ascendante timeScale from a given list of dates
123 toTimeScale :: [Date] -> Int -> [Date]
124 toTimeScale dates step =
125 let (start,end) = findBounds dates
126 in [start, (start + step) .. end]
127
128
129 getTimeStep :: TimeUnit -> Int
130 getTimeStep time = case time of
131 Year _ s _ -> s
132
133 getTimePeriod :: TimeUnit -> Int
134 getTimePeriod time = case time of
135 Year p _ _ -> p
136
137 getTimeFrame :: TimeUnit -> Int
138 getTimeFrame time = case time of
139 Year _ _ f -> f
140
141 -------------
142 -- | Fis | --
143 -------------
144
145
146 -- | To find if l' is nested in l
147 isNested :: Eq a => [a] -> [a] -> Bool
148 isNested l l'
149 | null l' = True
150 | length l' > length l = False
151 | (union l l') == l = True
152 | otherwise = False
153
154
155 -- | To filter Fis with small Support but by keeping non empty Periods
156 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
157 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
158 then keepFilled f (thr - 1) l
159 else f thr l
160
161
162 traceClique :: Map (Date, Date) [PhyloFis] -> String
163 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
164 where
165 --------------------------------------
166 cliques :: [Double]
167 cliques = sort $ map (fromIntegral . size . _phyloFis_clique) $ concat $ elems mFis
168 --------------------------------------
169
170
171 traceSupport :: Map (Date, Date) [PhyloFis] -> String
172 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
173 where
174 --------------------------------------
175 supports :: [Double]
176 supports = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems mFis
177 --------------------------------------
178
179
180 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
181 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
182 <> "Support : " <> (traceSupport mFis) <> "\n"
183 <> "Clique : " <> (traceClique mFis) <> "\n" ) mFis
184
185
186 -------------------------
187 -- | Contextual unit | --
188 -------------------------
189
190
191 getFisSupport :: ContextualUnit -> Int
192 getFisSupport unit = case unit of
193 Fis s _ -> s
194 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a support")
195
196 getFisSize :: ContextualUnit -> Int
197 getFisSize unit = case unit of
198 Fis _ s -> s
199 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a clique size")
200
201
202 --------------
203 -- | Cooc | --
204 --------------
205
206 listToCombi' :: [a] -> [(a,a)]
207 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
208
209 listToEqual' :: Eq a => [a] -> [(a,a)]
210 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
211
212 listToKeys :: Eq a => [a] -> [(a,a)]
213 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
214
215 listToMatrix :: [Int] -> Map (Int,Int) Double
216 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
217
218 listToSeq :: Eq a => [a] -> [(a,a)]
219 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
220
221 sumCooc :: Cooc -> Cooc -> Cooc
222 sumCooc cooc cooc' = unionWith (+) cooc cooc'
223
224 getTrace :: Cooc -> Double
225 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
226
227
228 -- | To build the local cooc matrix of each phylogroup
229 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
230 ngramsToCooc ngrams coocs =
231 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
232 pairs = listToKeys ngrams
233 in filterWithKey (\k _ -> elem k pairs) cooc
234
235
236 --------------------
237 -- | PhyloGroup | --
238 --------------------
239
240 getGroupId :: PhyloGroup -> PhyloGroupId
241 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
242
243 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
244 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
245
246 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
247 getPeriodPointers fil group =
248 case fil of
249 ToChilds -> group ^. phylo_groupPeriodChilds
250 ToParents -> group ^. phylo_groupPeriodParents
251
252 filterProximity :: Proximity -> Double -> Double -> Bool
253 filterProximity proximity thr local =
254 case proximity of
255 WeightedLogJaccard _ _ _ -> local >= thr
256 Hamming -> undefined
257
258
259 ---------------
260 -- | Phylo | --
261 ---------------
262
263 addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
264 addPointers group fil pty pointers =
265 case pty of
266 TemporalPointer -> case fil of
267 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
268 ToParents -> group & phylo_groupPeriodParents .~ pointers
269 LevelPointer -> case fil of
270 ToChilds -> group & phylo_groupLevelChilds .~ pointers
271 ToParents -> group & phylo_groupLevelParents .~ pointers
272
273
274 getPeriodIds :: Phylo -> [(Date,Date)]
275 getPeriodIds phylo = sortOn fst
276 $ keys
277 $ phylo ^. phylo_periods
278
279 getLevelParentId :: PhyloGroup -> PhyloGroupId
280 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
281
282 getLastLevel :: Phylo -> Level
283 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
284
285 getLevels :: Phylo -> [Level]
286 getLevels phylo = nub
287 $ map snd
288 $ keys $ view ( phylo_periods
289 . traverse
290 . phylo_periodLevels ) phylo
291
292
293 getConfig :: Phylo -> Config
294 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
295
296
297 getRoots :: Phylo -> Vector Ngrams
298 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
299
300 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
301 phyloToLastBranches phylo = elems
302 $ fromListWith (++)
303 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
304 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
305
306 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
307 getGroupsFromLevel lvl phylo =
308 elems $ view ( phylo_periods
309 . traverse
310 . phylo_periodLevels
311 . traverse
312 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
313 . phylo_levelGroups ) phylo
314
315
316 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
317 updatePhyloGroups lvl m phylo =
318 over ( phylo_periods
319 . traverse
320 . phylo_periodLevels
321 . traverse
322 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
323 . phylo_levelGroups
324 . traverse
325 ) (\group ->
326 let id = getGroupId group
327 in
328 if member id m
329 then m ! id
330 else group ) phylo
331
332
333 traceToPhylo :: Level -> Phylo -> Phylo
334 traceToPhylo lvl phylo =
335 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
336 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
337 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
338
339 --------------------
340 -- | Clustering | --
341 --------------------
342
343 relatedComponents :: Ord a => [[a]] -> [[a]]
344 relatedComponents graph = foldl' (\acc groups ->
345 if (null acc)
346 then acc ++ [groups]
347 else
348 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
349 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
350
351
352 traceSynchronyEnd :: Phylo -> Phylo
353 traceSynchronyEnd phylo =
354 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
355 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
356 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
357 <> "\n" ) phylo
358
359 traceSynchronyStart :: Phylo -> Phylo
360 traceSynchronyStart phylo =
361 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
362 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
363 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
364 <> "\n" ) phylo
365
366
367 -------------------
368 -- | Proximity | --
369 -------------------
370
371 getSensibility :: Proximity -> Double
372 getSensibility proxi = case proxi of
373 WeightedLogJaccard s _ _ -> s
374 Hamming -> undefined
375
376 getThresholdInit :: Proximity -> Double
377 getThresholdInit proxi = case proxi of
378 WeightedLogJaccard _ t _ -> t
379 Hamming -> undefined
380
381 getThresholdStep :: Proximity -> Double
382 getThresholdStep proxi = case proxi of
383 WeightedLogJaccard _ _ s -> s
384 Hamming -> undefined
385
386
387 traceBranchMatching :: Proximity -> Double -> [PhyloGroup] -> [PhyloGroup]
388 traceBranchMatching proxi thr groups = case proxi of
389 WeightedLogJaccard _ i s -> trace (
390 roundToStr 2 thr <> " "
391 <> foldl (\acc _ -> acc <> ".") "." [(10*i),(10*i + 10*s)..(10*thr)]
392 <> " " <> show(length groups) <> " groups"
393 ) groups
394 Hamming -> undefined
395
396 ----------------
397 -- | Branch | --
398 ----------------
399
400 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
401 intersectInit acc lst lst' =
402 if (null lst) || (null lst')
403 then acc
404 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
405 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
406 else acc
407
408 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
409 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
410
411 ngramsInBranches :: [[PhyloGroup]] -> [Int]
412 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
413
414
415 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
416 traceMatchSuccess thr qua qua' nextBranches =
417 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
418 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
419 <> ",(1.." <> show (length nextBranches) <> ")]"
420 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
421 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
422 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
423
424
425 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
426 traceMatchFailure thr qua qua' branches =
427 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
428 <> ",(1.." <> show (length branches) <> ")]"
429 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
430 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
431 ) branches
432
433
434 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
435 traceMatchNoSplit branches =
436 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
437 <> ",(1.." <> show (length branches) <> ")]"
438 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
439 <> " - unable to split in smaller branches" <> "\n"
440 ) branches
441
442
443 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
444 traceMatchLimit branches =
445 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
446 <> ",(1.." <> show (length branches) <> ")]"
447 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
448 <> " - unable to increase the threshold above 1" <> "\n"
449 ) branches
450
451
452 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
453 traceMatchEnd groups =
454 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
455 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
456
457
458 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
459 traceTemporalMatching groups =
460 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups