]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Viz/Phylo/PhyloTools.hs
Merge branch 'dev-corpus-add-file' of ssh://gitlab.iscpif.fr:20022/gargantext/haskell...
[gargantext.git] / src / Gargantext / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE FlexibleContexts #-}
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15 {-# LANGUAGE ViewPatterns #-}
16
17 module Gargantext.Viz.Phylo.PhyloTools where
18
19 import Data.Vector (Vector, elemIndex)
20 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, partition, tails, nubBy)
21 import Data.Set (Set, size, disjoint)
22 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty)
23 import Data.String (String)
24 import Data.Text (Text, unwords)
25
26 import Gargantext.Prelude
27 import Gargantext.Viz.AdaptativePhylo
28 import Text.Printf
29
30
31 import Debug.Trace (trace)
32 import Control.Lens hiding (Level)
33
34 import qualified Data.Vector as Vector
35 import qualified Data.List as List
36 import qualified Data.Set as Set
37
38 ------------
39 -- | Io | --
40 ------------
41
42 -- | To print an important message as an IO()
43 printIOMsg :: String -> IO ()
44 printIOMsg msg =
45 putStrLn ( "\n"
46 <> "------------"
47 <> "\n"
48 <> "-- | " <> msg <> "\n" )
49
50
51 -- | To print a comment as an IO()
52 printIOComment :: String -> IO ()
53 printIOComment cmt =
54 putStrLn ( "\n" <> cmt <> "\n" )
55
56
57 --------------
58 -- | Misc | --
59 --------------
60
61
62 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
63 roundToStr = printf "%0.*f"
64
65
66 countSup :: Double -> [Double] -> Int
67 countSup s l = length $ filter (>s) l
68
69 dropByIdx :: Int -> [a] -> [a]
70 dropByIdx k l = take k l ++ drop (k+1) l
71
72
73 elemIndex' :: Eq a => a -> [a] -> Int
74 elemIndex' e l = case (List.elemIndex e l) of
75 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
76 Just i -> i
77
78
79 ---------------------
80 -- | Foundations | --
81 ---------------------
82
83
84 -- | Is this Ngrams a Foundations Root ?
85 isRoots :: Ngrams -> Vector Ngrams -> Bool
86 isRoots n ns = Vector.elem n ns
87
88 -- | To transform a list of nrams into a list of foundation's index
89 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
90 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
91
92 -- | To transform a list of Ngrams Indexes into a Label
93 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
94 ngramsToLabel ngrams l = unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
95
96
97 -- | To transform a list of Ngrams Indexes into a list of Text
98 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
99 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
100
101
102 --------------
103 -- | Time | --
104 --------------
105
106 -- | To transform a list of periods into a set of Dates
107 periodsToYears :: [(Date,Date)] -> Set Date
108 periodsToYears periods = (Set.fromList . sort . concat)
109 $ map (\(d,d') -> [d..d']) periods
110
111
112 findBounds :: [Date] -> (Date,Date)
113 findBounds dates =
114 let dates' = sort dates
115 in (head' "findBounds" dates', last' "findBounds" dates')
116
117
118 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
119 toPeriods dates p s =
120 let (start,end) = findBounds dates
121 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
122 $ chunkAlong p s [start .. end]
123
124
125 -- | Get a regular & ascendante timeScale from a given list of dates
126 toTimeScale :: [Date] -> Int -> [Date]
127 toTimeScale dates step =
128 let (start,end) = findBounds dates
129 in [start, (start + step) .. end]
130
131
132 getTimeStep :: TimeUnit -> Int
133 getTimeStep time = case time of
134 Year _ s _ -> s
135
136 getTimePeriod :: TimeUnit -> Int
137 getTimePeriod time = case time of
138 Year p _ _ -> p
139
140 getTimeFrame :: TimeUnit -> Int
141 getTimeFrame time = case time of
142 Year _ _ f -> f
143
144 -------------
145 -- | Fis | --
146 -------------
147
148
149 -- | To find if l' is nested in l
150 isNested :: Eq a => [a] -> [a] -> Bool
151 isNested l l'
152 | null l' = True
153 | length l' > length l = False
154 | (union l l') == l = True
155 | otherwise = False
156
157
158 -- | To filter Fis with small Support but by keeping non empty Periods
159 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
160 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
161 then keepFilled f (thr - 1) l
162 else f thr l
163
164
165 traceClique :: Map (Date, Date) [PhyloFis] -> String
166 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
167 where
168 --------------------------------------
169 cliques :: [Double]
170 cliques = sort $ map (fromIntegral . size . _phyloFis_clique) $ concat $ elems mFis
171 --------------------------------------
172
173
174 traceSupport :: Map (Date, Date) [PhyloFis] -> String
175 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
176 where
177 --------------------------------------
178 supports :: [Double]
179 supports = sort $ map (fromIntegral . _phyloFis_support) $ concat $ elems mFis
180 --------------------------------------
181
182
183 traceFis :: [Char] -> Map (Date, Date) [PhyloFis] -> Map (Date, Date) [PhyloFis]
184 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
185 <> "Support : " <> (traceSupport mFis) <> "\n"
186 <> "Clique : " <> (traceClique mFis) <> "\n" ) mFis
187
188
189 -------------------------
190 -- | Contextual unit | --
191 -------------------------
192
193
194 getFisSupport :: ContextualUnit -> Int
195 getFisSupport unit = case unit of
196 Fis s _ -> s
197 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a support")
198
199 getFisSize :: ContextualUnit -> Int
200 getFisSize unit = case unit of
201 Fis _ s -> s
202 -- _ -> panic ("[ERR][Viz.Phylo.PhyloTools.getFisSupport] Only Fis has a clique size")
203
204
205 --------------
206 -- | Cooc | --
207 --------------
208
209 listToCombi' :: [a] -> [(a,a)]
210 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
211
212 listToEqual' :: Eq a => [a] -> [(a,a)]
213 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
214
215 listToKeys :: Eq a => [a] -> [(a,a)]
216 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
217
218 listToMatrix :: [Int] -> Map (Int,Int) Double
219 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
220
221 listToSeq :: Eq a => [a] -> [(a,a)]
222 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
223
224 sumCooc :: Cooc -> Cooc -> Cooc
225 sumCooc cooc cooc' = unionWith (+) cooc cooc'
226
227 getTrace :: Cooc -> Double
228 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
229
230
231 -- | To build the local cooc matrix of each phylogroup
232 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
233 ngramsToCooc ngrams coocs =
234 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
235 pairs = listToKeys ngrams
236 in filterWithKey (\k _ -> elem k pairs) cooc
237
238
239 --------------------
240 -- | PhyloGroup | --
241 --------------------
242
243 getGroupId :: PhyloGroup -> PhyloGroupId
244 getGroupId group = ((group ^. phylo_groupPeriod, group ^. phylo_groupLevel), group ^. phylo_groupIndex)
245
246 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
247 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
248
249 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
250 getPeriodPointers fil group =
251 case fil of
252 ToChilds -> group ^. phylo_groupPeriodChilds
253 ToParents -> group ^. phylo_groupPeriodParents
254
255 filterProximity :: Proximity -> Double -> Double -> Bool
256 filterProximity proximity thr local =
257 case proximity of
258 WeightedLogJaccard _ _ _ -> local >= thr
259 Hamming -> undefined
260
261 getProximityName :: Proximity -> String
262 getProximityName proximity =
263 case proximity of
264 WeightedLogJaccard _ _ _ -> "WLJaccard"
265 Hamming -> "Hamming"
266
267 getProximityInit :: Proximity -> Double
268 getProximityInit proximity =
269 case proximity of
270 WeightedLogJaccard _ i _ -> i
271 Hamming -> undefined
272
273
274 getProximityStep :: Proximity -> Double
275 getProximityStep proximity =
276 case proximity of
277 WeightedLogJaccard _ _ s -> s
278 Hamming -> undefined
279
280 ---------------
281 -- | Phylo | --
282 ---------------
283
284 addPointers :: PhyloGroup -> Filiation -> PointerType -> [Pointer] -> PhyloGroup
285 addPointers group fil pty pointers =
286 case pty of
287 TemporalPointer -> case fil of
288 ToChilds -> group & phylo_groupPeriodChilds .~ pointers
289 ToParents -> group & phylo_groupPeriodParents .~ pointers
290 LevelPointer -> case fil of
291 ToChilds -> group & phylo_groupLevelChilds .~ pointers
292 ToParents -> group & phylo_groupLevelParents .~ pointers
293
294
295 getPeriodIds :: Phylo -> [(Date,Date)]
296 getPeriodIds phylo = sortOn fst
297 $ keys
298 $ phylo ^. phylo_periods
299
300 getLevelParentId :: PhyloGroup -> PhyloGroupId
301 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupLevelParents
302
303 getLastLevel :: Phylo -> Level
304 getLastLevel phylo = last' "lastLevel" $ getLevels phylo
305
306 getLevels :: Phylo -> [Level]
307 getLevels phylo = nub
308 $ map snd
309 $ keys $ view ( phylo_periods
310 . traverse
311 . phylo_periodLevels ) phylo
312
313
314 getConfig :: Phylo -> Config
315 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
316
317
318 getRoots :: Phylo -> Vector Ngrams
319 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
320
321 phyloToLastBranches :: Phylo -> [[PhyloGroup]]
322 phyloToLastBranches phylo = elems
323 $ fromListWith (++)
324 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
325 $ getGroupsFromLevel (last' "byBranches" $ getLevels phylo) phylo
326
327 getGroupsFromLevel :: Level -> Phylo -> [PhyloGroup]
328 getGroupsFromLevel lvl phylo =
329 elems $ view ( phylo_periods
330 . traverse
331 . phylo_periodLevels
332 . traverse
333 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
334 . phylo_levelGroups ) phylo
335
336
337 updatePhyloGroups :: Level -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
338 updatePhyloGroups lvl m phylo =
339 over ( phylo_periods
340 . traverse
341 . phylo_periodLevels
342 . traverse
343 . filtered (\phyloLvl -> phyloLvl ^. phylo_levelLevel == lvl)
344 . phylo_levelGroups
345 . traverse
346 ) (\group ->
347 let id = getGroupId group
348 in
349 if member id m
350 then m ! id
351 else group ) phylo
352
353
354 traceToPhylo :: Level -> Phylo -> Phylo
355 traceToPhylo lvl phylo =
356 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
357 <> show (length $ getGroupsFromLevel lvl phylo) <> " groups and "
358 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel lvl phylo) <> " branches" <> "\n") phylo
359
360 --------------------
361 -- | Clustering | --
362 --------------------
363
364 relatedComponents :: Ord a => [[a]] -> [[a]]
365 relatedComponents graph = foldl' (\acc groups ->
366 if (null acc)
367 then acc ++ [groups]
368 else
369 let acc' = partition (\groups' -> disjoint (Set.fromList groups') (Set.fromList groups)) acc
370 in (fst acc') ++ [nub $ concat $ (snd acc') ++ [groups]]) [] graph
371
372
373 traceSynchronyEnd :: Phylo -> Phylo
374 traceSynchronyEnd phylo =
375 trace ( "\n" <> "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
376 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
377 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
378 <> "\n" ) phylo
379
380 traceSynchronyStart :: Phylo -> Phylo
381 traceSynchronyStart phylo =
382 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
383 <> " with " <> show (length $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " groups"
384 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromLevel (getLastLevel phylo) phylo) <> " branches"
385 <> "\n" ) phylo
386
387
388 -------------------
389 -- | Proximity | --
390 -------------------
391
392 getSensibility :: Proximity -> Double
393 getSensibility proxi = case proxi of
394 WeightedLogJaccard s _ _ -> s
395 Hamming -> undefined
396
397 getThresholdInit :: Proximity -> Double
398 getThresholdInit proxi = case proxi of
399 WeightedLogJaccard _ t _ -> t
400 Hamming -> undefined
401
402 getThresholdStep :: Proximity -> Double
403 getThresholdStep proxi = case proxi of
404 WeightedLogJaccard _ _ s -> s
405 Hamming -> undefined
406
407
408 traceBranchMatching :: Proximity -> Double -> [PhyloGroup] -> [PhyloGroup]
409 traceBranchMatching proxi thr groups = case proxi of
410 WeightedLogJaccard _ i s -> trace (
411 roundToStr 2 thr <> " "
412 <> foldl (\acc _ -> acc <> ".") "." [(10*i),(10*i + 10*s)..(10*thr)]
413 <> " " <> show(length groups) <> " groups"
414 ) groups
415 Hamming -> undefined
416
417 ----------------
418 -- | Branch | --
419 ----------------
420
421 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
422 intersectInit acc lst lst' =
423 if (null lst) || (null lst')
424 then acc
425 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
426 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
427 else acc
428
429 branchIdsToProximity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
430 branchIdsToProximity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
431
432 ngramsInBranches :: [[PhyloGroup]] -> [Int]
433 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
434
435
436 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
437 traceMatchSuccess thr qua qua' nextBranches =
438 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
439 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
440 <> ",(1.." <> show (length nextBranches) <> ")]"
441 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
442 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
443 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
444
445
446 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
447 traceMatchFailure thr qua qua' branches =
448 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
449 <> ",(1.." <> show (length branches) <> ")]"
450 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
451 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
452 ) branches
453
454
455 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
456 traceMatchNoSplit branches =
457 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
458 <> ",(1.." <> show (length branches) <> ")]"
459 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
460 <> " - unable to split in smaller branches" <> "\n"
461 ) branches
462
463
464 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
465 traceMatchLimit branches =
466 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
467 <> ",(1.." <> show (length branches) <> ")]"
468 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
469 <> " - unable to increase the threshold above 1" <> "\n"
470 ) branches
471
472
473 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
474 traceMatchEnd groups =
475 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
476 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
477
478
479 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
480 traceTemporalMatching groups =
481 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups