]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloTools.hs
Merge branch 'dev' into 571-dev-node-corpus-api-search-fixes-take-2
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE ViewPatterns #-}
12
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
14
15 import Control.Lens hiding (Level)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, iterate, transpose, partition, tails, nubBy, group, notElem, (!!))
17 import Data.Map (Map, elems, fromList, findWithDefault, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
18 import Data.Set (Set, disjoint)
19 import Data.String (String)
20 import Data.Text (Text,unpack)
21 import Data.Vector (Vector, elemIndex)
22 import Debug.Trace (trace)
23 import Gargantext.Core.Viz.Phylo
24 import Gargantext.Prelude
25 import Prelude (floor,read)
26 import Text.Printf
27 import qualified Data.List as List
28 import qualified Data.Set as Set
29 import qualified Data.Text as Text
30 import qualified Data.Vector as Vector
31 import qualified Data.Map as Map
32
33 ------------
34 -- | Io | --
35 ------------
36
37 -- | To print an important message as an IO()
38 printIOMsg :: String -> IO ()
39 printIOMsg msg =
40 putStrLn ( "\n"
41 <> "------------"
42 <> "\n"
43 <> "-- | " <> msg <> "\n" )
44
45
46 -- | To print a comment as an IO()
47 printIOComment :: String -> IO ()
48 printIOComment cmt =
49 putStrLn ( "\n" <> cmt <> "\n" )
50
51
52 --------------
53 -- | Misc | --
54 --------------
55
56 -- truncate' :: Double -> Int -> Double
57 -- truncate' x n = (fromIntegral (floor (x * t))) / t
58 -- where t = 10^n
59
60 truncate' :: Double -> Int -> Double
61 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
62 where
63 --------------
64 t :: Double
65 t = 10 ^n
66
67 getInMap :: Int -> Map Int Double -> Double
68 getInMap k m =
69 if (member k m)
70 then m ! k
71 else 0
72
73 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
74 roundToStr = printf "%0.*f"
75
76
77 countSup :: Double -> [Double] -> Int
78 countSup s l = length $ filter (>s) l
79
80
81 dropByIdx :: Int -> [a] -> [a]
82 dropByIdx k l = take k l ++ drop (k+1) l
83
84
85 elemIndex' :: Eq a => a -> [a] -> Int
86 elemIndex' e l = case (List.elemIndex e l) of
87 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
88 Just i -> i
89
90
91 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
92 commonPrefix lst lst' acc =
93 if (null lst || null lst')
94 then acc
95 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
96 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
97 else acc
98
99
100 ---------------------
101 -- | Foundations | --
102 ---------------------
103
104
105 -- | Is this Ngrams a Foundations Root ?
106 isRoots :: Ngrams -> Vector Ngrams -> Bool
107 isRoots n ns = Vector.elem n ns
108
109 -- | To transform a list of nrams into a list of foundation's index
110 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
111 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
112
113 -- | To transform a list of sources into a list of sources' index
114 sourcesToIdx :: [Text] -> Vector Text -> [Int]
115 sourcesToIdx ss ps = nub $ map (\s -> fromJust $ elemIndex s ps) ss
116
117 -- | To transform a list of Ngrams Indexes into a Label
118 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
119 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
120
121 idxToLabel :: [Int] -> String
122 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
123
124 idxToLabel' :: [Double] -> String
125 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
126
127 -- | To transform a list of Ngrams Indexes into a list of Text
128 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
129 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
130
131
132 --------------
133 -- | Time | --
134 --------------
135
136 -- | To transform a list of periods into a set of Dates
137 periodsToYears :: [(Date,Date)] -> Set Date
138 periodsToYears periods = (Set.fromList . sort . concat)
139 $ map (\(d,d') -> [d..d']) periods
140
141
142 findBounds :: [Date] -> (Date,Date)
143 findBounds dates =
144 let dates' = sort dates
145 in (head' "findBounds" dates', last' "findBounds" dates')
146
147
148 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
149 toPeriods dates p s =
150 let (start,end) = findBounds dates
151 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
152 $ chunkAlong p s [start .. end]
153
154
155 toFstDate :: [Text] -> Text
156 toFstDate ds = snd
157 $ head' "firstDate"
158 $ sortOn fst
159 $ map (\d ->
160 let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
161 in (d',d)) ds
162
163 toLstDate :: [Text] -> Text
164 toLstDate ds = snd
165 $ head' "firstDate"
166 $ reverse
167 $ sortOn fst
168 $ map (\d ->
169 let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
170 in (d',d)) ds
171
172
173 getTimeScale :: Phylo -> [Char]
174 getTimeScale p = case (timeUnit $ getConfig p) of
175 Epoch {} -> "epoch"
176 Year {} -> "year"
177 Month {} -> "month"
178 Week {} -> "week"
179 Day {} -> "day"
180
181
182 -- | Get a regular & ascendante timeScale from a given list of dates
183 toTimeScale :: [Date] -> Int -> [Date]
184 toTimeScale dates step =
185 let (start,end) = findBounds dates
186 in [start, (start + step) .. end]
187
188
189 getTimeStep :: TimeUnit -> Int
190 getTimeStep time = case time of
191 Epoch { .. } -> _epoch_step
192 Year { .. } -> _year_step
193 Month { .. } -> _month_step
194 Week { .. } -> _week_step
195 Day { .. } -> _day_step
196
197 getTimePeriod :: TimeUnit -> Int
198 getTimePeriod time = case time of
199 Epoch { .. } -> _epoch_period
200 Year { .. } -> _year_period
201 Month { .. } -> _month_period
202 Week { .. } -> _week_period
203 Day { .. } -> _day_period
204
205 getTimeFrame :: TimeUnit -> Int
206 getTimeFrame time = case time of
207 Epoch { .. } -> _epoch_matchingFrame
208 Year { .. } -> _year_matchingFrame
209 Month { .. } -> _month_matchingFrame
210 Week { .. } -> _week_matchingFrame
211 Day { .. } -> _day_matchingFrame
212
213 -------------
214 -- | Fis | --
215 -------------
216
217
218 -- | To find if l' is nested in l
219 isNested :: Eq a => [a] -> [a] -> Bool
220 isNested l l'
221 | null l' = True
222 | length l' > length l = False
223 | union l l' == l = True
224 | otherwise = False
225
226
227 -- | To filter Fis with small Support but by keeping non empty Periods
228 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
229 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
230 then keepFilled f (thr - 1) l
231 else f thr l
232
233
234 traceClique :: Map (Date, Date) [Clustering] -> String
235 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
236 where
237 --------------------------------------
238 cliques :: [Double]
239 cliques = sort $ map (fromIntegral . length . _clustering_roots) $ concat $ elems mFis
240 --------------------------------------
241
242
243 traceSupport :: Map (Date, Date) [Clustering] -> String
244 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
245 where
246 --------------------------------------
247 supports :: [Double]
248 supports = sort $ map (fromIntegral . _clustering_support) $ concat $ elems mFis
249 --------------------------------------
250
251
252 traceFis :: [Char] -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
253 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
254 <> "Support : " <> traceSupport mFis <> "\n"
255 <> "Nb Ngrams : " <> traceClique mFis <> "\n" ) mFis
256
257
258 ----------------
259 -- | Cluster| --
260 ----------------
261
262
263 getCliqueSupport :: Cluster -> Int
264 getCliqueSupport unit = case unit of
265 Fis s _ -> s
266 MaxClique _ _ _ -> 0
267
268 getCliqueSize :: Cluster -> Int
269 getCliqueSize unit = case unit of
270 Fis _ s -> s
271 MaxClique s _ _ -> s
272
273
274 --------------
275 -- | Cooc | --
276 --------------
277
278 listToCombi' :: [a] -> [(a,a)]
279 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
280
281 listToEqual' :: Eq a => [a] -> [(a,a)]
282 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
283
284 listToKeys :: Eq a => [a] -> [(a,a)]
285 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
286
287 listToMatrix :: [Int] -> Map (Int,Int) Double
288 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
289
290 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
291 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
292
293 listToSeq :: Eq a => [a] -> [(a,a)]
294 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
295
296 sumCooc :: Cooc -> Cooc -> Cooc
297 sumCooc cooc cooc' = unionWith (+) cooc cooc'
298
299 getTrace :: Cooc -> Double
300 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
301
302 coocToDiago :: Cooc -> Cooc
303 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
304
305 coocToAdjacency :: Cooc -> Cooc
306 coocToAdjacency cooc = Map.map (\_ -> 1) cooc
307
308 -- | To build the local cooc matrix of each phylogroup
309 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
310 ngramsToCooc ngrams coocs =
311 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
312 pairs = listToKeys ngrams
313 in filterWithKey (\k _ -> elem k pairs) cooc
314
315
316 -----------------
317 -- | Density | --
318 -----------------
319
320
321 -- | To build the density of a phylogroup
322 -- density is defined in Callon M, Courtial JP, Laville F (1991) Co-word analysis as a tool for describing
323 -- the network of interaction between basic and technological research: The case of polymer chemistry.
324 -- Scientometric 22: 155–205.
325 ngramsToDensity :: [Int] -> [Cooc] -> (Map Int Double) -> Double
326 ngramsToDensity ngrams coocs rootsCount =
327 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
328 pairs = listToCombi' ngrams
329 density = map (\(i,j) ->
330 let nij = findWithDefault 0 (i,j) cooc
331 in (nij * nij) / ((rootsCount ! i) * (rootsCount ! j))) pairs
332 in (sum density) / (fromIntegral $ length ngrams)
333
334
335
336
337 ------------------
338 -- | Defaults | --
339 ------------------
340
341 -- | find the local maxima in a list of values
342 findMaxima :: [(Double,Double)] -> [Bool]
343 findMaxima lst = map (hasMax) $ toChunk 3 lst
344 where
345 ------
346 hasMax :: [(Double,Double)] -> Bool
347 hasMax chunk =
348 if (length chunk) /= 3
349 then False
350 else (snd(chunk !! 0) < snd(chunk !! 1)) && (snd(chunk !! 2) < snd(chunk !! 1))
351
352
353 -- | split a list into chunks of size n
354 toChunk :: Int -> [a] -> [[a]]
355 toChunk n = takeWhile ((== n) . length) . transpose . take n . iterate tail
356
357
358 -- | To compute the average degree from a cooc matrix
359 -- http://networksciencebook.com/chapter/2#degree
360 toAverageDegree :: Cooc -> Vector Ngrams -> Double
361 toAverageDegree cooc roots = 2 * (fromIntegral $ Map.size cooc) / (fromIntegral $ Vector.length roots)
362
363
364 -- | Use the giant component regime to estimate the default level
365 -- http://networksciencebook.com/chapter/3#networks-supercritical
366 regimeToDefaultLevel :: Cooc -> Vector Ngrams -> Double
367 regimeToDefaultLevel cooc roots
368 | avg == 0 = 1
369 | avg < 1 = avg * 0.6
370 | avg == 1 = 0.6
371 | avg < lnN = (avg * 0.2) / lnN
372 | otherwise = 0.2
373 where
374 avg :: Double
375 avg = toAverageDegree cooc roots
376 lnN :: Double
377 lnN = log (fromIntegral $ Vector.length roots)
378
379 coocToConfidence :: Phylo -> Cooc
380 coocToConfidence phylo =
381 let count = getRootsCount phylo
382 cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty
383 $ elems $ getCoocByDate phylo
384 in Map.mapWithKey (\(a,b) w -> confidence a b w count) cooc
385 where
386 ----
387 -- confidence
388 confidence :: Int -> Int -> Double -> Map Int Double -> Double
389 confidence a b inter card = maximum [(inter / card ! a),(inter / card ! b)]
390
391
392 sumtest :: [Int] -> [Int] -> Int
393 sumtest l1 l2 = (head' "test" l1) + (head' "test" $ reverse l2)
394
395
396 findDefaultLevel :: Phylo -> Phylo
397 findDefaultLevel phylo =
398 let confidence = Map.filterWithKey (\(a,b) _ -> a /= b)
399 $ Map.filter (> 0.01)
400 $ coocToConfidence phylo
401 roots = getRoots phylo
402 level = regimeToDefaultLevel confidence roots
403 in updateLevel level phylo
404
405
406 --------------------
407 -- | PhyloGroup | --
408 --------------------
409
410 getGroupId :: PhyloGroup -> PhyloGroupId
411 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupScale), g ^. phylo_groupIndex)
412
413 getGroupNgrams :: PhyloGroup -> [Int]
414 getGroupNgrams g = g ^. phylo_groupNgrams
415
416 idToPrd :: PhyloGroupId -> Period
417 idToPrd id = (fst . fst) id
418
419 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
420 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
421
422 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
423 getPeriodPointers fil g =
424 case fil of
425 ToChilds -> g ^. phylo_groupPeriodChilds
426 ToParents -> g ^. phylo_groupPeriodParents
427 ToChildsMemory -> undefined
428 ToParentsMemory -> undefined
429
430 filterSimilarity :: PhyloSimilarity -> Double -> Double -> Bool
431 filterSimilarity similarity thr local =
432 case similarity of
433 WeightedLogJaccard _ _ -> local >= thr
434 WeightedLogSim _ _ -> local >= thr
435 Hamming _ _ -> undefined
436
437 getSimilarityName :: PhyloSimilarity -> String
438 getSimilarityName similarity =
439 case similarity of
440 WeightedLogJaccard _ _ -> "WLJaccard"
441 WeightedLogSim _ _ -> "WeightedLogSim"
442 Hamming _ _ -> "Hamming"
443
444 ---------------
445 -- | Phylo | --
446 ---------------
447
448 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
449 addPointers fil pty pointers g =
450 case pty of
451 TemporalPointer -> case fil of
452 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
453 ToParents -> g & phylo_groupPeriodParents .~ pointers
454 ToChildsMemory -> undefined
455 ToParentsMemory -> undefined
456 ScalePointer -> case fil of
457 ToChilds -> g & phylo_groupScaleChilds .~ pointers
458 ToParents -> g & phylo_groupScaleParents .~ pointers
459 ToChildsMemory -> undefined
460 ToParentsMemory -> undefined
461
462 toPointer' :: Double -> Pointer -> Pointer'
463 toPointer' thr pt = (fst pt,(thr,snd pt))
464
465
466 addMemoryPointers :: Filiation -> PointerType -> Double -> [Pointer] -> PhyloGroup -> PhyloGroup
467 addMemoryPointers fil pty thr pointers g =
468 case pty of
469 TemporalPointer -> case fil of
470 ToChilds -> undefined
471 ToParents -> undefined
472 ToChildsMemory -> g & phylo_groupPeriodMemoryChilds .~ (concat [(g ^. phylo_groupPeriodMemoryChilds),(map (\pt -> toPointer' thr pt) pointers)])
473 ToParentsMemory -> g & phylo_groupPeriodMemoryParents .~ (concat [(g ^. phylo_groupPeriodMemoryParents),(map (\pt -> toPointer' thr pt) pointers)])
474 ScalePointer -> undefined
475
476
477 getPeriodIds :: Phylo -> [(Date,Date)]
478 getPeriodIds phylo = sortOn fst
479 $ keys
480 $ phylo ^. phylo_periods
481
482 getLastDate :: Phylo -> Date
483 getLastDate phylo = snd $ last' "lastDate" $ getPeriodIds phylo
484
485 getLevelParentId :: PhyloGroup -> PhyloGroupId
486 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupScaleParents
487
488 getLastLevel :: Phylo -> Scale
489 getLastLevel phylo = last' "lastLevel" $ getScales phylo
490
491 getScales :: Phylo -> [Scale]
492 getScales phylo = nub
493 $ map snd
494 $ keys $ view ( phylo_periods
495 . traverse
496 . phylo_periodScales ) phylo
497
498 getSeaElevation :: Phylo -> SeaElevation
499 getSeaElevation phylo = seaElevation (getConfig phylo)
500
501 getSimilarity :: Phylo -> PhyloSimilarity
502 getSimilarity phylo = similarity (getConfig phylo)
503
504
505 getPhyloSeaRiseStart :: Phylo -> Double
506 getPhyloSeaRiseStart phylo = case (getSeaElevation phylo) of
507 Constante s _ -> s
508 Adaptative _ -> 0
509 Evolving _ -> 0
510
511 getPhyloSeaRiseSteps :: Phylo -> Double
512 getPhyloSeaRiseSteps phylo = case (getSeaElevation phylo) of
513 Constante _ s -> s
514 Adaptative s -> s
515 Evolving _ -> 0.1
516
517
518 getConfig :: Phylo -> PhyloConfig
519 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
520
521 getLevel :: Phylo -> Double
522 getLevel phylo = (phyloQuality (getConfig phylo)) ^. qua_granularity
523
524 getLadder :: Phylo -> [Double]
525 getLadder phylo = phylo ^. phylo_seaLadder
526
527 getCoocByDate :: Phylo -> Map Date Cooc
528 getCoocByDate phylo = coocByDate (phylo ^. phylo_counts)
529
530 getRootsCountByDate :: Phylo -> Map Date (Map Int Double)
531 getRootsCountByDate phylo = rootsCountByDate (phylo ^. phylo_counts)
532
533 getDocsByDate :: Phylo -> Map Date Double
534 getDocsByDate phylo = docsByDate (phylo ^. phylo_counts)
535
536 getRootsCount :: Phylo -> Map Int Double
537 getRootsCount phylo = rootsCount (phylo ^. phylo_counts)
538
539 getRootsFreq :: Phylo -> Map Int Double
540 getRootsFreq phylo = rootsFreq (phylo ^. phylo_counts)
541
542 getLastRootsFreq :: Phylo -> Map Int Double
543 getLastRootsFreq phylo = lastRootsFreq (phylo ^. phylo_counts)
544
545 setConfig :: PhyloConfig -> Phylo -> Phylo
546 setConfig config phylo = phylo
547 & phylo_param .~ (PhyloParam
548 ((phylo ^. phylo_param) ^. phyloParam_version)
549 ((phylo ^. phylo_param) ^. phyloParam_software)
550 config)
551
552 -- & phylo_param & phyloParam_config & phyloParam_config .~ config
553
554
555 getRoots :: Phylo -> Vector Ngrams
556 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
557
558 getRootsInGroups :: Phylo -> Map Int [PhyloGroupId]
559 getRootsInGroups phylo = (phylo ^. phylo_foundations) ^. foundations_rootsInGroups
560
561 getSources :: Phylo -> Vector Text
562 getSources phylo = _sources (phylo ^. phylo_sources)
563
564
565 -- get the groups distributed by branches at the last scale
566 phyloLastScale :: Phylo -> [[PhyloGroup]]
567 phyloLastScale phylo = elems
568 $ fromListWith (++)
569 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
570 $ getGroupsFromScale (last' "byBranches" $ getScales phylo) phylo
571
572 getGroupsFromScale :: Scale -> Phylo -> [PhyloGroup]
573 getGroupsFromScale lvl phylo =
574 elems $ view ( phylo_periods
575 . traverse
576 . phylo_periodScales
577 . traverse
578 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
579 . phylo_scaleGroups ) phylo
580
581
582 getGroupsFromScalePeriods :: Scale -> [Period] -> Phylo -> [PhyloGroup]
583 getGroupsFromScalePeriods lvl periods phylo =
584 elems $ view ( phylo_periods
585 . traverse
586 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
587 . phylo_periodScales
588 . traverse
589 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
590 . phylo_scaleGroups ) phylo
591
592
593 getGroupsFromPeriods :: Scale -> Map Period PhyloPeriod -> [PhyloGroup]
594 getGroupsFromPeriods lvl periods =
595 elems $ view ( traverse
596 . phylo_periodScales
597 . traverse
598 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
599 . phylo_scaleGroups ) periods
600
601
602 updatePhyloGroups :: Scale -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
603 updatePhyloGroups lvl m phylo =
604 over ( phylo_periods
605 . traverse
606 . phylo_periodScales
607 . traverse
608 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
609 . phylo_scaleGroups
610 . traverse
611 ) (\g ->
612 let id = getGroupId g
613 in
614 if member id m
615 then m ! id
616 else g ) phylo
617
618 updatePeriods :: Map (Date,Date) (Text,Text) -> Phylo -> Phylo
619 updatePeriods periods' phylo =
620 over (phylo_periods . traverse)
621 (\prd ->
622 let prd' = periods' ! (prd ^. phylo_periodPeriod)
623 lvls = map (\lvl -> lvl & phylo_scalePeriodStr .~ prd') $ prd ^. phylo_periodScales
624 in prd & phylo_periodPeriodStr .~ prd'
625 & phylo_periodScales .~ lvls
626 ) phylo
627
628 updateQuality :: Double -> Phylo -> Phylo
629 updateQuality quality phylo = phylo { _phylo_quality = quality }
630
631 updateLevel :: Double -> Phylo -> Phylo
632 updateLevel level phylo = phylo { _phylo_level = level }
633
634 traceToPhylo :: Scale -> Phylo -> Phylo
635 traceToPhylo lvl phylo =
636 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
637 <> show (length $ getGroupsFromScale lvl phylo) <> " groups and "
638 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromScale lvl phylo) <> " branches" <> "\n") phylo
639
640 --------------------
641 -- | Clustering | --
642 --------------------
643
644 mergeBranchIds :: [[Int]] -> [Int]
645 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
646 where
647 -- | 2) find the most Up Left ids in the hierarchy of similarity
648 -- mostUpLeft :: [[Int]] -> [[Int]]
649 -- mostUpLeft ids' =
650 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
651 -- inf = (fst . minimum) groupIds
652 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
653 -- | 1) find the most frequent ids
654 mostFreq' :: [[Int]] -> [[Int]]
655 mostFreq' ids' =
656 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
657 sup = (fst . maximum) groupIds
658 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
659
660
661 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
662 mergeMeta bId groups =
663 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
664 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
665
666
667 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
668 groupsToBranches' groups =
669 {- run the related component algorithm -}
670 let egos = map (\g -> [getGroupId g]
671 ++ (map fst $ g ^. phylo_groupPeriodParents)
672 ++ (map fst $ g ^. phylo_groupPeriodChilds)
673 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
674 graph = relatedComponents egos
675 {- update each group's branch id -}
676 in map (\ids ->
677 let groups' = elems $ restrictKeys groups (Set.fromList ids)
678 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
679 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
680
681
682 relatedComponents :: Ord a => [[a]] -> [[a]]
683 relatedComponents graph = foldl' (\branches groups ->
684 if (null branches)
685 then branches ++ [groups]
686 else
687 let branchPart = partition (\branch -> disjoint (Set.fromList branch) (Set.fromList groups)) branches
688 in (fst branchPart) ++ [nub $ concat $ (snd branchPart) ++ [groups]]) [] graph
689
690
691 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
692 toRelatedComponents nodes edges =
693 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
694 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
695 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
696
697
698 traceSynchronyEnd :: Phylo -> Phylo
699 traceSynchronyEnd phylo =
700 trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
701 <> " with " <> show (length $ getGroupsFromScale (getLastLevel phylo) phylo) <> " groups"
702 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromScale (getLastLevel phylo) phylo) <> " branches"
703 <> "\n" ) phylo
704
705 traceSynchronyStart :: Phylo -> Phylo
706 traceSynchronyStart phylo =
707 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
708 <> " with " <> show (length $ getGroupsFromScale (getLastLevel phylo) phylo) <> " groups"
709 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromScale (getLastLevel phylo) phylo) <> " branches"
710 <> "\n" ) phylo
711
712
713 -------------------
714 -- | Similarity | --
715 -------------------
716
717 getSensibility :: PhyloSimilarity -> Double
718 getSensibility proxi = case proxi of
719 WeightedLogJaccard s _ -> s
720 WeightedLogSim s _ -> s
721 Hamming _ _ -> undefined
722
723 getMinSharedNgrams :: PhyloSimilarity -> Int
724 getMinSharedNgrams proxi = case proxi of
725 WeightedLogJaccard _ m -> m
726 WeightedLogSim _ m -> m
727 Hamming _ _ -> undefined
728
729 ----------------
730 -- | Branch | --
731 ----------------
732
733 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
734 intersectInit acc lst lst' =
735 if (null lst) || (null lst')
736 then acc
737 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
738 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
739 else acc
740
741 branchIdsToSimilarity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
742 branchIdsToSimilarity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
743
744 ngramsInBranches :: [[PhyloGroup]] -> [Int]
745 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
746
747
748 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
749 traceMatchSuccess thr qua qua' nextBranches =
750 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
751 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
752 <> ",(1.." <> show (length nextBranches) <> ")]"
753 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
754 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
755 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
756
757
758 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
759 traceMatchFailure thr qua qua' branches =
760 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
761 <> ",(1.." <> show (length branches) <> ")]"
762 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
763 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
764 ) branches
765
766
767 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
768 traceMatchNoSplit branches =
769 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
770 <> ",(1.." <> show (length branches) <> ")]"
771 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
772 <> " - unable to split in smaller branches" <> "\n"
773 ) branches
774
775
776 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
777 traceMatchLimit branches =
778 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
779 <> ",(1.." <> show (length branches) <> ")]"
780 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
781 <> " - unable to increase the threshold above 1" <> "\n"
782 ) branches
783
784
785 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
786 traceMatchEnd groups =
787 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
788 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
789
790
791 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
792 traceTemporalMatching groups =
793 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
794
795
796 traceGroupsProxi :: [Double] -> [Double]
797 traceGroupsProxi l =
798 trace ( "\n" <> "-- | " <> show(List.length l) <> " computed pairs of groups Similarity" <> "\n") l