]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Viz/Phylo/PhyloTools.hs
[ngrams] try terms query without the materialized view
[gargantext.git] / src / Gargantext / Core / Viz / Phylo / PhyloTools.hs
1 {-|
2 Module : Gargantext.Core.Viz.Phylo.PhyloTools
3 Description : Module dedicated to all the tools needed for making a Phylo
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9 -}
10
11 {-# LANGUAGE ViewPatterns #-}
12
13 module Gargantext.Core.Viz.Phylo.PhyloTools where
14
15 import Control.Lens hiding (Level)
16 import Data.List (sort, concat, null, union, (++), tails, sortOn, nub, init, tail, iterate, transpose, partition, tails, nubBy, group, notElem, (!!))
17 import Data.Map (Map, elems, fromList, unionWith, keys, member, (!), filterWithKey, fromListWith, empty, restrictKeys)
18 import Data.Set (Set, disjoint)
19 import Data.String (String)
20 import Data.Text (Text,unpack)
21 import Data.Vector (Vector, elemIndex)
22 import Debug.Trace (trace)
23 import Gargantext.Core.Viz.Phylo
24 import Gargantext.Prelude
25 import Prelude (floor,read)
26 import Text.Printf
27 import qualified Data.List as List
28 import qualified Data.Set as Set
29 import qualified Data.Text as Text
30 import qualified Data.Vector as Vector
31 import qualified Data.Map as Map
32
33 ------------
34 -- | Io | --
35 ------------
36
37 -- | To print an important message as an IO()
38 printIOMsg :: String -> IO ()
39 printIOMsg msg =
40 putStrLn ( "\n"
41 <> "------------"
42 <> "\n"
43 <> "-- | " <> msg <> "\n" )
44
45
46 -- | To print a comment as an IO()
47 printIOComment :: String -> IO ()
48 printIOComment cmt =
49 putStrLn ( "\n" <> cmt <> "\n" )
50
51
52 --------------
53 -- | Misc | --
54 --------------
55
56 -- truncate' :: Double -> Int -> Double
57 -- truncate' x n = (fromIntegral (floor (x * t))) / t
58 -- where t = 10^n
59
60 truncate' :: Double -> Int -> Double
61 truncate' x n = (fromIntegral $ (floor (x * t) :: Int)) / t
62 where
63 --------------
64 t :: Double
65 t = 10 ^n
66
67 getInMap :: Int -> Map Int Double -> Double
68 getInMap k m =
69 if (member k m)
70 then m ! k
71 else 0
72
73 roundToStr :: (PrintfArg a, Floating a) => Int -> a -> String
74 roundToStr = printf "%0.*f"
75
76
77 countSup :: Double -> [Double] -> Int
78 countSup s l = length $ filter (>s) l
79
80
81 dropByIdx :: Int -> [a] -> [a]
82 dropByIdx k l = take k l ++ drop (k+1) l
83
84
85 elemIndex' :: Eq a => a -> [a] -> Int
86 elemIndex' e l = case (List.elemIndex e l) of
87 Nothing -> panic ("[ERR][Viz.Phylo.PhyloTools] element not in list")
88 Just i -> i
89
90
91 commonPrefix :: Eq a => [a] -> [a] -> [a] -> [a]
92 commonPrefix lst lst' acc =
93 if (null lst || null lst')
94 then acc
95 else if (head' "commonPrefix" lst == head' "commonPrefix" lst')
96 then commonPrefix (tail lst) (tail lst') (acc ++ [head' "commonPrefix" lst])
97 else acc
98
99
100 ---------------------
101 -- | Foundations | --
102 ---------------------
103
104
105 -- | Is this Ngrams a Foundations Root ?
106 isRoots :: Ngrams -> Vector Ngrams -> Bool
107 isRoots n ns = Vector.elem n ns
108
109 -- | To transform a list of nrams into a list of foundation's index
110 ngramsToIdx :: [Ngrams] -> Vector Ngrams -> [Int]
111 ngramsToIdx ns fdt = map (\n -> fromJust $ elemIndex n fdt) ns
112
113 -- | To transform a list of sources into a list of sources' index
114 sourcesToIdx :: [Text] -> Vector Text -> [Int]
115 sourcesToIdx ss ps = nub $ map (\s -> fromJust $ elemIndex s ps) ss
116
117 -- | To transform a list of Ngrams Indexes into a Label
118 ngramsToLabel :: Vector Ngrams -> [Int] -> Text
119 ngramsToLabel ngrams l = Text.unwords $ tail' "ngramsToLabel" $ concat $ map (\n -> ["|",n]) $ ngramsToText ngrams l
120
121 idxToLabel :: [Int] -> String
122 idxToLabel l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
123
124 idxToLabel' :: [Double] -> String
125 idxToLabel' l = List.unwords $ tail' "idxToLabel" $ concat $ map (\n -> ["|",show n]) l
126
127 -- | To transform a list of Ngrams Indexes into a list of Text
128 ngramsToText :: Vector Ngrams -> [Int] -> [Text]
129 ngramsToText ngrams l = map (\idx -> ngrams Vector.! idx) l
130
131
132 --------------
133 -- | Time | --
134 --------------
135
136 -- | To transform a list of periods into a set of Dates
137 periodsToYears :: [(Date,Date)] -> Set Date
138 periodsToYears periods = (Set.fromList . sort . concat)
139 $ map (\(d,d') -> [d..d']) periods
140
141
142 findBounds :: [Date] -> (Date,Date)
143 findBounds dates =
144 let dates' = sort dates
145 in (head' "findBounds" dates', last' "findBounds" dates')
146
147
148 toPeriods :: [Date] -> Int -> Int -> [(Date,Date)]
149 toPeriods dates p s =
150 let (start,end) = findBounds dates
151 in map (\dates' -> (head' "toPeriods" dates', last' "toPeriods" dates'))
152 $ chunkAlong p s [start .. end]
153
154
155 toFstDate :: [Text] -> Text
156 toFstDate ds = snd
157 $ head' "firstDate"
158 $ sortOn fst
159 $ map (\d ->
160 let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
161 in (d',d)) ds
162
163 toLstDate :: [Text] -> Text
164 toLstDate ds = snd
165 $ head' "firstDate"
166 $ reverse
167 $ sortOn fst
168 $ map (\d ->
169 let d' = read (filter (\c -> notElem c ['U','T','C',' ',':','-']) $ unpack d)::Int
170 in (d',d)) ds
171
172
173 getTimeScale :: Phylo -> [Char]
174 getTimeScale p = case (timeUnit $ getConfig p) of
175 Epoch _ _ _ -> "epoch"
176 Year _ _ _ -> "year"
177 Month _ _ _ -> "month"
178 Week _ _ _ -> "week"
179 Day _ _ _ -> "day"
180
181
182 -- | Get a regular & ascendante timeScale from a given list of dates
183 toTimeScale :: [Date] -> Int -> [Date]
184 toTimeScale dates step =
185 let (start,end) = findBounds dates
186 in [start, (start + step) .. end]
187
188
189 getTimeStep :: TimeUnit -> Int
190 getTimeStep time = case time of
191 Epoch _ s _ -> s
192 Year _ s _ -> s
193 Month _ s _ -> s
194 Week _ s _ -> s
195 Day _ s _ -> s
196
197 getTimePeriod :: TimeUnit -> Int
198 getTimePeriod time = case time of
199 Epoch p _ _ -> p
200 Year p _ _ -> p
201 Month p _ _ -> p
202 Week p _ _ -> p
203 Day p _ _ -> p
204
205 getTimeFrame :: TimeUnit -> Int
206 getTimeFrame time = case time of
207 Epoch _ _ f -> f
208 Year _ _ f -> f
209 Month _ _ f -> f
210 Week _ _ f -> f
211 Day _ _ f -> f
212
213 -------------
214 -- | Fis | --
215 -------------
216
217
218 -- | To find if l' is nested in l
219 isNested :: Eq a => [a] -> [a] -> Bool
220 isNested l l'
221 | null l' = True
222 | length l' > length l = False
223 | (union l l') == l = True
224 | otherwise = False
225
226
227 -- | To filter Fis with small Support but by keeping non empty Periods
228 keepFilled :: (Int -> [a] -> [a]) -> Int -> [a] -> [a]
229 keepFilled f thr l = if (null $ f thr l) && (not $ null l)
230 then keepFilled f (thr - 1) l
231 else f thr l
232
233
234 traceClique :: Map (Date, Date) [Clustering] -> String
235 traceClique mFis = foldl (\msg cpt -> msg <> show (countSup cpt cliques) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
236 where
237 --------------------------------------
238 cliques :: [Double]
239 cliques = sort $ map (fromIntegral . length . _clustering_roots) $ concat $ elems mFis
240 --------------------------------------
241
242
243 traceSupport :: Map (Date, Date) [Clustering] -> String
244 traceSupport mFis = foldl (\msg cpt -> msg <> show (countSup cpt supports) <> " (>" <> show (cpt) <> ") " ) "" [1..6]
245 where
246 --------------------------------------
247 supports :: [Double]
248 supports = sort $ map (fromIntegral . _clustering_support) $ concat $ elems mFis
249 --------------------------------------
250
251
252 traceFis :: [Char] -> Map (Date, Date) [Clustering] -> Map (Date, Date) [Clustering]
253 traceFis msg mFis = trace ( "\n" <> "-- | " <> msg <> " : " <> show (sum $ map length $ elems mFis) <> "\n"
254 <> "Support : " <> (traceSupport mFis) <> "\n"
255 <> "Nb Ngrams : " <> (traceClique mFis) <> "\n" ) mFis
256
257
258 ----------------
259 -- | Cluster| --
260 ----------------
261
262
263 getCliqueSupport :: Cluster -> Int
264 getCliqueSupport unit = case unit of
265 Fis s _ -> s
266 MaxClique _ _ _ -> 0
267
268 getCliqueSize :: Cluster -> Int
269 getCliqueSize unit = case unit of
270 Fis _ s -> s
271 MaxClique s _ _ -> s
272
273
274 --------------
275 -- | Cooc | --
276 --------------
277
278 listToCombi' :: [a] -> [(a,a)]
279 listToCombi' l = [(x,y) | (x:rest) <- tails l, y <- rest]
280
281 listToEqual' :: Eq a => [a] -> [(a,a)]
282 listToEqual' l = [(x,y) | x <- l, y <- l, x == y]
283
284 listToKeys :: Eq a => [a] -> [(a,a)]
285 listToKeys lst = (listToCombi' lst) ++ (listToEqual' lst)
286
287 listToMatrix :: [Int] -> Map (Int,Int) Double
288 listToMatrix lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
289
290 listToMatrix' :: [Ngrams] -> Map (Ngrams,Ngrams) Int
291 listToMatrix' lst = fromList $ map (\k -> (k,1)) $ listToKeys $ sort lst
292
293 listToSeq :: Eq a => [a] -> [(a,a)]
294 listToSeq l = nubBy (\x y -> fst x == fst y) $ [ (x,y) | (x:rest) <- tails l, y <- rest ]
295
296 sumCooc :: Cooc -> Cooc -> Cooc
297 sumCooc cooc cooc' = unionWith (+) cooc cooc'
298
299 getTrace :: Cooc -> Double
300 getTrace cooc = sum $ elems $ filterWithKey (\(k,k') _ -> k == k') cooc
301
302 coocToDiago :: Cooc -> Cooc
303 coocToDiago cooc = filterWithKey (\(k,k') _ -> k == k') cooc
304
305 coocToAdjacency :: Cooc -> Cooc
306 coocToAdjacency cooc = Map.map (\_ -> 1) cooc
307
308 -- | To build the local cooc matrix of each phylogroup
309 ngramsToCooc :: [Int] -> [Cooc] -> Cooc
310 ngramsToCooc ngrams coocs =
311 let cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty coocs
312 pairs = listToKeys ngrams
313 in filterWithKey (\k _ -> elem k pairs) cooc
314
315
316 ------------------
317 -- | Defaults | --
318 ------------------
319
320 -- | find the local maxima in a list of values
321 findMaxima :: [(Double,Double)] -> [Bool]
322 findMaxima lst = map (hasMax) $ toChunk 3 lst
323 where
324 ------
325 hasMax :: [(Double,Double)] -> Bool
326 hasMax chunk =
327 if (length chunk) /= 3
328 then False
329 else (snd(chunk !! 0) < snd(chunk !! 1)) && (snd(chunk !! 2) < snd(chunk !! 1))
330
331
332 -- | split a list into chunks of size n
333 toChunk :: Int -> [a] -> [[a]]
334 toChunk n = takeWhile ((== n) . length) . transpose . take n . iterate tail
335
336
337 -- | To compute the average degree from a cooc matrix
338 -- http://networksciencebook.com/chapter/2#degree
339 toAverageDegree :: Cooc -> Vector Ngrams -> Double
340 toAverageDegree cooc roots = 2 * (fromIntegral $ Map.size cooc) / (fromIntegral $ Vector.length roots)
341
342
343 -- | Use the giant component regime to estimate the default level
344 -- http://networksciencebook.com/chapter/3#networks-supercritical
345 regimeToDefaultLevel :: Cooc -> Vector Ngrams -> Double
346 regimeToDefaultLevel cooc roots
347 | avg == 0 = 1
348 | avg < 1 = avg * 0.6
349 | avg == 1 = 0.6
350 | avg < lnN = (avg * 0.2) / lnN
351 | otherwise = 0.2
352 where
353 avg :: Double
354 avg = toAverageDegree cooc roots
355 lnN :: Double
356 lnN = log (fromIntegral $ Vector.length roots)
357
358 coocToConfidence :: Phylo -> Cooc
359 coocToConfidence phylo =
360 let count = getRootsCount phylo
361 cooc = foldl (\acc cooc' -> sumCooc acc cooc') empty
362 $ elems $ getCoocByDate phylo
363 in Map.mapWithKey (\(a,b) w -> confidence a b w count) cooc
364 where
365 ----
366 -- confidence
367 confidence :: Int -> Int -> Double -> Map Int Double -> Double
368 confidence a b inter card = maximum [(inter / card ! a),(inter / card ! b)]
369
370
371 sumtest :: [Int] -> [Int] -> Int
372 sumtest l1 l2 = (head' "test" l1) + (head' "test" $ reverse l2)
373
374
375 findDefaultLevel :: Phylo -> Phylo
376 findDefaultLevel phylo =
377 let confidence = Map.filterWithKey (\(a,b) _ -> a /= b)
378 $ Map.filter (> 0.01)
379 $ coocToConfidence phylo
380 roots = getRoots phylo
381 level = regimeToDefaultLevel confidence roots
382 in updateLevel level phylo
383
384
385 --------------------
386 -- | PhyloGroup | --
387 --------------------
388
389 getGroupId :: PhyloGroup -> PhyloGroupId
390 getGroupId g = ((g ^. phylo_groupPeriod, g ^. phylo_groupScale), g ^. phylo_groupIndex)
391
392 getGroupNgrams :: PhyloGroup -> [Int]
393 getGroupNgrams g = g ^. phylo_groupNgrams
394
395 idToPrd :: PhyloGroupId -> Period
396 idToPrd id = (fst . fst) id
397
398 groupByField :: Ord a => (PhyloGroup -> a) -> [PhyloGroup] -> Map a [PhyloGroup]
399 groupByField toField groups = fromListWith (++) $ map (\g -> (toField g, [g])) groups
400
401 getPeriodPointers :: Filiation -> PhyloGroup -> [Pointer]
402 getPeriodPointers fil g =
403 case fil of
404 ToChilds -> g ^. phylo_groupPeriodChilds
405 ToParents -> g ^. phylo_groupPeriodParents
406 ToChildsMemory -> undefined
407 ToParentsMemory -> undefined
408
409 filterSimilarity :: PhyloSimilarity -> Double -> Double -> Bool
410 filterSimilarity similarity thr local =
411 case similarity of
412 WeightedLogJaccard _ _ -> local >= thr
413 WeightedLogSim _ _ -> local >= thr
414 Hamming _ _ -> undefined
415
416 getSimilarityName :: PhyloSimilarity -> String
417 getSimilarityName similarity =
418 case similarity of
419 WeightedLogJaccard _ _ -> "WLJaccard"
420 WeightedLogSim _ _ -> "WeightedLogSim"
421 Hamming _ _ -> "Hamming"
422
423 ---------------
424 -- | Phylo | --
425 ---------------
426
427 addPointers :: Filiation -> PointerType -> [Pointer] -> PhyloGroup -> PhyloGroup
428 addPointers fil pty pointers g =
429 case pty of
430 TemporalPointer -> case fil of
431 ToChilds -> g & phylo_groupPeriodChilds .~ pointers
432 ToParents -> g & phylo_groupPeriodParents .~ pointers
433 ToChildsMemory -> undefined
434 ToParentsMemory -> undefined
435 ScalePointer -> case fil of
436 ToChilds -> g & phylo_groupScaleChilds .~ pointers
437 ToParents -> g & phylo_groupScaleParents .~ pointers
438 ToChildsMemory -> undefined
439 ToParentsMemory -> undefined
440
441 toPointer' :: Double -> Pointer -> Pointer'
442 toPointer' thr pt = (fst pt,(thr,snd pt))
443
444
445 addMemoryPointers :: Filiation -> PointerType -> Double -> [Pointer] -> PhyloGroup -> PhyloGroup
446 addMemoryPointers fil pty thr pointers g =
447 case pty of
448 TemporalPointer -> case fil of
449 ToChilds -> undefined
450 ToParents -> undefined
451 ToChildsMemory -> g & phylo_groupPeriodMemoryChilds .~ (concat [(g ^. phylo_groupPeriodMemoryChilds),(map (\pt -> toPointer' thr pt) pointers)])
452 ToParentsMemory -> g & phylo_groupPeriodMemoryParents .~ (concat [(g ^. phylo_groupPeriodMemoryParents),(map (\pt -> toPointer' thr pt) pointers)])
453 ScalePointer -> undefined
454
455
456 getPeriodIds :: Phylo -> [(Date,Date)]
457 getPeriodIds phylo = sortOn fst
458 $ keys
459 $ phylo ^. phylo_periods
460
461 getLevelParentId :: PhyloGroup -> PhyloGroupId
462 getLevelParentId g = fst $ head' "getLevelParentId" $ g ^. phylo_groupScaleParents
463
464 getLastLevel :: Phylo -> Scale
465 getLastLevel phylo = last' "lastLevel" $ getScales phylo
466
467 getScales :: Phylo -> [Scale]
468 getScales phylo = nub
469 $ map snd
470 $ keys $ view ( phylo_periods
471 . traverse
472 . phylo_periodScales ) phylo
473
474 getSeaElevation :: Phylo -> SeaElevation
475 getSeaElevation phylo = seaElevation (getConfig phylo)
476
477 getSimilarity :: Phylo -> PhyloSimilarity
478 getSimilarity phylo = similarity (getConfig phylo)
479
480
481 getPhyloSeaRiseStart :: Phylo -> Double
482 getPhyloSeaRiseStart phylo = case (getSeaElevation phylo) of
483 Constante s _ -> s
484 Adaptative _ -> 0
485 Evolving _ -> 0
486
487 getPhyloSeaRiseSteps :: Phylo -> Double
488 getPhyloSeaRiseSteps phylo = case (getSeaElevation phylo) of
489 Constante _ s -> s
490 Adaptative s -> s
491 Evolving _ -> 0.1
492
493
494 getConfig :: Phylo -> PhyloConfig
495 getConfig phylo = (phylo ^. phylo_param) ^. phyloParam_config
496
497 getLevel :: Phylo -> Double
498 getLevel phylo = _phylo_level phylo
499
500 getLadder :: Phylo -> [Double]
501 getLadder phylo = phylo ^. phylo_seaLadder
502
503 getCoocByDate :: Phylo -> Map Date Cooc
504 getCoocByDate phylo = coocByDate (phylo ^. phylo_counts)
505
506 getDocsByDate :: Phylo -> Map Date Double
507 getDocsByDate phylo = docsByDate (phylo ^. phylo_counts)
508
509 getRootsCount :: Phylo -> Map Int Double
510 getRootsCount phylo = rootsCount (phylo ^. phylo_counts)
511
512 getRootsFreq :: Phylo -> Map Int Double
513 getRootsFreq phylo = rootsFreq (phylo ^. phylo_counts)
514
515 getLastRootsFreq :: Phylo -> Map Int Double
516 getLastRootsFreq phylo = lastRootsFreq (phylo ^. phylo_counts)
517
518 setConfig :: PhyloConfig -> Phylo -> Phylo
519 setConfig config phylo = phylo
520 & phylo_param .~ (PhyloParam
521 ((phylo ^. phylo_param) ^. phyloParam_version)
522 ((phylo ^. phylo_param) ^. phyloParam_software)
523 config)
524
525 -- & phylo_param & phyloParam_config & phyloParam_config .~ config
526
527
528 getRoots :: Phylo -> Vector Ngrams
529 getRoots phylo = (phylo ^. phylo_foundations) ^. foundations_roots
530
531 getRootsInGroups :: Phylo -> Map Int [PhyloGroupId]
532 getRootsInGroups phylo = (phylo ^. phylo_foundations) ^. foundations_rootsInGroups
533
534 getSources :: Phylo -> Vector Text
535 getSources phylo = _sources (phylo ^. phylo_sources)
536
537
538 -- get the groups distributed by branches at the last scale
539 phyloLastScale :: Phylo -> [[PhyloGroup]]
540 phyloLastScale phylo = elems
541 $ fromListWith (++)
542 $ map (\g -> (g ^. phylo_groupBranchId, [g]))
543 $ getGroupsFromScale (last' "byBranches" $ getScales phylo) phylo
544
545 getGroupsFromScale :: Scale -> Phylo -> [PhyloGroup]
546 getGroupsFromScale lvl phylo =
547 elems $ view ( phylo_periods
548 . traverse
549 . phylo_periodScales
550 . traverse
551 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
552 . phylo_scaleGroups ) phylo
553
554
555 getGroupsFromScalePeriods :: Scale -> [Period] -> Phylo -> [PhyloGroup]
556 getGroupsFromScalePeriods lvl periods phylo =
557 elems $ view ( phylo_periods
558 . traverse
559 . filtered (\phyloPrd -> elem (phyloPrd ^. phylo_periodPeriod) periods)
560 . phylo_periodScales
561 . traverse
562 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
563 . phylo_scaleGroups ) phylo
564
565
566 getGroupsFromPeriods :: Scale -> Map Period PhyloPeriod -> [PhyloGroup]
567 getGroupsFromPeriods lvl periods =
568 elems $ view ( traverse
569 . phylo_periodScales
570 . traverse
571 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
572 . phylo_scaleGroups ) periods
573
574
575 updatePhyloGroups :: Scale -> Map PhyloGroupId PhyloGroup -> Phylo -> Phylo
576 updatePhyloGroups lvl m phylo =
577 over ( phylo_periods
578 . traverse
579 . phylo_periodScales
580 . traverse
581 . filtered (\phyloLvl -> phyloLvl ^. phylo_scaleScale == lvl)
582 . phylo_scaleGroups
583 . traverse
584 ) (\g ->
585 let id = getGroupId g
586 in
587 if member id m
588 then m ! id
589 else g ) phylo
590
591 updatePeriods :: Map (Date,Date) (Text,Text) -> Phylo -> Phylo
592 updatePeriods periods' phylo =
593 over (phylo_periods . traverse)
594 (\prd ->
595 let prd' = periods' ! (prd ^. phylo_periodPeriod)
596 lvls = map (\lvl -> lvl & phylo_scalePeriodStr .~ prd') $ prd ^. phylo_periodScales
597 in prd & phylo_periodPeriodStr .~ prd'
598 & phylo_periodScales .~ lvls
599 ) phylo
600
601 updateQuality :: Double -> Phylo -> Phylo
602 updateQuality quality phylo = phylo { _phylo_quality = quality }
603
604 updateLevel :: Double -> Phylo -> Phylo
605 updateLevel level phylo = phylo { _phylo_level = level }
606
607 traceToPhylo :: Scale -> Phylo -> Phylo
608 traceToPhylo lvl phylo =
609 trace ("\n" <> "-- | End of phylo making at level " <> show (lvl) <> " with "
610 <> show (length $ getGroupsFromScale lvl phylo) <> " groups and "
611 <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromScale lvl phylo) <> " branches" <> "\n") phylo
612
613 --------------------
614 -- | Clustering | --
615 --------------------
616
617 mergeBranchIds :: [[Int]] -> [Int]
618 mergeBranchIds ids = (head' "mergeBranchIds" . sort . mostFreq') ids
619 where
620 -- | 2) find the most Up Left ids in the hierarchy of similarity
621 -- mostUpLeft :: [[Int]] -> [[Int]]
622 -- mostUpLeft ids' =
623 -- let groupIds = (map (\gIds -> (length $ head' "gIds" gIds, head' "gIds" gIds)) . groupBy (\id id' -> length id == length id') . sortOn length) ids'
624 -- inf = (fst . minimum) groupIds
625 -- in map snd $ filter (\gIds -> fst gIds == inf) groupIds
626 -- | 1) find the most frequent ids
627 mostFreq' :: [[Int]] -> [[Int]]
628 mostFreq' ids' =
629 let groupIds = (map (\gIds -> (length gIds, head' "gIds" gIds)) . group . sort) ids'
630 sup = (fst . maximum) groupIds
631 in map snd $ filter (\gIds -> fst gIds == sup) groupIds
632
633
634 mergeMeta :: [Int] -> [PhyloGroup] -> Map Text [Double]
635 mergeMeta bId groups =
636 let ego = head' "mergeMeta" $ filter (\g -> (snd (g ^. phylo_groupBranchId)) == bId) groups
637 in fromList [("breaks",(ego ^. phylo_groupMeta) ! "breaks"),("seaLevels",(ego ^. phylo_groupMeta) ! "seaLevels")]
638
639
640 groupsToBranches' :: Map PhyloGroupId PhyloGroup -> [[PhyloGroup]]
641 groupsToBranches' groups =
642 {- run the related component algorithm -}
643 let egos = map (\g -> [getGroupId g]
644 ++ (map fst $ g ^. phylo_groupPeriodParents)
645 ++ (map fst $ g ^. phylo_groupPeriodChilds)
646 ++ (map fst $ g ^. phylo_groupAncestors)) $ elems groups
647 graph = relatedComponents egos
648 {- update each group's branch id -}
649 in map (\ids ->
650 let groups' = elems $ restrictKeys groups (Set.fromList ids)
651 bId = mergeBranchIds $ map (\g -> snd $ g ^. phylo_groupBranchId) groups'
652 in map (\g -> g & phylo_groupBranchId %~ (\(lvl,_) -> (lvl,bId))) groups') graph
653
654
655 relatedComponents :: Ord a => [[a]] -> [[a]]
656 relatedComponents graph = foldl' (\branches groups ->
657 if (null branches)
658 then branches ++ [groups]
659 else
660 let branchPart = partition (\branch -> disjoint (Set.fromList branch) (Set.fromList groups)) branches
661 in (fst branchPart) ++ [nub $ concat $ (snd branchPart) ++ [groups]]) [] graph
662
663
664 toRelatedComponents :: [PhyloGroup] -> [((PhyloGroup,PhyloGroup),Double)] -> [[PhyloGroup]]
665 toRelatedComponents nodes edges =
666 let ref = fromList $ map (\g -> (getGroupId g, g)) nodes
667 clusters = relatedComponents $ ((map (\((g,g'),_) -> [getGroupId g, getGroupId g']) edges) ++ (map (\g -> [getGroupId g]) nodes))
668 in map (\cluster -> map (\gId -> ref ! gId) cluster) clusters
669
670
671 traceSynchronyEnd :: Phylo -> Phylo
672 traceSynchronyEnd phylo =
673 trace ( "-- | End synchronic clustering at level " <> show (getLastLevel phylo)
674 <> " with " <> show (length $ getGroupsFromScale (getLastLevel phylo) phylo) <> " groups"
675 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromScale (getLastLevel phylo) phylo) <> " branches"
676 <> "\n" ) phylo
677
678 traceSynchronyStart :: Phylo -> Phylo
679 traceSynchronyStart phylo =
680 trace ( "\n" <> "-- | Start synchronic clustering at level " <> show (getLastLevel phylo)
681 <> " with " <> show (length $ getGroupsFromScale (getLastLevel phylo) phylo) <> " groups"
682 <> " and " <> show (length $ nub $ map _phylo_groupBranchId $ getGroupsFromScale (getLastLevel phylo) phylo) <> " branches"
683 <> "\n" ) phylo
684
685
686 -------------------
687 -- | Similarity | --
688 -------------------
689
690 getSensibility :: PhyloSimilarity -> Double
691 getSensibility proxi = case proxi of
692 WeightedLogJaccard s _ -> s
693 WeightedLogSim s _ -> s
694 Hamming _ _ -> undefined
695
696 getMinSharedNgrams :: PhyloSimilarity -> Int
697 getMinSharedNgrams proxi = case proxi of
698 WeightedLogJaccard _ m -> m
699 WeightedLogSim _ m -> m
700 Hamming _ _ -> undefined
701
702 ----------------
703 -- | Branch | --
704 ----------------
705
706 intersectInit :: Eq a => [a] -> [a] -> [a] -> [a]
707 intersectInit acc lst lst' =
708 if (null lst) || (null lst')
709 then acc
710 else if (head' "intersectInit" lst) == (head' "intersectInit" lst')
711 then intersectInit (acc ++ [head' "intersectInit" lst]) (tail lst) (tail lst')
712 else acc
713
714 branchIdsToSimilarity :: PhyloBranchId -> PhyloBranchId -> Double -> Double -> Double
715 branchIdsToSimilarity id id' thrInit thrStep = thrInit + thrStep * (fromIntegral $ length $ intersectInit [] (snd id) (snd id'))
716
717 ngramsInBranches :: [[PhyloGroup]] -> [Int]
718 ngramsInBranches branches = nub $ foldl (\acc g -> acc ++ (g ^. phylo_groupNgrams)) [] $ concat branches
719
720
721 traceMatchSuccess :: Double -> Double -> Double -> [[[PhyloGroup]]] -> [[[PhyloGroup]]]
722 traceMatchSuccess thr qua qua' nextBranches =
723 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . init . snd)
724 $ (head' "trace" $ head' "trace" $ head' "trace" nextBranches) ^. phylo_groupBranchId))
725 <> ",(1.." <> show (length nextBranches) <> ")]"
726 <> " | " <> show ((length . concat . concat) nextBranches) <> " groups" <> "\n"
727 <> " - splited with success in " <> show (map length nextBranches) <> " sub-branches" <> "\n"
728 <> " - for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " < " <> show(qua') <> ")\n" ) nextBranches
729
730
731 traceMatchFailure :: Double -> Double -> Double -> [[PhyloGroup]] -> [[PhyloGroup]]
732 traceMatchFailure thr qua qua' branches =
733 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
734 <> ",(1.." <> show (length branches) <> ")]"
735 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
736 <> " - split with failure for the local threshold " <> show (thr) <> " ( quality : " <> show (qua) <> " > " <> show(qua') <> ")\n"
737 ) branches
738
739
740 traceMatchNoSplit :: [[PhyloGroup]] -> [[PhyloGroup]]
741 traceMatchNoSplit branches =
742 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
743 <> ",(1.." <> show (length branches) <> ")]"
744 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
745 <> " - unable to split in smaller branches" <> "\n"
746 ) branches
747
748
749 traceMatchLimit :: [[PhyloGroup]] -> [[PhyloGroup]]
750 traceMatchLimit branches =
751 trace ( "\n" <> "-- local branches : " <> (init $ show ((init . snd) $ (head' "trace" $ head' "trace" branches) ^. phylo_groupBranchId))
752 <> ",(1.." <> show (length branches) <> ")]"
753 <> " | " <> show (length $ concat branches) <> " groups" <> "\n"
754 <> " - unable to increase the threshold above 1" <> "\n"
755 ) branches
756
757
758 traceMatchEnd :: [PhyloGroup] -> [PhyloGroup]
759 traceMatchEnd groups =
760 trace ("\n" <> "-- | End temporal matching with " <> show (length $ nub $ map (\g -> g ^. phylo_groupBranchId) groups)
761 <> " branches and " <> show (length groups) <> " groups" <> "\n") groups
762
763
764 traceTemporalMatching :: [PhyloGroup] -> [PhyloGroup]
765 traceTemporalMatching groups =
766 trace ( "\n" <> "-- | Start temporal matching for " <> show(length groups) <> " groups" <> "\n") groups
767
768
769 traceGroupsProxi :: [Double] -> [Double]
770 traceGroupsProxi l =
771 trace ( "\n" <> "-- | " <> show(List.length l) <> " computed pairs of groups Similarity" <> "\n") l