module Phylomemy.TemporalMatching where -- import Debug.Pretty.Simple (pTraceShow) import Control.Monad (Monad (..), foldM, forM_, unless) import Control.Monad.ST qualified as ST import Data.Bool (otherwise) import Data.Either (fromLeft) import Data.Eq (Eq (..)) import Data.Foldable (any, foldMap', toList) import Data.Function (($), (&), (.)) import Data.Functor (Functor (..), (<$>), (<&>)) import Data.Int (Int) import Data.List qualified as List import Data.Map.Strict qualified as Map import Data.Maybe (Maybe (..)) import Data.Ord (Ord (..)) import Data.Ord qualified as Ord import Data.Ratio (Rational, (%)) import Data.Scientific (toBoundedRealFloat) import Data.Semigroup (Semigroup (..)) import Data.Sequence (Seq) import Data.Sequence qualified as Seq import Data.Set (Set) import Data.Set qualified as Set import Data.Tree qualified as Tree import Data.Tuple (uncurry) import GHC.Stack (HasCallStack) import Logic hiding ((/)) import Numeric.Decimal qualified as Decimal import Numeric.Probability import Text.Show (Show) import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^)) import Clustering.FrequentItemSet.BruteForce qualified as Clustering import Clustering.UnionFind.ST qualified as UnionFind import Phylomemy.Indexation type Similarity = Probability cardinal :: Num i => Set a -> i cardinal = fromIntegral . Set.size -- TODO: implement biased similarity from the paper similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity similarityJaccard x y = cardinal (Set.intersection x y) % cardinal (Set.union x y) & probability & Decimal.arithError -- | A `MaximalSpanningTree` (MST) is one (possibly out of many) -- tree spanning accross all the given `(range, cluster)` nodes, -- with the maximal sum of edges' `Similarity` between them. -- -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree -- -- Viewing a phylomemy as a `MaximalSpanningTree` -- is the crux of understanding how it is computed: -- -- - the `mstMinimalSimilarity` is the next `Similarity` -- that will split the `MaximalSpanningTree` into two or more `MaximalSpanningTree`s. -- -- - it explains what the "scale of a phylomemy" is: -- merging clusters of the same `range` -- when they still belong to the same `MaximalSpanningTree`. -- -- ImplementationNote: using a `Tree.Tree` to represent a `MaximalSpanningTree` -- (instead of an adjacency edge map for instance) -- is motivated by the need to implement `mstSplit`, -- which needs to gather the `(range, cluster)` nodes -- of each `MaximalSpanningTree` resulting from the cut, -- because knowing that is required by `msfGlobalQuality`. -- -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology" -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/ -- -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph" -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952 -- -- WarningNote: "The tree of one percent" -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster) data MSTNode range cluster = MSTNode { mstNodeSimilarity :: Similarity -- ^ The `similarity` of the parent edge of this node -- ie. of the only edge getting closer to the root node of the `MaximalSpanningTree`. -- -- ImplementationNote: `maximalSpanningForest` puts a `Similarity` of `1` for the root node. -- and `mstSplit` leaves the `mstMinimalSimilarity` of the edge before splitting out the `MaximalSpanningTree`. , mstNodeRangeCluster :: (range, cluster) } deriving (Show) -- | A (disjoint) forest of `MaximalSpanningTree`s, -- indexed by the minimal @(range, cluster)@ node -- of each `MaximalSpanningTree`. type MaximalSpanningForest range cluster = [MaximalSpanningTree range cluster] -- | @(`maximalSpanningForest` similarity rangeToClusterToDocs)@ -- uses the Kruskal algorithm to find the trees spanning all -- the given `(range, cluster)` nodes in `rangeToClusterToDocs` -- with the maximal total `similarity` between them. -- -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm maximalSpanningForest :: forall range cluster doc. HasCallStack => Ord range => Ord cluster => {-similarityMeasure :::-} (cluster -> cluster -> Similarity) -> {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root doc) -> MaximalSpanningForest range cluster maximalSpanningForest similarity rangeToClusterToDocs = ST.runST do -- DescriptionNote: create a `Point` for each given node -- each Point containing a root node and a maximal spanning tree. rangeToClusterToPoint :: range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster) <- rangeToClusterToDocs & Map.traverseWithKey \srcR -> Map.traverseWithKey \srcC _docs -> UnionFind.fresh $ Tree.Node MSTNode { mstNodeSimilarity = proba1 , mstNodeRangeCluster = (srcR, srcC) } [] -- DescriptionNote: iterate from the greatest to the lowest similarity edge. forM_ (allEdges & Map.toDescList) \(simil, edges) -> do -- Iterate through all the edges of that `Similarity`. forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint unless alreadyInSameMST do -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`. UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) -> return $ Tree.Node srcRoot $ Tree.Node MSTNode { mstNodeSimilarity = simil , mstNodeRangeCluster = mstNodeRangeCluster dstRoot } dstForest : srcForest rootForest :: [MaximalSpanningTree range cluster] <- rangeToClusterToPoint -- DescriptionNote: collect all the Points. & Map.elems & List.concatMap Map.elems -- DescriptionNote: keep only the `MaximalSpanningTree`s -- contained in non-redundant `Point`s. & (`foldM` []) \acc p -> do isRedundant <- UnionFind.redundant p if isRedundant then return acc else UnionFind.descriptor p <&> (: acc) return rootForest where -- Order `rangeToClusterToDocs` by `Similarity`, -- which will enable to add edges to the spanning tree in decreasing `Similarity` -- and hence build *a* maximal spanning tree. allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster)) allEdges = Map.unionsWith (Seq.><) [ Map.singleton (similarity srcC dstC) (Seq.singleton (src, dst)) | (srcR, srcClusterToDocuments) <- rangeToClusterToDocs & Map.toList , (srcC, _docs) <- srcClusterToDocuments & Map.toList , let src = (srcR, srcC) , -- ExplanationNote: it does not matter whether lower or greater ranges are used for the destination nodes, -- as they contain the very same undirected edges and thus `Similarity`, -- hence would produce the same splitting `Similarity`s, -- though not necessarily the same `MaximalSpanningTree`. let (_, dstRangeToClusterToDocs) = Map.split srcR rangeToClusterToDocs , (dstR, dstClusterToDocs) <- dstRangeToClusterToDocs & Map.toList , (dstC, _docs) <- dstClusterToDocs & Map.toList , let dst = (dstR, dstC) ] -- | "sea-level rise algorithm" -- -- See: in `Phylomemy.References.RefDrawMeScience`, -- « C.5 The sea-level rise algorithm and its implementation in Gargantext » msfPrune :: forall range roots predictionMeasure. HasCallStack => Show range => Ord range => predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) -> roots ::: Set Root -> MaximalSpanningForest range Cluster -> MaximalSpanningForest range Cluster msfPrune predictionMeasure roots = loop (return proba0) [] where loop :: Decimal.Arith Probability -> MaximalSpanningForest range Cluster -> MaximalSpanningForest range Cluster -> MaximalSpanningForest range Cluster loop previousQuality doneBranches currentBranches = -- pTraceShow (["msfPrune", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $ case currentBranches of [] -> doneBranches currentBranch : todoBranches -> let splitBranches = mstSplit currentBranch in -- pTraceShow (["msfPrune", "loop", "splitBranches"], ("size", Map.size splitBranches)) $ if List.length splitBranches <= 1 then loop previousQuality (doneBranches <> splitBranches) todoBranches else letName (fmap mstNodes $ (doneBranches <> splitBranches) <> todoBranches) \mstToNodes -> let splitQuality = msfGlobalQuality predictionMeasure roots mstToNodes in if Decimal.arithError previousQuality < Decimal.arithError splitQuality then -- CorrectnessFixMe: these `Decimal.arithError` may raise "arithmetic overflow", -- it has to be understood. loop splitQuality doneBranches (splitBranches <> todoBranches) else loop previousQuality (doneBranches <> splitBranches) todoBranches -- | @(`mstSplit` mst)@ returns the `MaximalSpanningTree`s -- resulting from pruning out the given `MaximalSpanningTree` -- of all the edges having the minimal `Similarity` of that `MaximalSpanningTree`. mstSplit :: forall range cluster. HasCallStack => Ord range => Ord cluster => MaximalSpanningTree range cluster -> MaximalSpanningForest range cluster mstSplit mst = -- TODO: take minSimil as argument to avoid recomputing it when already known case mstMinimalSimilarity mst of Nothing -> [mst] Just minSimil -> cutMerge mst where cutMerge = uncurry (:) . cut cut :: MaximalSpanningTree range cluster -> (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster]) cut (Tree.Node node children) = let (keptChildren, cutChildren) = children & List.partition \tree -> minSimil < mstNodeSimilarity (Tree.rootLabel tree) in let cutChildrenRoots = cutChildren & List.concatMap cutMerge in let (keptChildrenForest, cutChildrenTree) = List.unzip (cut <$> keptChildren) in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge in ( Tree.Node node keptChildrenForest , (cutChildrenRoots List.++ cutChildrenTreeRoots) -- WarningNote: the root nodes of those new `MaximalSpanningTree`s -- keep their `mstNodeSimilarity` to their cutting value, -- which is lower than their `mstMinimalSimilarity`. ) mstMinimalSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity mstMinimalSimilarity (Tree.Node _rootNode rootBranches) | List.null rootBranches = Nothing | otherwise = -- ExplanationNote: the root node of a `MaximalSpanningTree`, -- being a root node, does not have a parent, -- hence its `mstNodeSimilarity` must be ignored. Just $ List.minimum $ rootBranches <&> Tree.foldTree \node accs -> List.minimum $ (mstNodeSimilarity node) : accs -- | @(`mstSplittingSimilarities` mst)@ -- returns the `Similarity`s causing the given `MaximalSpanningForest` -- to split into further more `MaximalSpanningForest`. mstSplittingSimilarities :: MaximalSpanningTree range cluster -> Set Similarity mstSplittingSimilarities (Tree.Node _rootNode rootBranches) | List.null rootBranches = Set.empty | otherwise = rootBranches & foldMap' ( Tree.foldTree \node accs -> Set.unions (Set.singleton (mstNodeSimilarity node) : accs) ) type Scale = Int -- > 0 -- | @(`mstScales` msf)@ -- returns the successive results of calling `mstSplit` on the given `MaximalSpanningTree`s in @(msf)@ -- until each one of them is a single node. -- Even though the greatest `Scale`s may be different for each `MaximalSpanningTree` of @(msf)@, -- the result are grouped by `Scale` then by `MaximalSpanningTree`, -- which enabled to have a maximal `Scale` for the whole `msf`. mstScales :: forall range cluster. Ord range => Ord cluster => MaximalSpanningForest range cluster -> Scale :-> {-mst-} Int :-> MaximalSpanningForest range cluster mstScales initMSF = Map.unionsWith (Map.unionWith (List.++)) [ Map.fromDistinctAscList [ (scale, Map.singleton mstI msf) | (scale, msf) <- scaleToMsf ] | (mstI, mst) <- initMSF & List.zip [1 :: Int ..] , let scaleToMsf :: [(Scale, MaximalSpanningForest range cluster)] = -- scanl :: (acc -> scale -> acc) -> acc -> [scale] -> [acc] splitSimils & Map.keys & List.scanl' (\(scale, msf) _minSimil -> (scale + 1, msf >>= mstSplit)) (1, [mst]) ] where splitSimils :: Similarity :-> () = Map.unions [ mst & Tree.foldTree \node accs -> Map.unionsWith (\() () -> ()) (Map.singleton (mstNodeSimilarity node) () : accs) | mst <- initMSF ] -- | @(`mstNodes` mst)@ returns the nodes of the given @(mst)@. mstNodes :: HasCallStack => Ord range => Ord cluster => MaximalSpanningTree range cluster -> Set (range, cluster) mstNodes = foldMap' (Set.singleton . mstNodeRangeCluster) -- | The global quality of a `Phylomemy`. -- -- DescriptionNote: `msfGlobalQuality` counts the `(range, Cluster)` nodes of a branch -- but that a `(range, Cluster)` in itself can gather several documents. msfGlobalQuality :: HasCallStack => Ord range => predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) -> roots ::: Set Root -> mstToNodes ::: [Set (range, Cluster)] -> Decimal.Arith Similarity msfGlobalQuality (Named predictionMeasure) (Named roots) (Named mstToNodes) = List.sum [ probability (1 % cardinal roots) * List.sum [ probability (cardinal retrievedNodes % nodesTotal) * predictionMeasure relevantNodes retrievedNodes | retrievedNodes <- branches ] | root <- roots & toList , -- CorrectnessNote: ignore branches not containing any requested `root` let branches = mstToNodes & List.filter (any (\(_r, c) -> Set.member root c)) , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c)) ] -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes -- prédictions de la classe pertinente (bonne précision) -- en suffisamment grand nombre (bon rappel) -- sur un jeu de données cible. -- -- Tout comme la précision et le rappel, -- la F-mesure varie de 0 (plus mauvaise valeur) -- à 1 (meilleure valeur possible). -- -- ExplanationNote: https://en.wikipedia.org/wiki/F-score predictionMeasureF :: HasCallStack => Ord range => {-lambda :::-} -- | Trade-off between `precision` and `recall`. -- See https://en.wikipedia.org/wiki/Precision_and_recall -- -- > [It] predetermine[s] the desired shape of the phylomemy: a continent -- > (i.e., one large branch) or an archipelago of elements of knowledge -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s -- > discretion in light of her own expertise and research questions, which makes -- > any phylomemy an artifact of the researcher’s perception -- -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts. Similarity -> Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity predictionMeasureF lambda relevantNodes retrievedNodes | lambda == proba0 = probability precision | lambda == proba1 = probability recall | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision) where precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes recall = cardinal relevantRetrievedNodes % cardinal relevantNodes relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1 -- ExplanationNote: the `tan` is just to spread the `lambda ∈ [0,1]` from -∞ to +∞ -- Two commonly used values for β are: -- - 2, which weighs recall higher than precision, -- - and 0.5, which weighs recall lower than precision. beta = tan (lambdaDouble * pi / 2) betaSquare :: Rational betaSquare = beta ^ (2 :: Int) & toRational