{-# LANGUAGE DeriveFunctor #-} {-# LANGUAGE RankNTypes #-} module Phylomemy.TemporalMatching where -- import Data.Traversable (traverse) -- import Debug.Pretty.Simple (pTraceShow, pTraceShowId) import Control.Monad (Monad (..), foldM, forM_, unless) import Control.Monad.ST qualified as ST import Data.Bool (otherwise) import Data.Either (fromLeft) import Data.Eq (Eq (..)) import Data.Foldable (any, foldMap', toList) import Data.Function (($), (&), (.)) import Data.Functor (Functor (..), (<$>), (<&>)) import Data.Int (Int) import Data.List qualified as List import Data.Map.Strict qualified as Map import Data.Maybe (Maybe (..)) import Data.Ord (Ord (..)) import Data.Ord qualified as Ord import Data.Ratio (Rational, (%)) import Data.Scientific (toBoundedRealFloat) import Data.Semigroup (Semigroup (..)) import Data.Sequence (Seq) import Data.Sequence qualified as Seq import Data.Set (Set) import Data.Set qualified as Set import Data.Tree qualified as Tree import Data.Tuple (uncurry) import GHC.Stack (HasCallStack) import Logic hiding ((/)) import Numeric.Decimal qualified as Decimal import Numeric.Probability import Text.Show (Show) import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^)) import Clustering.FrequentItemSet.BruteForce qualified as Clustering import Clustering.UnionFind.ST qualified as UnionFind import Phylomemy.Indexation type Similarity = Probability cardinal :: Num i => Set a -> i cardinal = fromIntegral . Set.size -- TODO: implement biased similarity from the paper similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity similarityJaccard x y = cardinal (Set.intersection x y) % cardinal (Set.union x y) & probability & Decimal.arithError -- | `AllSimilarities` maps each @(range, cluster)@ node -- to *all* the other @(range, cluster)@ nodes from a different @(range)@. -- -- Note that the destination nodes are first ordered by their range -- which enable to find the closest from the source's range. type AllSimilarities range cluster = {-source-} range :-> cluster :-> {-destination-} Direction (range :-> Similarity :-> Seq cluster) -- | A phylomemy can be viewed in two directions. data Direction a = Direction { directionUpstream :: a -- ^ Lookup for the closest similar enough destinations -- in ranges lower than its source's range. , directionDownstream :: a -- ^ Lookup for the closest similar enough destinations -- in ranges greater than its source's range. } deriving (Functor, Show) -- | @(`allSimilarities` similarityMeasure rangeToClusterToDocuments)@ -- computes all the temporal similarities for the given documents. allSimilarities :: Ord range => {-similarityMeasure :::-} (cluster -> cluster -> Similarity) -> {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root (Document pos)) -> AllSimilarities range cluster allSimilarities similarity rangeToClusterToDocs = rangeToClusterToDocs & Map.mapWithKey \srcR -> Map.mapWithKey \srcC _docs -> let (rangeToClusterToDocsLT, rangeToClusterToDocsGT) = Map.split srcR rangeToClusterToDocs goStream = Map.mapMaybeWithKey \_rangeNE dstCD -> deleteEmptyMap $ Map.fromListWith (Seq.><) [ (simil, Seq.singleton dstC) | dstC <- dstCD & Map.keys , let simil = similarity srcC dstC , proba0 < simil ] in Direction { directionUpstream = goStream rangeToClusterToDocsLT , directionDownstream = goStream rangeToClusterToDocsGT } where deleteEmptyMap m | Map.null m = Nothing | otherwise = Just m -- -- | @(`similarities` phy)@ -- -- returns the set of `Similarity`s used by the given `Phylomemy`. -- similaritiesLevels :: AllSimilarities range cluster -> Set Similarity -- similaritiesLevels = -- Map.foldMapWithKey \_ -> -- Map.foldMapWithKey \_ dir -> -- Map.foldMapWithKey (\_ -> Set.fromList . Map.keys) (directionDownstream dir) -- -- | @(`phylomemyRaise` minSimil phy)@ -- -- filter-in only edges whose similarity is greater or equal to the given @(minSimil)@. -- phylomemyRaise :: Similarity -> AllSimilarities range cluster -> AllSimilarities range cluster -- phylomemyRaise minSimil = -- Map.map $ -- Map.map $ -- fmap @Direction $ -- Map.mapMaybe $ -- (deleteEmptyMap .) $ \childSC -> -- let (_lt, eqM, gt) = Map.splitLookup minSimil childSC -- in maybe gt (\eq -> Map.insert minSimil eq gt) eqM -- where -- deleteEmptyMap m -- | Map.null m = Nothing -- | otherwise = Just m -- | A `MaximalSpanningTree` is one (possibly out of many) -- maximal (similarity) spanning tree (MST), -- mapping each source @(range, cluster)@ to many destination @(range, cluster)@, -- from the lowest to the greatest `Similarity`. -- -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree -- -- Viewing a phylomemy as a MST is the crux of understanding how it is computed: -- -- - it makes it easy to prune edges with low similarity -- and know how many `MaximalSpanningTree`es this creates. -- -- - it explains what the "scale of a phylomemy" is: -- considering clusters of the same range and same MST as a single node. -- -- The order in which edges may be deleted is known in advance (aka. "offline"): -- it goes from the lowest similarity (0) to the highest similarity (1) -- (tie breaking with an order on the @(range, cluster)@ nodes) -- therefore computing a maximal spanning tree (MaximalSpanningTree) wrt. similarities -- gives: -- -- - the value of the next `Similarity` causing a split: -- it is the minimal `Similarity` inside this `MaximalSpanningTree`. -- -- - the resulting `MaximalSpanningTree`s after removing the edges -- with that minimal `Similarity`. -- -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology" -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/ -- -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph" -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952 -- -- WarningNote: "The tree of one percent" -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster) data MSTNode range cluster = MSTNode { mstNodeSimilarity :: Similarity , mstNodeRangeCluster :: (range, cluster) -- , nodeMSTMinSimilarity :: Min Similarity } deriving (Show) -- | A (disjoint) forest of `MaximalSpanningTree`s, -- indexed by the minimal @(range, cluster)@ node -- of each `MaximalSpanningTree`. type MaximalSpanningTrees range cluster = [MaximalSpanningTree range cluster] -- | @(`maximalSpanningTrees` allSimils)@ -- uses the Kruskal algorithm to find the maximal spanning trees -- of the given `AllSimilarities`. -- -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm maximalSpanningTrees :: forall range cluster. Ord range => Ord cluster => AllSimilarities range cluster -> MaximalSpanningTrees range cluster maximalSpanningTrees allSimils = ST.runST do -- DescriptionNote: create a `Point` for each given node -- each Point containing a root node and a maximal spanning tree. rangeToClusterToPoint :: (range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster)) <- allSimils & Map.traverseWithKey \srcR -> Map.traverseWithKey \srcC _similToEdges -> UnionFind.fresh $ Tree.Node MSTNode { mstNodeSimilarity = proba1 , mstNodeRangeCluster = (srcR, srcC) } [] -- DescriptionNote: iterate from the greatest to the lowest similarity edge. forM_ (allEdges & Map.toDescList) \(simil, edges) -> do -- Iterate through all the edges of that `Similarity`. forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint unless alreadyInSameMST do -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`. UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) -> return $ Tree.Node srcRoot $ Tree.Node MSTNode { mstNodeSimilarity = simil , mstNodeRangeCluster = mstNodeRangeCluster dstRoot } dstForest : srcForest rootTrees :: [MaximalSpanningTree range cluster] <- rangeToClusterToPoint -- DescriptionNote: collect all the Points. & Map.elems & List.concatMap Map.elems -- DescriptionNote: keep only the `MaximalSpanningTree`s -- contained in non-redundant `Point`s. & (`foldM` []) \acc p -> do isRedundant <- UnionFind.redundant p if isRedundant then return acc else UnionFind.descriptor p <&> (: acc) return rootTrees where -- Order `allSimils` by `Similarity`, ie. -- from: {-source-} range :-> cluster :-> {-destination-} Direction (range :-> Similarity :-> Seq cluster) -- to: Similarity :-> Seq ((range, cluster), (range, cluster)) -- which will enable to add edges to the spanning tree in decreasing `Similarity` -- and hence build *a* maximal spanning tree. allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster)) allEdges = Map.unionsWith (Seq.><) [ dstSCs & Map.map (fmap \dstC -> (src, (dstR, dstC))) | (srcR, srcCRSC) <- allSimils & Map.toList , (srcC, dstRSC) <- srcCRSC & Map.toList , let src = (srcR, srcC) , -- ExplanationNote: it does not matter whether `directionDownstream` or `directionUpstream` is used, -- as they contain the very same edges and `Similarity`, just not in the same order, -- hence using any `Direction` would produce the same splitting similarities. (dstR, dstSCs) <- directionDownstream dstRSC & Map.toList ] -- | "sea-level rise algorithm" -- -- See: in `Phylomemy.References.RefDrawMeScience`, -- « C.5 The sea-level rise algorithm and its implementation in Gargantext » splitMaximalSpanningTrees :: HasCallStack => forall range roots predictionMeasure. Show range => Ord range => predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) -> roots ::: Set Root -> MaximalSpanningTrees range Cluster -> MaximalSpanningTrees range Cluster splitMaximalSpanningTrees predictionMeasure roots = loop (return proba0) [] where loop previousQuality doneBranches currentBranches = -- pTraceShow (["splitMaximalSpanningTrees", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $ case currentBranches of [] -> doneBranches currentBranch : todoBranches -> let splitBranches = splitMaximalSpanningTree currentBranch in -- pTraceShow (["splitMaximalSpanningTrees", "loop", "splitBranches"], ("size", Map.size splitBranches)) $ if List.length splitBranches <= 1 then loop previousQuality (doneBranches <> splitBranches) todoBranches else letName (fmap maximalSpanningTreeToNodes $ (doneBranches <> splitBranches) <> todoBranches) \branchToNodes -> let splitQuality = globalQuality predictionMeasure roots branchToNodes in if previousQuality < splitQuality then loop splitQuality doneBranches (splitBranches <> todoBranches) else loop previousQuality (doneBranches <> splitBranches) todoBranches splitMaximalSpanningTree :: forall range cluster. HasCallStack => Show range => Show cluster => Ord range => Ord cluster => MaximalSpanningTree range cluster -> MaximalSpanningTrees range cluster splitMaximalSpanningTree mst = case minimumSimilarity mst of Nothing -> [mst] Just minSimil -> cutMerge mst where cutMerge = uncurry (:) . cut cut :: MaximalSpanningTree range cluster -> (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster]) cut (Tree.Node node children) = let (keptChildren, cutChildren) = children & List.partition \tree -> minSimil < mstNodeSimilarity (Tree.rootLabel tree) in let cutChildrenRoots = cutChildren & List.concatMap cutMerge in let (keptChildrenTrees, cutChildrenTree) = List.unzip (cut <$> keptChildren) in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge in ( Tree.Node node keptChildrenTrees , (cutChildrenRoots List.++ cutChildrenTreeRoots) -- WarningNote: the root nodes of those new `MaximalSpanningTree`s -- keep their `mstNodeSimilarity` to their cutting value, -- which is lower than their `minimumSimilarity`. ) minimumSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity minimumSimilarity (Tree.Node _rootNode rootBranches) | List.null rootBranches = Nothing | otherwise = -- ExplanationNote: the root node of a `MaximalSpanningTree`, -- being a root node, does not have parent, -- hence its `mstNodeSimilarity` must be ignored. Just $ List.minimum $ rootBranches <&> Tree.foldTree \node accs -> List.minimum $ (mstNodeSimilarity node) : accs -- | @(`maximalSpanningTreeToNodes` branch)@ returns the nodes of the given @(branch)@. maximalSpanningTreeToNodes :: HasCallStack => Ord range => Ord cluster => MaximalSpanningTree range cluster -> Set (range, cluster) maximalSpanningTreeToNodes = foldMap' (Set.singleton . mstNodeRangeCluster) -- | @(splittingSimilarities maxSpanTrees)@ -- returns the `Similarity`s causing the given `MaximalSpanningTrees` -- to split into further more `MaximalSpanningTrees`. -- splittingSimilarities :: HasCallStack => MaximalSpanningTrees range cluster -> Set Similarity -- splittingSimilarities = foldMap' $ Map.foldMapWithKey \simil _ -> Set.singleton simil -- | The global quality of a `Phylomemy`. -- -- DescriptionNote: `globalQuality` counts the `(range, Cluster)` nodes of a branch -- but that a `(range, Cluster)` in itself can gather several documents. globalQuality :: HasCallStack => Ord range => predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) -> roots ::: Set Root -> branchToNodes ::: [Set (range, Cluster)] -> Decimal.Arith Similarity globalQuality (Named predictionMeasure) (Named roots) (Named branchToNodes) = List.sum [ probability (1 % cardinal roots) * List.sum [ probability (cardinal retrievedNodes % nodesTotal) * predictionMeasure relevantNodes retrievedNodes | retrievedNodes <- branches ] | root <- roots & toList , -- CorrectnessNote: ignore branches not containing any requested `root` let branches = branchToNodes & List.filter (any (\(_r, c) -> Set.member root c)) , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c)) ] -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes -- prédictions de la classe pertinente (bonne précision) -- en suffisamment grand nombre (bon rappel) -- sur un jeu de données cible. -- -- Tout comme la précision et le rappel, -- la F-mesure varie de 0 (plus mauvaise valeur) -- à 1 (meilleure valeur possible). -- -- ExplanationNote: https://en.wikipedia.org/wiki/F-score predictionMeasureF :: HasCallStack => Ord range => {-lambda :::-} -- | Trade-off between `precision` and `recall`. -- See https://en.wikipedia.org/wiki/Precision_and_recall -- -- > [It] predetermine[s] the desired shape of the phylomemy: a continent -- > (i.e., one large branch) or an archipelago of elements of knowledge -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s -- > discretion in light of her own expertise and research questions, which makes -- > any phylomemy an artifact of the researcher’s perception -- -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts. Similarity -> Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity predictionMeasureF lambda relevantNodes retrievedNodes | lambda == proba0 = probability precision | lambda == proba1 = probability recall | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision) where precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes recall = cardinal relevantRetrievedNodes % cardinal relevantNodes relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1 -- ExplanationNote: the `tan` is just to spread `lambda` -- Two commonly used values for β are: -- - 2, which weighs recall higher than precision, -- - and 0.5, which weighs recall lower than precision. beta = tan (lambdaDouble * pi / 2) betaSquare :: Rational betaSquare = beta ^ (2 :: Int) & toRational