src/Phylomemy/TemporalMatching.hs

   1 module Phylomemy.TemporalMatching where
   2
   3 -- import Data.Traversable (traverse)
   4 -- import Debug.Pretty.Simple (pTraceShow, pTraceShowId)
   5 import Control.Monad (Monad (..), foldM, forM_, unless)
   6 import Control.Monad.ST qualified as ST
   7 import Data.Bool (otherwise)
   8 import Data.Either (fromLeft)
   9 import Data.Eq (Eq (..))
  10 import Data.Foldable (any, foldMap', toList)
  11 import Data.Function (($), (&), (.))
  12 import Data.Functor (Functor (..), (<$>), (<&>))
  13 import Data.Int (Int)
  14 import Data.List qualified as List
  15 import Data.Map.Strict qualified as Map
  16 import Data.Maybe (Maybe (..))
  17 import Data.Ord (Ord (..))
  18 import Data.Ord qualified as Ord
  19 import Data.Ratio (Rational, (%))
  20 import Data.Scientific (toBoundedRealFloat)
  21 import Data.Semigroup (Semigroup (..))
  22 import Data.Sequence (Seq)
  23 import Data.Sequence qualified as Seq
  24 import Data.Set (Set)
  25 import Data.Set qualified as Set
  26 import Data.Tree qualified as Tree
  27 import Data.Tuple (uncurry)
  28 import GHC.Stack (HasCallStack)
  29 import Logic hiding ((/))
  30 import Numeric.Decimal qualified as Decimal
  31 import Numeric.Probability
  32 import Text.Show (Show)
  33 import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^))
  34
  35 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
  36 import Clustering.UnionFind.ST qualified as UnionFind
  37 import Phylomemy.Indexation
  38
  39 type Similarity = Probability
  40
  41 cardinal :: Num i => Set a -> i
  42 cardinal = fromIntegral . Set.size
  43
  44 -- TODO: implement biased similarity from the paper
  45 similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity
  46 similarityJaccard x y =
  47   cardinal (Set.intersection x y) % cardinal (Set.union x y)
  48     & probability
  49     & Decimal.arithError
  50
  51 -- | A `MaximalSpanningTree` (MST) is one (possibly out of many)
  52 -- tree spanning accross all the given `(range, cluster)` nodes,
  53 -- with the maximal sum of edges' `Similarity` between them.
  54 --
  55 -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree
  56 --
  57 -- Viewing a phylomemy as a `MaximalSpanningTree`
  58 -- is the crux of understanding how it is computed:
  59 --
  60 -- - the `mstMinimalSimilarity` is the next `Similarity`
  61 -- that will split the `MaximalSpanningTree` into two or more `MaximalSpanningForest`.
  62 --
  63 -- - it explains what the "scale of a phylomemy" is:
  64 -- merging clusters of the same range and same `MaximalSpanningTree`
  65 -- when they still belong to the same `MaximalSpanningTree`.
  66 --
  67 -- ImplementationNote: using a `Tree.Tree` to represent a `MaximalSpanningTree`
  68 -- (instead of an adjacency edge map for instance)
  69 -- is motivated by the need to implement `mstSplit`,
  70 -- which needs to gather the `(range, cluster)` nodes
  71 -- of the `MaximalSpanningForest` resulting from the cut,
  72 -- which will then be filtered by `msfGlobalQuality`.
  73 --
  74 -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology"
  75 -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/
  76 --
  77 -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph"
  78 -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952
  79 --
  80 -- WarningNote: "The tree of one percent"
  81 -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf
  82 type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster)
  83
  84 data MSTNode range cluster = MSTNode
  85   { mstNodeSimilarity :: Similarity
  86   -- ^ The `similarity` of the parent edge of this node
  87   -- ie. of the only edge getting closer to the root node of the `MaximalSpanningTree`.
  88   --
  89   -- ImplementationNote: `maximalSpanningForest` puts a `Similarity` of `1` for the root node.
  90   -- and `mstSplit` leaves the `mstMinimalSimilarity` of the edge before splitting out the `MaximalSpanningTree`.
  91   , mstNodeRangeCluster :: (range, cluster)
  92   -- , nodeMSTMinSimilarity :: Min Similarity
  93   }
  94   deriving (Show)
  95
  96 -- | A (disjoint) forest of `MaximalSpanningTree`s,
  97 -- indexed by the minimal @(range, cluster)@ node
  98 -- of each `MaximalSpanningTree`.
  99 type MaximalSpanningForest range cluster =
 100   [MaximalSpanningTree range cluster]
 101
 102 -- | @(`maximalSpanningForest` allSimils)@
 103 -- uses the Kruskal algorithm to find the maximal spanning trees
 104 -- of the given `AllSimilarities`.
 105 --
 106 -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm
 107 maximalSpanningForest ::
 108   forall range cluster doc.
 109   Ord range =>
 110   Ord cluster =>
 111   {-similarityMeasure :::-} (cluster -> cluster -> Similarity) ->
 112   {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root doc) ->
 113   MaximalSpanningForest range cluster
 114 maximalSpanningForest similarity rangeToClusterToDocs = ST.runST do
 115   -- DescriptionNote: create a `Point` for each given node
 116   -- each Point containing a root node and a maximal spanning tree.
 117   rangeToClusterToPoint :: range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster) <-
 118     rangeToClusterToDocs
 119       & Map.traverseWithKey \srcR ->
 120         Map.traverseWithKey \srcC _docs ->
 121           UnionFind.fresh $
 122             Tree.Node
 123               MSTNode
 124                 { mstNodeSimilarity = proba1
 125                 , mstNodeRangeCluster = (srcR, srcC)
 126                 }
 127               []
 128   -- DescriptionNote: iterate from the greatest to the lowest similarity edge.
 129   forM_ (allEdges & Map.toDescList) \(simil, edges) -> do
 130     -- Iterate through all the edges of that `Similarity`.
 131     forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do
 132       let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC
 133       let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC
 134       alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint
 135       unless alreadyInSameMST do
 136         -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`.
 137         UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) ->
 138           return $
 139             Tree.Node srcRoot $
 140               Tree.Node
 141                 MSTNode
 142                   { mstNodeSimilarity = simil
 143                   , mstNodeRangeCluster = mstNodeRangeCluster dstRoot
 144                   }
 145                 dstForest
 146                 : srcForest
 147   rootForest :: [MaximalSpanningTree range cluster] <-
 148     rangeToClusterToPoint
 149       -- DescriptionNote: collect all the Points.
 150       & Map.elems
 151       & List.concatMap Map.elems
 152       -- DescriptionNote: keep only the `MaximalSpanningTree`s
 153       -- contained in non-redundant `Point`s.
 154       & (`foldM` []) \acc p -> do
 155         isRedundant <- UnionFind.redundant p
 156         if isRedundant
 157           then return acc
 158           else UnionFind.descriptor p <&> (: acc)
 159   return rootForest
 160   where
 161     -- Order `rangeToClusterToDocs` by `Similarity`,
 162     -- which will enable to add edges to the spanning tree in decreasing `Similarity`
 163     -- and hence build *a* maximal spanning tree.
 164     allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster))
 165     allEdges =
 166       Map.unionsWith
 167         (Seq.><)
 168         [ Map.singleton (similarity srcC dstC) (Seq.singleton (src, dst))
 169         | (srcR, srcClusterToDocuments) <- rangeToClusterToDocs & Map.toList
 170         , (srcC, _docs) <- srcClusterToDocuments & Map.toList
 171         , let src = (srcR, srcC)
 172         , -- ExplanationNote: it does not matter whether lower or greater ranges are used for the destination nodes,
 173         -- as they contain the very same undirected edges and thus `Similarity`,
 174         -- hence would produce the same splitting `Similarity`s,
 175         -- though not necessarily the same `MaximalSpanningTree`.
 176         let (_, dstRangeToClusterToDocs) = Map.split srcR rangeToClusterToDocs
 177         , (dstR, dstClusterToDocs) <- dstRangeToClusterToDocs & Map.toList
 178         , (dstC, _docs) <- dstClusterToDocs & Map.toList
 179         , let dst = (dstR, dstC)
 180         ]
 181
 182 -- | "sea-level rise algorithm"
 183 --
 184 -- See: in `Phylomemy.References.RefDrawMeScience`,
 185 -- « C.5 The sea-level rise algorithm and its implementation in Gargantext »
 186 msfSplit ::
 187   HasCallStack =>
 188   forall range roots predictionMeasure.
 189   Show range =>
 190   Ord range =>
 191   predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
 192   roots ::: Set Root ->
 193   MaximalSpanningForest range Cluster ->
 194   MaximalSpanningForest range Cluster
 195 msfSplit predictionMeasure roots =
 196   loop (return proba0) []
 197   where
 198     loop previousQuality doneBranches currentBranches =
 199       -- pTraceShow (["msfSplit", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $
 200       case currentBranches of
 201         [] -> doneBranches
 202         currentBranch : todoBranches ->
 203           let splitBranches = mstSplit currentBranch
 204           in -- pTraceShow (["msfSplit", "loop", "splitBranches"], ("size", Map.size splitBranches)) $
 205              if List.length splitBranches <= 1
 206               then loop previousQuality (doneBranches <> splitBranches) todoBranches
 207               else letName (fmap mstNodes $ (doneBranches <> splitBranches) <> todoBranches) \mstToNodes ->
 208                 let splitQuality = msfGlobalQuality predictionMeasure roots mstToNodes
 209                 in if previousQuality < splitQuality
 210                     then loop splitQuality doneBranches (splitBranches <> todoBranches)
 211                     else loop previousQuality (doneBranches <> splitBranches) todoBranches
 212
 213 mstSplit ::
 214   forall range cluster.
 215   HasCallStack =>
 216   Show range =>
 217   Show cluster =>
 218   Ord range =>
 219   Ord cluster =>
 220   MaximalSpanningTree range cluster ->
 221   MaximalSpanningForest range cluster
 222 mstSplit mst =
 223   case mstMinimalSimilarity mst of
 224     Nothing -> [mst]
 225     Just minSimil -> cutMerge mst
 226       where
 227         cutMerge = uncurry (:) . cut
 228         cut :: MaximalSpanningTree range cluster -> (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster])
 229         cut (Tree.Node node children) =
 230           let (keptChildren, cutChildren) =
 231                 children & List.partition \tree ->
 232                   minSimil < mstNodeSimilarity (Tree.rootLabel tree)
 233           in let cutChildrenRoots = cutChildren & List.concatMap cutMerge
 234              in let (keptChildrenForest, cutChildrenTree) = List.unzip (cut <$> keptChildren)
 235                 in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge
 236                    in ( Tree.Node node keptChildrenForest
 237                       , (cutChildrenRoots List.++ cutChildrenTreeRoots)
 238                       -- WarningNote: the root nodes of those new `MaximalSpanningTree`s
 239                       -- keep their `mstNodeSimilarity` to their cutting value,
 240                       -- which is lower than their `mstMinimalSimilarity`.
 241                       )
 242
 243 mstMinimalSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity
 244 mstMinimalSimilarity (Tree.Node _rootNode rootBranches)
 245   | List.null rootBranches = Nothing
 246   | otherwise =
 247       -- ExplanationNote: the root node of a `MaximalSpanningTree`,
 248       -- being a root node, does not have parent,
 249       -- hence its `mstNodeSimilarity` must be ignored.
 250       Just $
 251         List.minimum $
 252           rootBranches
 253             <&> Tree.foldTree \node accs ->
 254               List.minimum $ (mstNodeSimilarity node) : accs
 255
 256 -- | @(`mstSplittingSimilarities` mst)@
 257 -- returns the `Similarity`s causing the given `MaximalSpanningForest`
 258 -- to split into further more `MaximalSpanningForest`.
 259 mstSplittingSimilarities :: MaximalSpanningTree range cluster -> Set Similarity
 260 mstSplittingSimilarities (Tree.Node _rootNode rootBranches)
 261   | List.null rootBranches = Set.empty
 262   | otherwise = rootBranches & foldMap' (Tree.foldTree \node accs -> Set.unions (Set.singleton (mstNodeSimilarity node) : accs))
 263
 264 -- | @(`mstNodes` branch)@ returns the nodes of the given @(branch)@.
 265 mstNodes ::
 266   HasCallStack =>
 267   Ord range =>
 268   Ord cluster =>
 269   MaximalSpanningTree range cluster ->
 270   Set (range, cluster)
 271 mstNodes = foldMap' (Set.singleton . mstNodeRangeCluster)
 272
 273 -- | The global quality of a `Phylomemy`.
 274 --
 275 -- DescriptionNote: `msfGlobalQuality` counts the `(range, Cluster)` nodes of a branch
 276 -- but that a `(range, Cluster)` in itself can gather several documents.
 277 msfGlobalQuality ::
 278   HasCallStack =>
 279   Ord range =>
 280   predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
 281   roots ::: Set Root ->
 282   mstToNodes ::: [Set (range, Cluster)] ->
 283   Decimal.Arith Similarity
 284 msfGlobalQuality (Named predictionMeasure) (Named roots) (Named mstToNodes) =
 285   List.sum
 286     [ probability (1 % cardinal roots)
 287       * List.sum
 288         [ probability (cardinal retrievedNodes % nodesTotal)
 289           * predictionMeasure relevantNodes retrievedNodes
 290         | retrievedNodes <- branches
 291         ]
 292     | root <- roots & toList
 293     , -- CorrectnessNote: ignore branches not containing any requested `root`
 294     let branches = mstToNodes & List.filter (any (\(_r, c) -> Set.member root c))
 295     , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral
 296     , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c))
 297     ]
 298
 299 -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
 300 -- prédictions de la classe pertinente (bonne précision)
 301 -- en suffisamment grand nombre (bon rappel)
 302 -- sur un jeu de données cible.
 303 --
 304 -- Tout comme la précision et le rappel,
 305 -- la F-mesure varie de 0 (plus mauvaise valeur)
 306 -- à 1 (meilleure valeur possible).
 307 --
 308 -- ExplanationNote: https://en.wikipedia.org/wiki/F-score
 309 predictionMeasureF ::
 310   HasCallStack =>
 311   Ord range =>
 312   {-lambda :::-}
 313
 314   -- | Trade-off between `precision` and `recall`.
 315   -- See https://en.wikipedia.org/wiki/Precision_and_recall
 316   --
 317   -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
 318   -- > (i.e., one large branch) or an archipelago of elements of knowledge
 319   -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
 320   -- > discretion in light of her own expertise and research questions, which makes
 321   -- > any phylomemy an artifact of the researcher’s perception
 322   --
 323   -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
 324   Similarity ->
 325   Set (range, Cluster) ->
 326   Set (range, Cluster) ->
 327   Decimal.Arith Similarity
 328 predictionMeasureF lambda relevantNodes retrievedNodes
 329   | lambda == proba0 = probability precision
 330   | lambda == proba1 = probability recall
 331   | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision)
 332   where
 333     precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes
 334     recall = cardinal relevantRetrievedNodes % cardinal relevantNodes
 335     relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes
 336     lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1
 337     -- ExplanationNote: the `tan` is just to spread `lambda`
 338     -- Two commonly used values for β are:
 339     -- - 2, which weighs recall higher than precision,
 340     -- - and 0.5, which weighs recall lower than precision.
 341     beta = tan (lambdaDouble * pi / 2)
 342     betaSquare :: Rational
 343     betaSquare = beta ^ (2 :: Int) & toRational