src/Phylomemy/TemporalMatching.hs

   1 module Phylomemy.TemporalMatching where
   2
   3 -- import Debug.Pretty.Simple (pTraceShow)
   4 import Control.Monad (Monad (..), foldM, forM_, unless)
   5 import Control.Monad.ST qualified as ST
   6 import Data.Bool (otherwise)
   7 import Data.Either (fromLeft)
   8 import Data.Eq (Eq (..))
   9 import Data.Foldable (any, foldMap', toList)
  10 import Data.Function (($), (&), (.))
  11 import Data.Functor (Functor (..), (<$>), (<&>))
  12 import Data.Int (Int)
  13 import Data.List qualified as List
  14 import Data.Map.Strict qualified as Map
  15 import Data.Maybe (Maybe (..))
  16 import Data.Ord (Ord (..))
  17 import Data.Ord qualified as Ord
  18 import Data.Ratio (Rational, (%))
  19 import Data.Scientific (toBoundedRealFloat)
  20 import Data.Semigroup (Semigroup (..))
  21 import Data.Sequence (Seq)
  22 import Data.Sequence qualified as Seq
  23 import Data.Set (Set)
  24 import Data.Set qualified as Set
  25 import Data.Tree qualified as Tree
  26 import Data.Tuple (uncurry)
  27 import GHC.Stack (HasCallStack)
  28 import Logic hiding ((/))
  29 import Numeric.Decimal qualified as Decimal
  30 import Numeric.Probability
  31 import Text.Show (Show)
  32 import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^))
  33
  34 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
  35 import Clustering.UnionFind.ST qualified as UnionFind
  36 import Phylomemy.Indexation
  37
  38 type Similarity = Probability
  39
  40 cardinal :: Num i => Set a -> i
  41 cardinal = fromIntegral . Set.size
  42
  43 -- TODO: implement biased similarity from the paper
  44 similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity
  45 similarityJaccard x y =
  46   cardinal (Set.intersection x y) % cardinal (Set.union x y)
  47     & probability
  48     & Decimal.arithError
  49
  50 -- | A `MaximalSpanningTree` (MST) is one (possibly out of many)
  51 -- tree spanning accross all the given `(range, cluster)` nodes,
  52 -- with the maximal sum of edges' `Similarity` between them.
  53 --
  54 -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree
  55 --
  56 -- Viewing a phylomemy as a `MaximalSpanningTree`
  57 -- is the crux of understanding how it is computed:
  58 --
  59 -- - the `mstMinimalSimilarity` is the next `Similarity`
  60 -- that will split the `MaximalSpanningTree` into two or more `MaximalSpanningTree`s.
  61 --
  62 -- - it explains what the "scale of a phylomemy" is:
  63 -- merging clusters of the same `range`
  64 -- when they still belong to the same `MaximalSpanningTree`.
  65 --
  66 -- ImplementationNote: using a `Tree.Tree` to represent a `MaximalSpanningTree`
  67 -- (instead of an adjacency edge map for instance)
  68 -- is motivated by the need to implement `mstSplit`,
  69 -- which needs to gather the `(range, cluster)` nodes
  70 -- of each `MaximalSpanningTree` resulting from the cut,
  71 -- because knowing that is required by `msfGlobalQuality`.
  72 --
  73 -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology"
  74 -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/
  75 --
  76 -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph"
  77 -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952
  78 --
  79 -- WarningNote: "The tree of one percent"
  80 -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf
  81 type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster)
  82
  83 data MSTNode range cluster = MSTNode
  84   { mstNodeSimilarity :: Similarity
  85   -- ^ The `similarity` of the parent edge of this node
  86   -- ie. of the only edge getting closer to the root node of the `MaximalSpanningTree`.
  87   --
  88   -- ImplementationNote: `maximalSpanningForest` puts a `Similarity` of `1` for the root node.
  89   -- and `mstSplit` leaves the `mstMinimalSimilarity` of the edge before splitting out the `MaximalSpanningTree`.
  90   , mstNodeRangeCluster :: (range, cluster)
  91   }
  92   deriving (Show)
  93
  94 -- | A (disjoint) forest of `MaximalSpanningTree`s,
  95 -- indexed by the minimal @(range, cluster)@ node
  96 -- of each `MaximalSpanningTree`.
  97 type MaximalSpanningForest range cluster =
  98   [MaximalSpanningTree range cluster]
  99
 100 -- | @(`maximalSpanningForest` similarity rangeToClusterToDocs)@
 101 -- uses the Kruskal algorithm to find the trees spanning all
 102 -- the given `(range, cluster)` nodes in `rangeToClusterToDocs`
 103 -- with the maximal total `similarity` between them.
 104 --
 105 -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm
 106 maximalSpanningForest ::
 107   forall range cluster doc.
 108   HasCallStack =>
 109   Ord range =>
 110   Ord cluster =>
 111   {-similarityMeasure :::-} (cluster -> cluster -> Similarity) ->
 112   {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root doc) ->
 113   MaximalSpanningForest range cluster
 114 maximalSpanningForest similarity rangeToClusterToDocs = ST.runST do
 115   -- DescriptionNote: create a `Point` for each given node
 116   -- each Point containing a root node and a maximal spanning tree.
 117   rangeToClusterToPoint :: range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster) <-
 118     rangeToClusterToDocs
 119       & Map.traverseWithKey \srcR ->
 120         Map.traverseWithKey \srcC _docs ->
 121           UnionFind.fresh $
 122             Tree.Node
 123               MSTNode
 124                 { mstNodeSimilarity = proba1
 125                 , mstNodeRangeCluster = (srcR, srcC)
 126                 }
 127               []
 128   -- DescriptionNote: iterate from the greatest to the lowest similarity edge.
 129   forM_ (allEdges & Map.toDescList) \(simil, edges) -> do
 130     -- Iterate through all the edges of that `Similarity`.
 131     forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do
 132       let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC
 133       let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC
 134       alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint
 135       unless alreadyInSameMST do
 136         -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`.
 137         UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) ->
 138           return $
 139             Tree.Node srcRoot $
 140               Tree.Node
 141                 MSTNode
 142                   { mstNodeSimilarity = simil
 143                   , mstNodeRangeCluster = mstNodeRangeCluster dstRoot
 144                   }
 145                 dstForest
 146                 : srcForest
 147   rootForest :: [MaximalSpanningTree range cluster] <-
 148     rangeToClusterToPoint
 149       -- DescriptionNote: collect all the Points.
 150       & Map.elems
 151       & List.concatMap Map.elems
 152       -- DescriptionNote: keep only the `MaximalSpanningTree`s
 153       -- contained in non-redundant `Point`s.
 154       & (`foldM` []) \acc p -> do
 155         isRedundant <- UnionFind.redundant p
 156         if isRedundant
 157           then return acc
 158           else UnionFind.descriptor p <&> (: acc)
 159   return rootForest
 160   where
 161     -- Order `rangeToClusterToDocs` by `Similarity`,
 162     -- which will enable to add edges to the spanning tree in decreasing `Similarity`
 163     -- and hence build *a* maximal spanning tree.
 164     allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster))
 165     allEdges =
 166       Map.unionsWith
 167         (Seq.><)
 168         [ Map.singleton (similarity srcC dstC) (Seq.singleton (src, dst))
 169         | (srcR, srcClusterToDocuments) <- rangeToClusterToDocs & Map.toList
 170         , (srcC, _docs) <- srcClusterToDocuments & Map.toList
 171         , let src = (srcR, srcC)
 172         , -- ExplanationNote: it does not matter whether lower or greater ranges are used for the destination nodes,
 173         -- as they contain the very same undirected edges and thus `Similarity`,
 174         -- hence would produce the same splitting `Similarity`s,
 175         -- though not necessarily the same `MaximalSpanningTree`.
 176         let (_, dstRangeToClusterToDocs) = Map.split srcR rangeToClusterToDocs
 177         , (dstR, dstClusterToDocs) <- dstRangeToClusterToDocs & Map.toList
 178         , (dstC, _docs) <- dstClusterToDocs & Map.toList
 179         , let dst = (dstR, dstC)
 180         ]
 181
 182 -- | "sea-level rise algorithm"
 183 --
 184 -- See: in `Phylomemy.References.RefDrawMeScience`,
 185 -- « C.5 The sea-level rise algorithm and its implementation in Gargantext »
 186 msfPrune ::
 187   forall range roots predictionMeasure.
 188   HasCallStack =>
 189   Show range =>
 190   Ord range =>
 191   predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
 192   roots ::: Set Root ->
 193   MaximalSpanningForest range Cluster ->
 194   MaximalSpanningForest range Cluster
 195 msfPrune predictionMeasure roots =
 196   loop (return proba0) []
 197   where
 198     loop ::
 199       Decimal.Arith Probability ->
 200       MaximalSpanningForest range Cluster ->
 201       MaximalSpanningForest range Cluster ->
 202       MaximalSpanningForest range Cluster
 203     loop previousQuality doneBranches currentBranches =
 204       -- pTraceShow (["msfPrune", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $
 205       case currentBranches of
 206         [] -> doneBranches
 207         currentBranch : todoBranches ->
 208           let splitBranches = mstSplit currentBranch
 209           in -- pTraceShow (["msfPrune", "loop", "splitBranches"], ("size", Map.size splitBranches)) $
 210              if List.length splitBranches <= 1
 211               then loop previousQuality (doneBranches <> splitBranches) todoBranches
 212               else letName (fmap mstNodes $ (doneBranches <> splitBranches) <> todoBranches) \mstToNodes ->
 213                 let splitQuality = msfGlobalQuality predictionMeasure roots mstToNodes
 214                 in if Decimal.arithError previousQuality < Decimal.arithError splitQuality
 215                     then -- CorrectnessFixMe: these `Decimal.arithError` may raise "arithmetic overflow",
 216                     -- it has to be understood.
 217                       loop splitQuality doneBranches (splitBranches <> todoBranches)
 218                     else loop previousQuality (doneBranches <> splitBranches) todoBranches
 219
 220 -- | @(`mstSplit` mst)@ returns the `MaximalSpanningTree`s
 221 -- resulting from pruning out the given `MaximalSpanningTree`
 222 -- of all the edges having the minimal `Similarity` of that `MaximalSpanningTree`.
 223 mstSplit ::
 224   forall range cluster.
 225   HasCallStack =>
 226   Ord range =>
 227   Ord cluster =>
 228   MaximalSpanningTree range cluster ->
 229   MaximalSpanningForest range cluster
 230 mstSplit mst =
 231   -- TODO: take minSimil as argument to avoid recomputing it when already known
 232   case mstMinimalSimilarity mst of
 233     Nothing -> [mst]
 234     Just minSimil -> cutMerge mst
 235       where
 236         cutMerge = uncurry (:) . cut
 237         cut ::
 238           MaximalSpanningTree range cluster ->
 239           (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster])
 240         cut (Tree.Node node children) =
 241           let (keptChildren, cutChildren) =
 242                 children & List.partition \tree ->
 243                   minSimil < mstNodeSimilarity (Tree.rootLabel tree)
 244           in let cutChildrenRoots = cutChildren & List.concatMap cutMerge
 245              in let (keptChildrenForest, cutChildrenTree) = List.unzip (cut <$> keptChildren)
 246                 in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge
 247                    in ( Tree.Node node keptChildrenForest
 248                       , (cutChildrenRoots List.++ cutChildrenTreeRoots)
 249                       -- WarningNote: the root nodes of those new `MaximalSpanningTree`s
 250                       -- keep their `mstNodeSimilarity` to their cutting value,
 251                       -- which is lower than their `mstMinimalSimilarity`.
 252                       )
 253
 254 mstMinimalSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity
 255 mstMinimalSimilarity (Tree.Node _rootNode rootBranches)
 256   | List.null rootBranches = Nothing
 257   | otherwise =
 258       -- ExplanationNote: the root node of a `MaximalSpanningTree`,
 259       -- being a root node, does not have a parent,
 260       -- hence its `mstNodeSimilarity` must be ignored.
 261       Just $
 262         List.minimum $
 263           rootBranches
 264             <&> Tree.foldTree \node accs ->
 265               List.minimum $ (mstNodeSimilarity node) : accs
 266
 267 -- | @(`mstSplittingSimilarities` mst)@
 268 -- returns the `Similarity`s causing the given `MaximalSpanningForest`
 269 -- to split into further more `MaximalSpanningForest`.
 270 mstSplittingSimilarities :: MaximalSpanningTree range cluster -> Set Similarity
 271 mstSplittingSimilarities (Tree.Node _rootNode rootBranches)
 272   | List.null rootBranches = Set.empty
 273   | otherwise =
 274       rootBranches
 275         & foldMap'
 276           ( Tree.foldTree \node accs ->
 277               Set.unions (Set.singleton (mstNodeSimilarity node) : accs)
 278           )
 279
 280 type Scale = Int -- > 0
 281
 282 -- | @(`mstScales` msf)@
 283 -- returns the successive results of calling `mstSplit` on the given `MaximalSpanningTree`s in @(msf)@
 284 -- until each one of them is a single node.
 285 -- Even though the greatest `Scale`s may be different for each `MaximalSpanningTree` of @(msf)@,
 286 -- the result are grouped by `Scale` then by `MaximalSpanningTree`,
 287 -- which enabled to have a maximal `Scale` for the whole `msf`.
 288 mstScales ::
 289   forall range cluster.
 290   Ord range =>
 291   Ord cluster =>
 292   MaximalSpanningForest range cluster ->
 293   Scale :-> {-mst-} Int :-> MaximalSpanningForest range cluster
 294 mstScales initMSF =
 295   Map.unionsWith
 296     (Map.unionWith (List.++))
 297     [ Map.fromDistinctAscList
 298       [ (scale, Map.singleton mstI msf)
 299       | (scale, msf) <- scaleToMsf
 300       ]
 301     | (mstI, mst) <- initMSF & List.zip [1 :: Int ..]
 302     , let scaleToMsf :: [(Scale, MaximalSpanningForest range cluster)] =
 303             -- scanl :: (acc -> scale -> acc) -> acc -> [scale] -> [acc]
 304             splitSimils
 305               & Map.keys
 306               & List.scanl'
 307                 (\(scale, msf) _minSimil -> (scale + 1, msf >>= mstSplit))
 308                 (1, [mst])
 309     ]
 310   where
 311     splitSimils :: Similarity :-> () =
 312       Map.unions
 313         [ mst
 314           & Tree.foldTree \node accs ->
 315             Map.unionsWith
 316               (\() () -> ())
 317               (Map.singleton (mstNodeSimilarity node) () : accs)
 318         | mst <- initMSF
 319         ]
 320
 321 -- | @(`mstNodes` mst)@ returns the nodes of the given @(mst)@.
 322 mstNodes ::
 323   HasCallStack =>
 324   Ord range =>
 325   Ord cluster =>
 326   MaximalSpanningTree range cluster ->
 327   Set (range, cluster)
 328 mstNodes = foldMap' (Set.singleton . mstNodeRangeCluster)
 329
 330 -- | The global quality of a `Phylomemy`.
 331 --
 332 -- DescriptionNote: `msfGlobalQuality` counts the `(range, Cluster)` nodes of a branch
 333 -- but that a `(range, Cluster)` in itself can gather several documents.
 334 msfGlobalQuality ::
 335   HasCallStack =>
 336   Ord range =>
 337   predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
 338   roots ::: Set Root ->
 339   mstToNodes ::: [Set (range, Cluster)] ->
 340   Decimal.Arith Similarity
 341 msfGlobalQuality (Named predictionMeasure) (Named roots) (Named mstToNodes) =
 342   List.sum
 343     [ probability (1 % cardinal roots)
 344       * List.sum
 345         [ probability (cardinal retrievedNodes % nodesTotal)
 346           * predictionMeasure relevantNodes retrievedNodes
 347         | retrievedNodes <- branches
 348         ]
 349     | root <- roots & toList
 350     , -- CorrectnessNote: ignore branches not containing any requested `root`
 351     let branches = mstToNodes & List.filter (any (\(_r, c) -> Set.member root c))
 352     , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral
 353     , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c))
 354     ]
 355
 356 -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
 357 -- prédictions de la classe pertinente (bonne précision)
 358 -- en suffisamment grand nombre (bon rappel)
 359 -- sur un jeu de données cible.
 360 --
 361 -- Tout comme la précision et le rappel,
 362 -- la F-mesure varie de 0 (plus mauvaise valeur)
 363 -- à 1 (meilleure valeur possible).
 364 --
 365 -- ExplanationNote: https://en.wikipedia.org/wiki/F-score
 366 predictionMeasureF ::
 367   HasCallStack =>
 368   Ord range =>
 369   {-lambda :::-}
 370
 371   -- | Trade-off between `precision` and `recall`.
 372   -- See https://en.wikipedia.org/wiki/Precision_and_recall
 373   --
 374   -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
 375   -- > (i.e., one large branch) or an archipelago of elements of knowledge
 376   -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
 377   -- > discretion in light of her own expertise and research questions, which makes
 378   -- > any phylomemy an artifact of the researcher’s perception
 379   --
 380   -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
 381   Similarity ->
 382   Set (range, Cluster) ->
 383   Set (range, Cluster) ->
 384   Decimal.Arith Similarity
 385 predictionMeasureF lambda relevantNodes retrievedNodes
 386   | lambda == proba0 = probability precision
 387   | lambda == proba1 = probability recall
 388   | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision)
 389   where
 390     precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes
 391     recall = cardinal relevantRetrievedNodes % cardinal relevantNodes
 392     relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes
 393     lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1
 394     -- ExplanationNote: the `tan` is just to spread the `lambda ∈ [0,1]` from -∞ to +∞
 395     -- Two commonly used values for β are:
 396     -- - 2, which weighs recall higher than precision,
 397     -- - and 0.5, which weighs recall lower than precision.
 398     beta = tan (lambdaDouble * pi / 2)
 399     betaSquare :: Rational
 400     betaSquare = beta ^ (2 :: Int) & toRational