1 module Phylomemy.TemporalMatching where
 
   3 -- import Debug.Pretty.Simple (pTraceShow)
 
   4 import Control.Monad (Monad (..), foldM, forM_, unless)
 
   5 import Control.Monad.ST qualified as ST
 
   6 import Data.Bool (otherwise)
 
   7 import Data.Either (fromLeft)
 
   8 import Data.Eq (Eq (..))
 
   9 import Data.Foldable (any, foldMap', toList)
 
  10 import Data.Function (($), (&), (.))
 
  11 import Data.Functor (Functor (..), (<$>), (<&>))
 
  13 import Data.List qualified as List
 
  14 import Data.Map.Strict qualified as Map
 
  15 import Data.Maybe (Maybe (..))
 
  16 import Data.Ord (Ord (..))
 
  17 import Data.Ord qualified as Ord
 
  18 import Data.Ratio (Rational, (%))
 
  19 import Data.Scientific (toBoundedRealFloat)
 
  20 import Data.Semigroup (Semigroup (..))
 
  21 import Data.Sequence (Seq)
 
  22 import Data.Sequence qualified as Seq
 
  24 import Data.Set qualified as Set
 
  25 import Data.Tree qualified as Tree
 
  26 import Data.Tuple (uncurry)
 
  27 import GHC.Stack (HasCallStack)
 
  28 import Logic hiding ((/))
 
  29 import Numeric.Decimal qualified as Decimal
 
  30 import Numeric.Probability
 
  31 import Text.Show (Show)
 
  32 import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^))
 
  34 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
 
  35 import Clustering.UnionFind.ST qualified as UnionFind
 
  36 import Phylomemy.Indexation
 
  38 type Similarity = Probability
 
  40 cardinal :: Num i => Set a -> i
 
  41 cardinal = fromIntegral . Set.size
 
  43 -- TODO: implement biased similarity from the paper
 
  44 similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity
 
  45 similarityJaccard x y =
 
  46   cardinal (Set.intersection x y) % cardinal (Set.union x y)
 
  50 -- | A `MaximalSpanningTree` (MST) is one (possibly out of many)
 
  51 -- tree spanning accross all the given `(range, cluster)` nodes,
 
  52 -- with the maximal sum of edges' `Similarity` between them.
 
  54 -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree
 
  56 -- Viewing a phylomemy as a `MaximalSpanningTree`
 
  57 -- is the crux of understanding how it is computed:
 
  59 -- - the `mstMinimalSimilarity` is the next `Similarity`
 
  60 -- that will split the `MaximalSpanningTree` into two or more `MaximalSpanningTree`s.
 
  62 -- - it explains what the "scale of a phylomemy" is:
 
  63 -- merging clusters of the same `range`
 
  64 -- when they still belong to the same `MaximalSpanningTree`.
 
  66 -- ImplementationNote: using a `Tree.Tree` to represent a `MaximalSpanningTree`
 
  67 -- (instead of an adjacency edge map for instance)
 
  68 -- is motivated by the need to implement `mstSplit`,
 
  69 -- which needs to gather the `(range, cluster)` nodes
 
  70 -- of each `MaximalSpanningTree` resulting from the cut,
 
  71 -- because knowing that is required by `msfGlobalQuality`.
 
  73 -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology"
 
  74 -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/
 
  76 -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph"
 
  77 -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952
 
  79 -- WarningNote: "The tree of one percent"
 
  80 -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf
 
  81 type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster)
 
  83 data MSTNode range cluster = MSTNode
 
  84   { mstNodeSimilarity :: Similarity
 
  85   -- ^ The `similarity` of the parent edge of this node
 
  86   -- ie. of the only edge getting closer to the root node of the `MaximalSpanningTree`.
 
  88   -- ImplementationNote: `maximalSpanningForest` puts a `Similarity` of `1` for the root node.
 
  89   -- and `mstSplit` leaves the `mstMinimalSimilarity` of the edge before splitting out the `MaximalSpanningTree`.
 
  90   , mstNodeRangeCluster :: (range, cluster)
 
  94 -- | A (disjoint) forest of `MaximalSpanningTree`s,
 
  95 -- indexed by the minimal @(range, cluster)@ node
 
  96 -- of each `MaximalSpanningTree`.
 
  97 type MaximalSpanningForest range cluster =
 
  98   [MaximalSpanningTree range cluster]
 
 100 -- | @(`maximalSpanningForest` similarity rangeToClusterToDocs)@
 
 101 -- uses the Kruskal algorithm to find the trees spanning all
 
 102 -- the given `(range, cluster)` nodes in `rangeToClusterToDocs`
 
 103 -- with the maximal total `similarity` between them.
 
 105 -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm
 
 106 maximalSpanningForest ::
 
 107   forall range cluster doc.
 
 111   {-similarityMeasure :::-} (cluster -> cluster -> Similarity) ->
 
 112   {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root doc) ->
 
 113   MaximalSpanningForest range cluster
 
 114 maximalSpanningForest similarity rangeToClusterToDocs = ST.runST do
 
 115   -- DescriptionNote: create a `Point` for each given node
 
 116   -- each Point containing a root node and a maximal spanning tree.
 
 117   rangeToClusterToPoint :: range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster) <-
 
 119       & Map.traverseWithKey \srcR ->
 
 120         Map.traverseWithKey \srcC _docs ->
 
 124                 { mstNodeSimilarity = proba1
 
 125                 , mstNodeRangeCluster = (srcR, srcC)
 
 128   -- DescriptionNote: iterate from the greatest to the lowest similarity edge.
 
 129   forM_ (allEdges & Map.toDescList) \(simil, edges) -> do
 
 130     -- Iterate through all the edges of that `Similarity`.
 
 131     forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do
 
 132       let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC
 
 133       let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC
 
 134       alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint
 
 135       unless alreadyInSameMST do
 
 136         -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`.
 
 137         UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) ->
 
 142                   { mstNodeSimilarity = simil
 
 143                   , mstNodeRangeCluster = mstNodeRangeCluster dstRoot
 
 147   rootForest :: [MaximalSpanningTree range cluster] <-
 
 148     rangeToClusterToPoint
 
 149       -- DescriptionNote: collect all the Points.
 
 151       & List.concatMap Map.elems
 
 152       -- DescriptionNote: keep only the `MaximalSpanningTree`s
 
 153       -- contained in non-redundant `Point`s.
 
 154       & (`foldM` []) \acc p -> do
 
 155         isRedundant <- UnionFind.redundant p
 
 158           else UnionFind.descriptor p <&> (: acc)
 
 161     -- Order `rangeToClusterToDocs` by `Similarity`,
 
 162     -- which will enable to add edges to the spanning tree in decreasing `Similarity`
 
 163     -- and hence build *a* maximal spanning tree.
 
 164     allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster))
 
 168         [ Map.singleton (similarity srcC dstC) (Seq.singleton (src, dst))
 
 169         | (srcR, srcClusterToDocuments) <- rangeToClusterToDocs & Map.toList
 
 170         , (srcC, _docs) <- srcClusterToDocuments & Map.toList
 
 171         , let src = (srcR, srcC)
 
 172         , -- ExplanationNote: it does not matter whether lower or greater ranges are used for the destination nodes,
 
 173         -- as they contain the very same undirected edges and thus `Similarity`,
 
 174         -- hence would produce the same splitting `Similarity`s,
 
 175         -- though not necessarily the same `MaximalSpanningTree`.
 
 176         let (_, dstRangeToClusterToDocs) = Map.split srcR rangeToClusterToDocs
 
 177         , (dstR, dstClusterToDocs) <- dstRangeToClusterToDocs & Map.toList
 
 178         , (dstC, _docs) <- dstClusterToDocs & Map.toList
 
 179         , let dst = (dstR, dstC)
 
 182 -- | "sea-level rise algorithm"
 
 184 -- See: in `Phylomemy.References.RefDrawMeScience`,
 
 185 -- « C.5 The sea-level rise algorithm and its implementation in Gargantext »
 
 187   forall range roots predictionMeasure.
 
 191   predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
 
 192   roots ::: Set Root ->
 
 193   MaximalSpanningForest range Cluster ->
 
 194   MaximalSpanningForest range Cluster
 
 195 msfPrune predictionMeasure roots =
 
 196   loop (return proba0) []
 
 199       Decimal.Arith Probability ->
 
 200       MaximalSpanningForest range Cluster ->
 
 201       MaximalSpanningForest range Cluster ->
 
 202       MaximalSpanningForest range Cluster
 
 203     loop previousQuality doneBranches currentBranches =
 
 204       -- pTraceShow (["msfPrune", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $
 
 205       case currentBranches of
 
 207         currentBranch : todoBranches ->
 
 208           let splitBranches = mstSplit currentBranch
 
 209           in -- pTraceShow (["msfPrune", "loop", "splitBranches"], ("size", Map.size splitBranches)) $
 
 210              if List.length splitBranches <= 1
 
 211               then loop previousQuality (doneBranches <> splitBranches) todoBranches
 
 212               else letName (fmap mstNodes $ (doneBranches <> splitBranches) <> todoBranches) \mstToNodes ->
 
 213                 let splitQuality = msfGlobalQuality predictionMeasure roots mstToNodes
 
 214                 in if Decimal.arithError previousQuality < Decimal.arithError splitQuality
 
 215                     then -- CorrectnessFixMe: these `Decimal.arithError` may raise "arithmetic overflow",
 
 216                     -- it has to be understood.
 
 217                       loop splitQuality doneBranches (splitBranches <> todoBranches)
 
 218                     else loop previousQuality (doneBranches <> splitBranches) todoBranches
 
 220 -- | @(`mstSplit` mst)@ returns the `MaximalSpanningTree`s
 
 221 -- resulting from pruning out the given `MaximalSpanningTree`
 
 222 -- of all the edges having the minimal `Similarity` of that `MaximalSpanningTree`.
 
 224   forall range cluster.
 
 228   MaximalSpanningTree range cluster ->
 
 229   MaximalSpanningForest range cluster
 
 231   -- TODO: take minSimil as argument to avoid recomputing it when already known
 
 232   case mstMinimalSimilarity mst of
 
 234     Just minSimil -> cutMerge mst
 
 236         cutMerge = uncurry (:) . cut
 
 238           MaximalSpanningTree range cluster ->
 
 239           (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster])
 
 240         cut (Tree.Node node children) =
 
 241           let (keptChildren, cutChildren) =
 
 242                 children & List.partition \tree ->
 
 243                   minSimil < mstNodeSimilarity (Tree.rootLabel tree)
 
 244           in let cutChildrenRoots = cutChildren & List.concatMap cutMerge
 
 245              in let (keptChildrenForest, cutChildrenTree) = List.unzip (cut <$> keptChildren)
 
 246                 in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge
 
 247                    in ( Tree.Node node keptChildrenForest
 
 248                       , (cutChildrenRoots List.++ cutChildrenTreeRoots)
 
 249                       -- WarningNote: the root nodes of those new `MaximalSpanningTree`s
 
 250                       -- keep their `mstNodeSimilarity` to their cutting value,
 
 251                       -- which is lower than their `mstMinimalSimilarity`.
 
 254 mstMinimalSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity
 
 255 mstMinimalSimilarity (Tree.Node _rootNode rootBranches)
 
 256   | List.null rootBranches = Nothing
 
 258       -- ExplanationNote: the root node of a `MaximalSpanningTree`,
 
 259       -- being a root node, does not have a parent,
 
 260       -- hence its `mstNodeSimilarity` must be ignored.
 
 264             <&> Tree.foldTree \node accs ->
 
 265               List.minimum $ (mstNodeSimilarity node) : accs
 
 267 -- | @(`mstSplittingSimilarities` mst)@
 
 268 -- returns the `Similarity`s causing the given `MaximalSpanningForest`
 
 269 -- to split into further more `MaximalSpanningForest`.
 
 270 mstSplittingSimilarities :: MaximalSpanningTree range cluster -> Set Similarity
 
 271 mstSplittingSimilarities (Tree.Node _rootNode rootBranches)
 
 272   | List.null rootBranches = Set.empty
 
 276           ( Tree.foldTree \node accs ->
 
 277               Set.unions (Set.singleton (mstNodeSimilarity node) : accs)
 
 280 type Scale = Int -- > 0
 
 282 -- | @(`mstScales` msf)@
 
 283 -- returns the successive results of calling `mstSplit` on the given `MaximalSpanningTree`s in @(msf)@
 
 284 -- until each one of them is a single node.
 
 285 -- Even though the greatest `Scale`s may be different for each `MaximalSpanningTree` of @(msf)@,
 
 286 -- the result are grouped by `Scale` then by `MaximalSpanningTree`,
 
 287 -- which enabled to have a maximal `Scale` for the whole `msf`.
 
 289   forall range cluster.
 
 292   MaximalSpanningForest range cluster ->
 
 293   Scale :-> {-mst-} Int :-> MaximalSpanningForest range cluster
 
 296     (Map.unionWith (List.++))
 
 297     [ Map.fromDistinctAscList
 
 298       [ (scale, Map.singleton mstI msf)
 
 299       | (scale, msf) <- scaleToMsf
 
 301     | (mstI, mst) <- initMSF & List.zip [1 :: Int ..]
 
 302     , let scaleToMsf :: [(Scale, MaximalSpanningForest range cluster)] =
 
 303             -- scanl :: (acc -> scale -> acc) -> acc -> [scale] -> [acc]
 
 307                 (\(scale, msf) _minSimil -> (scale + 1, msf >>= mstSplit))
 
 311     splitSimils :: Similarity :-> () =
 
 314           & Tree.foldTree \node accs ->
 
 317               (Map.singleton (mstNodeSimilarity node) () : accs)
 
 321 -- | @(`mstNodes` mst)@ returns the nodes of the given @(mst)@.
 
 326   MaximalSpanningTree range cluster ->
 
 328 mstNodes = foldMap' (Set.singleton . mstNodeRangeCluster)
 
 330 -- | The global quality of a `Phylomemy`.
 
 332 -- DescriptionNote: `msfGlobalQuality` counts the `(range, Cluster)` nodes of a branch
 
 333 -- but that a `(range, Cluster)` in itself can gather several documents.
 
 337   predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
 
 338   roots ::: Set Root ->
 
 339   mstToNodes ::: [Set (range, Cluster)] ->
 
 340   Decimal.Arith Similarity
 
 341 msfGlobalQuality (Named predictionMeasure) (Named roots) (Named mstToNodes) =
 
 343     [ probability (1 % cardinal roots)
 
 345         [ probability (cardinal retrievedNodes % nodesTotal)
 
 346           * predictionMeasure relevantNodes retrievedNodes
 
 347         | retrievedNodes <- branches
 
 349     | root <- roots & toList
 
 350     , -- CorrectnessNote: ignore branches not containing any requested `root`
 
 351     let branches = mstToNodes & List.filter (any (\(_r, c) -> Set.member root c))
 
 352     , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral
 
 353     , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c))
 
 356 -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
 
 357 -- prédictions de la classe pertinente (bonne précision)
 
 358 -- en suffisamment grand nombre (bon rappel)
 
 359 -- sur un jeu de données cible.
 
 361 -- Tout comme la précision et le rappel,
 
 362 -- la F-mesure varie de 0 (plus mauvaise valeur)
 
 363 -- à 1 (meilleure valeur possible).
 
 365 -- ExplanationNote: https://en.wikipedia.org/wiki/F-score
 
 366 predictionMeasureF ::
 
 371   -- | Trade-off between `precision` and `recall`.
 
 372   -- See https://en.wikipedia.org/wiki/Precision_and_recall
 
 374   -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
 
 375   -- > (i.e., one large branch) or an archipelago of elements of knowledge
 
 376   -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
 
 377   -- > discretion in light of her own expertise and research questions, which makes
 
 378   -- > any phylomemy an artifact of the researcher’s perception
 
 380   -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
 
 382   Set (range, Cluster) ->
 
 383   Set (range, Cluster) ->
 
 384   Decimal.Arith Similarity
 
 385 predictionMeasureF lambda relevantNodes retrievedNodes
 
 386   | lambda == proba0 = probability precision
 
 387   | lambda == proba1 = probability recall
 
 388   | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision)
 
 390     precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes
 
 391     recall = cardinal relevantRetrievedNodes % cardinal relevantNodes
 
 392     relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes
 
 393     lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1
 
 394     -- ExplanationNote: the `tan` is just to spread the `lambda ∈ [0,1]` from -∞ to +∞
 
 395     -- Two commonly used values for β are:
 
 396     -- - 2, which weighs recall higher than precision,
 
 397     -- - and 0.5, which weighs recall lower than precision.
 
 398     beta = tan (lambdaDouble * pi / 2)
 
 399     betaSquare :: Rational
 
 400     betaSquare = beta ^ (2 :: Int) & toRational