1 module Phylomemy.TemporalMatching where
3 -- import Data.Traversable (traverse)
4 -- import Debug.Pretty.Simple (pTraceShow, pTraceShowId)
5 import Control.Monad (Monad (..), foldM, forM_, unless)
6 import Control.Monad.ST qualified as ST
7 import Data.Bool (otherwise)
8 import Data.Either (fromLeft)
9 import Data.Eq (Eq (..))
10 import Data.Foldable (any, foldMap', toList)
11 import Data.Function (($), (&), (.))
12 import Data.Functor (Functor (..), (<$>), (<&>))
14 import Data.List qualified as List
15 import Data.Map.Strict qualified as Map
16 import Data.Maybe (Maybe (..))
17 import Data.Ord (Ord (..))
18 import Data.Ord qualified as Ord
19 import Data.Ratio (Rational, (%))
20 import Data.Scientific (toBoundedRealFloat)
21 import Data.Semigroup (Semigroup (..))
22 import Data.Sequence (Seq)
23 import Data.Sequence qualified as Seq
25 import Data.Set qualified as Set
26 import Data.Tree qualified as Tree
27 import Data.Tuple (uncurry)
28 import GHC.Stack (HasCallStack)
29 import Logic hiding ((/))
30 import Numeric.Decimal qualified as Decimal
31 import Numeric.Probability
32 import Text.Show (Show)
33 import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^))
35 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
36 import Clustering.UnionFind.ST qualified as UnionFind
37 import Phylomemy.Indexation
39 type Similarity = Probability
41 cardinal :: Num i => Set a -> i
42 cardinal = fromIntegral . Set.size
44 -- TODO: implement biased similarity from the paper
45 similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity
46 similarityJaccard x y =
47 cardinal (Set.intersection x y) % cardinal (Set.union x y)
51 -- | A `MaximalSpanningTree` (MST) is one (possibly out of many)
52 -- tree spanning accross all the given `(range, cluster)` nodes,
53 -- with the maximal sum of edges' `Similarity` between them.
55 -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree
57 -- Viewing a phylomemy as a `MaximalSpanningTree`
58 -- is the crux of understanding how it is computed:
60 -- - the `mstMinimalSimilarity` is the next `Similarity`
61 -- that will split the `MaximalSpanningTree` into two or more `MaximalSpanningForest`.
63 -- - it explains what the "scale of a phylomemy" is:
64 -- merging clusters of the same range and same `MaximalSpanningTree`
65 -- when they still belong to the same `MaximalSpanningTree`.
67 -- ImplementationNote: using a `Tree.Tree` to represent a `MaximalSpanningTree`
68 -- (instead of an adjacency edge map for instance)
69 -- is motivated by the need to implement `mstSplit`,
70 -- which needs to gather the `(range, cluster)` nodes
71 -- of the `MaximalSpanningForest` resulting from the cut,
72 -- which will then be filtered by `msfGlobalQuality`.
74 -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology"
75 -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/
77 -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph"
78 -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952
80 -- WarningNote: "The tree of one percent"
81 -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf
82 type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster)
84 data MSTNode range cluster = MSTNode
85 { mstNodeSimilarity :: Similarity
86 -- ^ The `similarity` of the parent edge of this node
87 -- ie. of the only edge getting closer to the root node of the `MaximalSpanningTree`.
89 -- ImplementationNote: `maximalSpanningForest` puts a `Similarity` of `1` for the root node.
90 -- and `mstSplit` leaves the `mstMinimalSimilarity` of the edge before splitting out the `MaximalSpanningTree`.
91 , mstNodeRangeCluster :: (range, cluster)
92 -- , nodeMSTMinSimilarity :: Min Similarity
96 -- | A (disjoint) forest of `MaximalSpanningTree`s,
97 -- indexed by the minimal @(range, cluster)@ node
98 -- of each `MaximalSpanningTree`.
99 type MaximalSpanningForest range cluster =
100 [MaximalSpanningTree range cluster]
102 -- | @(`maximalSpanningForest` allSimils)@
103 -- uses the Kruskal algorithm to find the maximal spanning trees
104 -- of the given `AllSimilarities`.
106 -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm
107 maximalSpanningForest ::
108 forall range cluster doc.
111 {-similarityMeasure :::-} (cluster -> cluster -> Similarity) ->
112 {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root doc) ->
113 MaximalSpanningForest range cluster
114 maximalSpanningForest similarity rangeToClusterToDocs = ST.runST do
115 -- DescriptionNote: create a `Point` for each given node
116 -- each Point containing a root node and a maximal spanning tree.
117 rangeToClusterToPoint :: range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster) <-
119 & Map.traverseWithKey \srcR ->
120 Map.traverseWithKey \srcC _docs ->
124 { mstNodeSimilarity = proba1
125 , mstNodeRangeCluster = (srcR, srcC)
128 -- DescriptionNote: iterate from the greatest to the lowest similarity edge.
129 forM_ (allEdges & Map.toDescList) \(simil, edges) -> do
130 -- Iterate through all the edges of that `Similarity`.
131 forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do
132 let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC
133 let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC
134 alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint
135 unless alreadyInSameMST do
136 -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`.
137 UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) ->
142 { mstNodeSimilarity = simil
143 , mstNodeRangeCluster = mstNodeRangeCluster dstRoot
147 rootForest :: [MaximalSpanningTree range cluster] <-
148 rangeToClusterToPoint
149 -- DescriptionNote: collect all the Points.
151 & List.concatMap Map.elems
152 -- DescriptionNote: keep only the `MaximalSpanningTree`s
153 -- contained in non-redundant `Point`s.
154 & (`foldM` []) \acc p -> do
155 isRedundant <- UnionFind.redundant p
158 else UnionFind.descriptor p <&> (: acc)
161 -- Order `rangeToClusterToDocs` by `Similarity`,
162 -- which will enable to add edges to the spanning tree in decreasing `Similarity`
163 -- and hence build *a* maximal spanning tree.
164 allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster))
168 [ Map.singleton (similarity srcC dstC) (Seq.singleton (src, dst))
169 | (srcR, srcClusterToDocuments) <- rangeToClusterToDocs & Map.toList
170 , (srcC, _docs) <- srcClusterToDocuments & Map.toList
171 , let src = (srcR, srcC)
172 , -- ExplanationNote: it does not matter whether lower or greater ranges are used for the destination nodes,
173 -- as they contain the very same undirected edges and thus `Similarity`,
174 -- hence would produce the same splitting `Similarity`s,
175 -- though not necessarily the same `MaximalSpanningTree`.
176 let (_, dstRangeToClusterToDocs) = Map.split srcR rangeToClusterToDocs
177 , (dstR, dstClusterToDocs) <- dstRangeToClusterToDocs & Map.toList
178 , (dstC, _docs) <- dstClusterToDocs & Map.toList
179 , let dst = (dstR, dstC)
182 -- | "sea-level rise algorithm"
184 -- See: in `Phylomemy.References.RefDrawMeScience`,
185 -- « C.5 The sea-level rise algorithm and its implementation in Gargantext »
188 forall range roots predictionMeasure.
191 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
192 roots ::: Set Root ->
193 MaximalSpanningForest range Cluster ->
194 MaximalSpanningForest range Cluster
195 msfSplit predictionMeasure roots =
196 loop (return proba0) []
198 loop previousQuality doneBranches currentBranches =
199 -- pTraceShow (["msfSplit", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $
200 case currentBranches of
202 currentBranch : todoBranches ->
203 let splitBranches = mstSplit currentBranch
204 in -- pTraceShow (["msfSplit", "loop", "splitBranches"], ("size", Map.size splitBranches)) $
205 if List.length splitBranches <= 1
206 then loop previousQuality (doneBranches <> splitBranches) todoBranches
207 else letName (fmap mstNodes $ (doneBranches <> splitBranches) <> todoBranches) \mstToNodes ->
208 let splitQuality = msfGlobalQuality predictionMeasure roots mstToNodes
209 in if previousQuality < splitQuality
210 then loop splitQuality doneBranches (splitBranches <> todoBranches)
211 else loop previousQuality (doneBranches <> splitBranches) todoBranches
214 forall range cluster.
220 MaximalSpanningTree range cluster ->
221 MaximalSpanningForest range cluster
223 case mstMinimalSimilarity mst of
225 Just minSimil -> cutMerge mst
227 cutMerge = uncurry (:) . cut
228 cut :: MaximalSpanningTree range cluster -> (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster])
229 cut (Tree.Node node children) =
230 let (keptChildren, cutChildren) =
231 children & List.partition \tree ->
232 minSimil < mstNodeSimilarity (Tree.rootLabel tree)
233 in let cutChildrenRoots = cutChildren & List.concatMap cutMerge
234 in let (keptChildrenForest, cutChildrenTree) = List.unzip (cut <$> keptChildren)
235 in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge
236 in ( Tree.Node node keptChildrenForest
237 , (cutChildrenRoots List.++ cutChildrenTreeRoots)
238 -- WarningNote: the root nodes of those new `MaximalSpanningTree`s
239 -- keep their `mstNodeSimilarity` to their cutting value,
240 -- which is lower than their `mstMinimalSimilarity`.
243 mstMinimalSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity
244 mstMinimalSimilarity (Tree.Node _rootNode rootBranches)
245 | List.null rootBranches = Nothing
247 -- ExplanationNote: the root node of a `MaximalSpanningTree`,
248 -- being a root node, does not have parent,
249 -- hence its `mstNodeSimilarity` must be ignored.
253 <&> Tree.foldTree \node accs ->
254 List.minimum $ (mstNodeSimilarity node) : accs
256 -- | @(`mstSplittingSimilarities` mst)@
257 -- returns the `Similarity`s causing the given `MaximalSpanningForest`
258 -- to split into further more `MaximalSpanningForest`.
259 mstSplittingSimilarities :: MaximalSpanningTree range cluster -> Set Similarity
260 mstSplittingSimilarities (Tree.Node _rootNode rootBranches)
261 | List.null rootBranches = Set.empty
262 | otherwise = rootBranches & foldMap' (Tree.foldTree \node accs -> Set.unions (Set.singleton (mstNodeSimilarity node) : accs))
264 -- | @(`mstNodes` branch)@ returns the nodes of the given @(branch)@.
269 MaximalSpanningTree range cluster ->
271 mstNodes = foldMap' (Set.singleton . mstNodeRangeCluster)
273 -- | The global quality of a `Phylomemy`.
275 -- DescriptionNote: `msfGlobalQuality` counts the `(range, Cluster)` nodes of a branch
276 -- but that a `(range, Cluster)` in itself can gather several documents.
280 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
281 roots ::: Set Root ->
282 mstToNodes ::: [Set (range, Cluster)] ->
283 Decimal.Arith Similarity
284 msfGlobalQuality (Named predictionMeasure) (Named roots) (Named mstToNodes) =
286 [ probability (1 % cardinal roots)
288 [ probability (cardinal retrievedNodes % nodesTotal)
289 * predictionMeasure relevantNodes retrievedNodes
290 | retrievedNodes <- branches
292 | root <- roots & toList
293 , -- CorrectnessNote: ignore branches not containing any requested `root`
294 let branches = mstToNodes & List.filter (any (\(_r, c) -> Set.member root c))
295 , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral
296 , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c))
299 -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
300 -- prédictions de la classe pertinente (bonne précision)
301 -- en suffisamment grand nombre (bon rappel)
302 -- sur un jeu de données cible.
304 -- Tout comme la précision et le rappel,
305 -- la F-mesure varie de 0 (plus mauvaise valeur)
306 -- à 1 (meilleure valeur possible).
308 -- ExplanationNote: https://en.wikipedia.org/wiki/F-score
309 predictionMeasureF ::
314 -- | Trade-off between `precision` and `recall`.
315 -- See https://en.wikipedia.org/wiki/Precision_and_recall
317 -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
318 -- > (i.e., one large branch) or an archipelago of elements of knowledge
319 -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
320 -- > discretion in light of her own expertise and research questions, which makes
321 -- > any phylomemy an artifact of the researcher’s perception
323 -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
325 Set (range, Cluster) ->
326 Set (range, Cluster) ->
327 Decimal.Arith Similarity
328 predictionMeasureF lambda relevantNodes retrievedNodes
329 | lambda == proba0 = probability precision
330 | lambda == proba1 = probability recall
331 | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision)
333 precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes
334 recall = cardinal relevantRetrievedNodes % cardinal relevantNodes
335 relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes
336 lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1
337 -- ExplanationNote: the `tan` is just to spread `lambda`
338 -- Two commonly used values for β are:
339 -- - 2, which weighs recall higher than precision,
340 -- - and 0.5, which weighs recall lower than precision.
341 beta = tan (lambdaDouble * pi / 2)
342 betaSquare :: Rational
343 betaSquare = beta ^ (2 :: Int) & toRational