1 module Phylomemy.TemporalMatching where
3 -- import Debug.Pretty.Simple (pTraceShow)
4 import Control.Monad (Monad (..), foldM, forM_, unless)
5 import Control.Monad.ST qualified as ST
6 import Data.Bool (otherwise)
7 import Data.Either (fromLeft)
8 import Data.Eq (Eq (..))
9 import Data.Foldable (any, foldMap', toList)
10 import Data.Function (($), (&), (.))
11 import Data.Functor (Functor (..), (<$>), (<&>))
13 import Data.List qualified as List
14 import Data.Map.Strict qualified as Map
15 import Data.Maybe (Maybe (..))
16 import Data.Ord (Ord (..))
17 import Data.Ord qualified as Ord
18 import Data.Ratio (Rational, (%))
19 import Data.Scientific (toBoundedRealFloat)
20 import Data.Semigroup (Semigroup (..))
21 import Data.Sequence (Seq)
22 import Data.Sequence qualified as Seq
24 import Data.Set qualified as Set
25 import Data.Tree qualified as Tree
26 import Data.Tuple (uncurry)
27 import GHC.Stack (HasCallStack)
28 import Logic hiding ((/))
29 import Numeric.Decimal qualified as Decimal
30 import Numeric.Probability
31 import Text.Show (Show)
32 import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^))
34 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
35 import Clustering.UnionFind.ST qualified as UnionFind
36 import Phylomemy.Indexation
38 type Similarity = Probability
40 cardinal :: Num i => Set a -> i
41 cardinal = fromIntegral . Set.size
43 -- TODO: implement biased similarity from the paper
44 similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity
45 similarityJaccard x y =
46 cardinal (Set.intersection x y) % cardinal (Set.union x y)
50 -- | A `MaximalSpanningTree` (MST) is one (possibly out of many)
51 -- tree spanning accross all the given `(range, cluster)` nodes,
52 -- with the maximal sum of edges' `Similarity` between them.
54 -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree
56 -- Viewing a phylomemy as a `MaximalSpanningTree`
57 -- is the crux of understanding how it is computed:
59 -- - the `mstMinimalSimilarity` is the next `Similarity`
60 -- that will split the `MaximalSpanningTree` into two or more `MaximalSpanningTree`s.
62 -- - it explains what the "scale of a phylomemy" is:
63 -- merging clusters of the same `range`
64 -- when they still belong to the same `MaximalSpanningTree`.
66 -- ImplementationNote: using a `Tree.Tree` to represent a `MaximalSpanningTree`
67 -- (instead of an adjacency edge map for instance)
68 -- is motivated by the need to implement `mstSplit`,
69 -- which needs to gather the `(range, cluster)` nodes
70 -- of each `MaximalSpanningTree` resulting from the cut,
71 -- because knowing that is required by `msfGlobalQuality`.
73 -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology"
74 -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/
76 -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph"
77 -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952
79 -- WarningNote: "The tree of one percent"
80 -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf
81 type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster)
83 data MSTNode range cluster = MSTNode
84 { mstNodeSimilarity :: Similarity
85 -- ^ The `similarity` of the parent edge of this node
86 -- ie. of the only edge getting closer to the root node of the `MaximalSpanningTree`.
88 -- ImplementationNote: `maximalSpanningForest` puts a `Similarity` of `1` for the root node.
89 -- and `mstSplit` leaves the `mstMinimalSimilarity` of the edge before splitting out the `MaximalSpanningTree`.
90 , mstNodeRangeCluster :: (range, cluster)
94 -- | A (disjoint) forest of `MaximalSpanningTree`s,
95 -- indexed by the minimal @(range, cluster)@ node
96 -- of each `MaximalSpanningTree`.
97 type MaximalSpanningForest range cluster =
98 [MaximalSpanningTree range cluster]
100 -- | @(`maximalSpanningForest` similarity rangeToClusterToDocs)@
101 -- uses the Kruskal algorithm to find the trees spanning all
102 -- the given `(range, cluster)` nodes in `rangeToClusterToDocs`
103 -- with the maximal total `similarity` between them.
105 -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm
106 maximalSpanningForest ::
107 forall range cluster doc.
111 {-similarityMeasure :::-} (cluster -> cluster -> Similarity) ->
112 {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root doc) ->
113 MaximalSpanningForest range cluster
114 maximalSpanningForest similarity rangeToClusterToDocs = ST.runST do
115 -- DescriptionNote: create a `Point` for each given node
116 -- each Point containing a root node and a maximal spanning tree.
117 rangeToClusterToPoint :: range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster) <-
119 & Map.traverseWithKey \srcR ->
120 Map.traverseWithKey \srcC _docs ->
124 { mstNodeSimilarity = proba1
125 , mstNodeRangeCluster = (srcR, srcC)
128 -- DescriptionNote: iterate from the greatest to the lowest similarity edge.
129 forM_ (allEdges & Map.toDescList) \(simil, edges) -> do
130 -- Iterate through all the edges of that `Similarity`.
131 forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do
132 let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC
133 let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC
134 alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint
135 unless alreadyInSameMST do
136 -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`.
137 UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) ->
142 { mstNodeSimilarity = simil
143 , mstNodeRangeCluster = mstNodeRangeCluster dstRoot
147 rootForest :: [MaximalSpanningTree range cluster] <-
148 rangeToClusterToPoint
149 -- DescriptionNote: collect all the Points.
151 & List.concatMap Map.elems
152 -- DescriptionNote: keep only the `MaximalSpanningTree`s
153 -- contained in non-redundant `Point`s.
154 & (`foldM` []) \acc p -> do
155 isRedundant <- UnionFind.redundant p
158 else UnionFind.descriptor p <&> (: acc)
161 -- Order `rangeToClusterToDocs` by `Similarity`,
162 -- which will enable to add edges to the spanning tree in decreasing `Similarity`
163 -- and hence build *a* maximal spanning tree.
164 allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster))
168 [ Map.singleton (similarity srcC dstC) (Seq.singleton (src, dst))
169 | (srcR, srcClusterToDocuments) <- rangeToClusterToDocs & Map.toList
170 , (srcC, _docs) <- srcClusterToDocuments & Map.toList
171 , let src = (srcR, srcC)
172 , -- ExplanationNote: it does not matter whether lower or greater ranges are used for the destination nodes,
173 -- as they contain the very same undirected edges and thus `Similarity`,
174 -- hence would produce the same splitting `Similarity`s,
175 -- though not necessarily the same `MaximalSpanningTree`.
176 let (_, dstRangeToClusterToDocs) = Map.split srcR rangeToClusterToDocs
177 , (dstR, dstClusterToDocs) <- dstRangeToClusterToDocs & Map.toList
178 , (dstC, _docs) <- dstClusterToDocs & Map.toList
179 , let dst = (dstR, dstC)
182 -- | "sea-level rise algorithm"
184 -- See: in `Phylomemy.References.RefDrawMeScience`,
185 -- « C.5 The sea-level rise algorithm and its implementation in Gargantext »
187 forall range roots predictionMeasure.
191 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
192 roots ::: Set Root ->
193 MaximalSpanningForest range Cluster ->
194 MaximalSpanningForest range Cluster
195 msfPrune predictionMeasure roots =
196 loop (return proba0) []
199 Decimal.Arith Probability ->
200 MaximalSpanningForest range Cluster ->
201 MaximalSpanningForest range Cluster ->
202 MaximalSpanningForest range Cluster
203 loop previousQuality doneBranches currentBranches =
204 -- pTraceShow (["msfPrune", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $
205 case currentBranches of
207 currentBranch : todoBranches ->
208 let splitBranches = mstSplit currentBranch
209 in -- pTraceShow (["msfPrune", "loop", "splitBranches"], ("size", Map.size splitBranches)) $
210 if List.length splitBranches <= 1
211 then loop previousQuality (doneBranches <> splitBranches) todoBranches
212 else letName (fmap mstNodes $ (doneBranches <> splitBranches) <> todoBranches) \mstToNodes ->
213 let splitQuality = msfGlobalQuality predictionMeasure roots mstToNodes
214 in if Decimal.arithError previousQuality < Decimal.arithError splitQuality
215 then -- CorrectnessFixMe: these `Decimal.arithError` may raise "arithmetic overflow",
216 -- it has to be understood.
217 loop splitQuality doneBranches (splitBranches <> todoBranches)
218 else loop previousQuality (doneBranches <> splitBranches) todoBranches
220 -- | @(`mstSplit` mst)@ returns the `MaximalSpanningTree`s
221 -- resulting from pruning out the given `MaximalSpanningTree`
222 -- of all the edges having the minimal `Similarity` of that `MaximalSpanningTree`.
224 forall range cluster.
228 MaximalSpanningTree range cluster ->
229 MaximalSpanningForest range cluster
231 -- TODO: take minSimil as argument to avoid recomputing it when already known
232 case mstMinimalSimilarity mst of
234 Just minSimil -> cutMerge mst
236 cutMerge = uncurry (:) . cut
238 MaximalSpanningTree range cluster ->
239 (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster])
240 cut (Tree.Node node children) =
241 let (keptChildren, cutChildren) =
242 children & List.partition \tree ->
243 minSimil < mstNodeSimilarity (Tree.rootLabel tree)
244 in let cutChildrenRoots = cutChildren & List.concatMap cutMerge
245 in let (keptChildrenForest, cutChildrenTree) = List.unzip (cut <$> keptChildren)
246 in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge
247 in ( Tree.Node node keptChildrenForest
248 , (cutChildrenRoots List.++ cutChildrenTreeRoots)
249 -- WarningNote: the root nodes of those new `MaximalSpanningTree`s
250 -- keep their `mstNodeSimilarity` to their cutting value,
251 -- which is lower than their `mstMinimalSimilarity`.
254 mstMinimalSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity
255 mstMinimalSimilarity (Tree.Node _rootNode rootBranches)
256 | List.null rootBranches = Nothing
258 -- ExplanationNote: the root node of a `MaximalSpanningTree`,
259 -- being a root node, does not have a parent,
260 -- hence its `mstNodeSimilarity` must be ignored.
264 <&> Tree.foldTree \node accs ->
265 List.minimum $ (mstNodeSimilarity node) : accs
267 -- | @(`mstSplittingSimilarities` mst)@
268 -- returns the `Similarity`s causing the given `MaximalSpanningForest`
269 -- to split into further more `MaximalSpanningForest`.
270 mstSplittingSimilarities :: MaximalSpanningTree range cluster -> Set Similarity
271 mstSplittingSimilarities (Tree.Node _rootNode rootBranches)
272 | List.null rootBranches = Set.empty
276 ( Tree.foldTree \node accs ->
277 Set.unions (Set.singleton (mstNodeSimilarity node) : accs)
280 type Scale = Int -- > 0
282 -- | @(`mstScales` msf)@
283 -- returns the successive results of calling `mstSplit` on the given `MaximalSpanningTree`s in @(msf)@
284 -- until each one of them is a single node.
285 -- Even though the greatest `Scale`s may be different for each `MaximalSpanningTree` of @(msf)@,
286 -- the result are grouped by `Scale` then by `MaximalSpanningTree`,
287 -- which enabled to have a maximal `Scale` for the whole `msf`.
289 forall range cluster.
292 MaximalSpanningForest range cluster ->
293 Scale :-> {-mst-} Int :-> MaximalSpanningForest range cluster
296 (Map.unionWith (List.++))
297 [ Map.fromDistinctAscList
298 [ (scale, Map.singleton mstI msf)
299 | (scale, msf) <- scaleToMsf
301 | (mstI, mst) <- initMSF & List.zip [1 :: Int ..]
302 , let scaleToMsf :: [(Scale, MaximalSpanningForest range cluster)] =
303 -- scanl :: (acc -> scale -> acc) -> acc -> [scale] -> [acc]
307 (\(scale, msf) _minSimil -> (scale + 1, msf >>= mstSplit))
311 splitSimils :: Similarity :-> () =
314 & Tree.foldTree \node accs ->
317 (Map.singleton (mstNodeSimilarity node) () : accs)
321 -- | @(`mstNodes` mst)@ returns the nodes of the given @(mst)@.
326 MaximalSpanningTree range cluster ->
328 mstNodes = foldMap' (Set.singleton . mstNodeRangeCluster)
330 -- | The global quality of a `Phylomemy`.
332 -- DescriptionNote: `msfGlobalQuality` counts the `(range, Cluster)` nodes of a branch
333 -- but that a `(range, Cluster)` in itself can gather several documents.
337 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
338 roots ::: Set Root ->
339 mstToNodes ::: [Set (range, Cluster)] ->
340 Decimal.Arith Similarity
341 msfGlobalQuality (Named predictionMeasure) (Named roots) (Named mstToNodes) =
343 [ probability (1 % cardinal roots)
345 [ probability (cardinal retrievedNodes % nodesTotal)
346 * predictionMeasure relevantNodes retrievedNodes
347 | retrievedNodes <- branches
349 | root <- roots & toList
350 , -- CorrectnessNote: ignore branches not containing any requested `root`
351 let branches = mstToNodes & List.filter (any (\(_r, c) -> Set.member root c))
352 , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral
353 , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c))
356 -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
357 -- prédictions de la classe pertinente (bonne précision)
358 -- en suffisamment grand nombre (bon rappel)
359 -- sur un jeu de données cible.
361 -- Tout comme la précision et le rappel,
362 -- la F-mesure varie de 0 (plus mauvaise valeur)
363 -- à 1 (meilleure valeur possible).
365 -- ExplanationNote: https://en.wikipedia.org/wiki/F-score
366 predictionMeasureF ::
371 -- | Trade-off between `precision` and `recall`.
372 -- See https://en.wikipedia.org/wiki/Precision_and_recall
374 -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
375 -- > (i.e., one large branch) or an archipelago of elements of knowledge
376 -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
377 -- > discretion in light of her own expertise and research questions, which makes
378 -- > any phylomemy an artifact of the researcher’s perception
380 -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
382 Set (range, Cluster) ->
383 Set (range, Cluster) ->
384 Decimal.Arith Similarity
385 predictionMeasureF lambda relevantNodes retrievedNodes
386 | lambda == proba0 = probability precision
387 | lambda == proba1 = probability recall
388 | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision)
390 precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes
391 recall = cardinal relevantRetrievedNodes % cardinal relevantNodes
392 relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes
393 lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1
394 -- ExplanationNote: the `tan` is just to spread the `lambda ∈ [0,1]` from -∞ to +∞
395 -- Two commonly used values for β are:
396 -- - 2, which weighs recall higher than precision,
397 -- - and 0.5, which weighs recall lower than precision.
398 beta = tan (lambdaDouble * pi / 2)
399 betaSquare :: Rational
400 betaSquare = beta ^ (2 :: Int) & toRational