]> Git — Sourcephile - literate-phylomemy.git/blob - src/Phylomemy/TemporalMatching.hs
completeness(scale): add support for scale
[literate-phylomemy.git] / src / Phylomemy / TemporalMatching.hs
1 module Phylomemy.TemporalMatching where
2
3 -- import Debug.Pretty.Simple (pTraceShow)
4 import Control.Monad (Monad (..), foldM, forM_, unless)
5 import Control.Monad.ST qualified as ST
6 import Data.Bool (otherwise)
7 import Data.Either (fromLeft)
8 import Data.Eq (Eq (..))
9 import Data.Foldable (any, foldMap', toList)
10 import Data.Function (($), (&), (.))
11 import Data.Functor (Functor (..), (<$>), (<&>))
12 import Data.Int (Int)
13 import Data.List qualified as List
14 import Data.Map.Strict qualified as Map
15 import Data.Maybe (Maybe (..))
16 import Data.Ord (Ord (..))
17 import Data.Ord qualified as Ord
18 import Data.Ratio (Rational, (%))
19 import Data.Scientific (toBoundedRealFloat)
20 import Data.Semigroup (Semigroup (..))
21 import Data.Sequence (Seq)
22 import Data.Sequence qualified as Seq
23 import Data.Set (Set)
24 import Data.Set qualified as Set
25 import Data.Tree qualified as Tree
26 import Data.Tuple (uncurry)
27 import GHC.Stack (HasCallStack)
28 import Logic hiding ((/))
29 import Numeric.Decimal qualified as Decimal
30 import Numeric.Probability
31 import Text.Show (Show)
32 import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^))
33
34 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
35 import Clustering.UnionFind.ST qualified as UnionFind
36 import Phylomemy.Indexation
37
38 type Similarity = Probability
39
40 cardinal :: Num i => Set a -> i
41 cardinal = fromIntegral . Set.size
42
43 -- TODO: implement biased similarity from the paper
44 similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity
45 similarityJaccard x y =
46 cardinal (Set.intersection x y) % cardinal (Set.union x y)
47 & probability
48 & Decimal.arithError
49
50 -- | A `MaximalSpanningTree` (MST) is one (possibly out of many)
51 -- tree spanning accross all the given `(range, cluster)` nodes,
52 -- with the maximal sum of edges' `Similarity` between them.
53 --
54 -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree
55 --
56 -- Viewing a phylomemy as a `MaximalSpanningTree`
57 -- is the crux of understanding how it is computed:
58 --
59 -- - the `mstMinimalSimilarity` is the next `Similarity`
60 -- that will split the `MaximalSpanningTree` into two or more `MaximalSpanningTree`s.
61 --
62 -- - it explains what the "scale of a phylomemy" is:
63 -- merging clusters of the same `range`
64 -- when they still belong to the same `MaximalSpanningTree`.
65 --
66 -- ImplementationNote: using a `Tree.Tree` to represent a `MaximalSpanningTree`
67 -- (instead of an adjacency edge map for instance)
68 -- is motivated by the need to implement `mstSplit`,
69 -- which needs to gather the `(range, cluster)` nodes
70 -- of each `MaximalSpanningTree` resulting from the cut,
71 -- because knowing that is required by `msfGlobalQuality`.
72 --
73 -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology"
74 -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/
75 --
76 -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph"
77 -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952
78 --
79 -- WarningNote: "The tree of one percent"
80 -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf
81 type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster)
82
83 data MSTNode range cluster = MSTNode
84 { mstNodeSimilarity :: Similarity
85 -- ^ The `similarity` of the parent edge of this node
86 -- ie. of the only edge getting closer to the root node of the `MaximalSpanningTree`.
87 --
88 -- ImplementationNote: `maximalSpanningForest` puts a `Similarity` of `1` for the root node.
89 -- and `mstSplit` leaves the `mstMinimalSimilarity` of the edge before splitting out the `MaximalSpanningTree`.
90 , mstNodeRangeCluster :: (range, cluster)
91 }
92 deriving (Show)
93
94 -- | A (disjoint) forest of `MaximalSpanningTree`s,
95 -- indexed by the minimal @(range, cluster)@ node
96 -- of each `MaximalSpanningTree`.
97 type MaximalSpanningForest range cluster =
98 [MaximalSpanningTree range cluster]
99
100 -- | @(`maximalSpanningForest` similarity rangeToClusterToDocs)@
101 -- uses the Kruskal algorithm to find the trees spanning all
102 -- the given `(range, cluster)` nodes in `rangeToClusterToDocs`
103 -- with the maximal total `similarity` between them.
104 --
105 -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm
106 maximalSpanningForest ::
107 forall range cluster doc.
108 HasCallStack =>
109 Ord range =>
110 Ord cluster =>
111 {-similarityMeasure :::-} (cluster -> cluster -> Similarity) ->
112 {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root doc) ->
113 MaximalSpanningForest range cluster
114 maximalSpanningForest similarity rangeToClusterToDocs = ST.runST do
115 -- DescriptionNote: create a `Point` for each given node
116 -- each Point containing a root node and a maximal spanning tree.
117 rangeToClusterToPoint :: range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster) <-
118 rangeToClusterToDocs
119 & Map.traverseWithKey \srcR ->
120 Map.traverseWithKey \srcC _docs ->
121 UnionFind.fresh $
122 Tree.Node
123 MSTNode
124 { mstNodeSimilarity = proba1
125 , mstNodeRangeCluster = (srcR, srcC)
126 }
127 []
128 -- DescriptionNote: iterate from the greatest to the lowest similarity edge.
129 forM_ (allEdges & Map.toDescList) \(simil, edges) -> do
130 -- Iterate through all the edges of that `Similarity`.
131 forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do
132 let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC
133 let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC
134 alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint
135 unless alreadyInSameMST do
136 -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`.
137 UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) ->
138 return $
139 Tree.Node srcRoot $
140 Tree.Node
141 MSTNode
142 { mstNodeSimilarity = simil
143 , mstNodeRangeCluster = mstNodeRangeCluster dstRoot
144 }
145 dstForest
146 : srcForest
147 rootForest :: [MaximalSpanningTree range cluster] <-
148 rangeToClusterToPoint
149 -- DescriptionNote: collect all the Points.
150 & Map.elems
151 & List.concatMap Map.elems
152 -- DescriptionNote: keep only the `MaximalSpanningTree`s
153 -- contained in non-redundant `Point`s.
154 & (`foldM` []) \acc p -> do
155 isRedundant <- UnionFind.redundant p
156 if isRedundant
157 then return acc
158 else UnionFind.descriptor p <&> (: acc)
159 return rootForest
160 where
161 -- Order `rangeToClusterToDocs` by `Similarity`,
162 -- which will enable to add edges to the spanning tree in decreasing `Similarity`
163 -- and hence build *a* maximal spanning tree.
164 allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster))
165 allEdges =
166 Map.unionsWith
167 (Seq.><)
168 [ Map.singleton (similarity srcC dstC) (Seq.singleton (src, dst))
169 | (srcR, srcClusterToDocuments) <- rangeToClusterToDocs & Map.toList
170 , (srcC, _docs) <- srcClusterToDocuments & Map.toList
171 , let src = (srcR, srcC)
172 , -- ExplanationNote: it does not matter whether lower or greater ranges are used for the destination nodes,
173 -- as they contain the very same undirected edges and thus `Similarity`,
174 -- hence would produce the same splitting `Similarity`s,
175 -- though not necessarily the same `MaximalSpanningTree`.
176 let (_, dstRangeToClusterToDocs) = Map.split srcR rangeToClusterToDocs
177 , (dstR, dstClusterToDocs) <- dstRangeToClusterToDocs & Map.toList
178 , (dstC, _docs) <- dstClusterToDocs & Map.toList
179 , let dst = (dstR, dstC)
180 ]
181
182 -- | "sea-level rise algorithm"
183 --
184 -- See: in `Phylomemy.References.RefDrawMeScience`,
185 -- « C.5 The sea-level rise algorithm and its implementation in Gargantext »
186 msfPrune ::
187 forall range roots predictionMeasure.
188 HasCallStack =>
189 Show range =>
190 Ord range =>
191 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
192 roots ::: Set Root ->
193 MaximalSpanningForest range Cluster ->
194 MaximalSpanningForest range Cluster
195 msfPrune predictionMeasure roots =
196 loop (return proba0) []
197 where
198 loop ::
199 Decimal.Arith Probability ->
200 MaximalSpanningForest range Cluster ->
201 MaximalSpanningForest range Cluster ->
202 MaximalSpanningForest range Cluster
203 loop previousQuality doneBranches currentBranches =
204 -- pTraceShow (["msfPrune", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $
205 case currentBranches of
206 [] -> doneBranches
207 currentBranch : todoBranches ->
208 let splitBranches = mstSplit currentBranch
209 in -- pTraceShow (["msfPrune", "loop", "splitBranches"], ("size", Map.size splitBranches)) $
210 if List.length splitBranches <= 1
211 then loop previousQuality (doneBranches <> splitBranches) todoBranches
212 else letName (fmap mstNodes $ (doneBranches <> splitBranches) <> todoBranches) \mstToNodes ->
213 let splitQuality = msfGlobalQuality predictionMeasure roots mstToNodes
214 in if Decimal.arithError previousQuality < Decimal.arithError splitQuality
215 then -- CorrectnessFixMe: these `Decimal.arithError` may raise "arithmetic overflow",
216 -- it has to be understood.
217 loop splitQuality doneBranches (splitBranches <> todoBranches)
218 else loop previousQuality (doneBranches <> splitBranches) todoBranches
219
220 -- | @(`mstSplit` mst)@ returns the `MaximalSpanningTree`s
221 -- resulting from pruning out the given `MaximalSpanningTree`
222 -- of all the edges having the minimal `Similarity` of that `MaximalSpanningTree`.
223 mstSplit ::
224 forall range cluster.
225 HasCallStack =>
226 Ord range =>
227 Ord cluster =>
228 MaximalSpanningTree range cluster ->
229 MaximalSpanningForest range cluster
230 mstSplit mst =
231 -- TODO: take minSimil as argument to avoid recomputing it when already known
232 case mstMinimalSimilarity mst of
233 Nothing -> [mst]
234 Just minSimil -> cutMerge mst
235 where
236 cutMerge = uncurry (:) . cut
237 cut ::
238 MaximalSpanningTree range cluster ->
239 (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster])
240 cut (Tree.Node node children) =
241 let (keptChildren, cutChildren) =
242 children & List.partition \tree ->
243 minSimil < mstNodeSimilarity (Tree.rootLabel tree)
244 in let cutChildrenRoots = cutChildren & List.concatMap cutMerge
245 in let (keptChildrenForest, cutChildrenTree) = List.unzip (cut <$> keptChildren)
246 in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge
247 in ( Tree.Node node keptChildrenForest
248 , (cutChildrenRoots List.++ cutChildrenTreeRoots)
249 -- WarningNote: the root nodes of those new `MaximalSpanningTree`s
250 -- keep their `mstNodeSimilarity` to their cutting value,
251 -- which is lower than their `mstMinimalSimilarity`.
252 )
253
254 mstMinimalSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity
255 mstMinimalSimilarity (Tree.Node _rootNode rootBranches)
256 | List.null rootBranches = Nothing
257 | otherwise =
258 -- ExplanationNote: the root node of a `MaximalSpanningTree`,
259 -- being a root node, does not have a parent,
260 -- hence its `mstNodeSimilarity` must be ignored.
261 Just $
262 List.minimum $
263 rootBranches
264 <&> Tree.foldTree \node accs ->
265 List.minimum $ (mstNodeSimilarity node) : accs
266
267 -- | @(`mstSplittingSimilarities` mst)@
268 -- returns the `Similarity`s causing the given `MaximalSpanningForest`
269 -- to split into further more `MaximalSpanningForest`.
270 mstSplittingSimilarities :: MaximalSpanningTree range cluster -> Set Similarity
271 mstSplittingSimilarities (Tree.Node _rootNode rootBranches)
272 | List.null rootBranches = Set.empty
273 | otherwise =
274 rootBranches
275 & foldMap'
276 ( Tree.foldTree \node accs ->
277 Set.unions (Set.singleton (mstNodeSimilarity node) : accs)
278 )
279
280 type Scale = Int -- > 0
281
282 -- | @(`mstScales` msf)@
283 -- returns the successive results of calling `mstSplit` on the given `MaximalSpanningTree`s in @(msf)@
284 -- until each one of them is a single node.
285 -- Even though the greatest `Scale`s may be different for each `MaximalSpanningTree` of @(msf)@,
286 -- the result are grouped by `Scale` then by `MaximalSpanningTree`,
287 -- which enabled to have a maximal `Scale` for the whole `msf`.
288 mstScales ::
289 forall range cluster.
290 Ord range =>
291 Ord cluster =>
292 MaximalSpanningForest range cluster ->
293 Scale :-> {-mst-} Int :-> MaximalSpanningForest range cluster
294 mstScales initMSF =
295 Map.unionsWith
296 (Map.unionWith (List.++))
297 [ Map.fromDistinctAscList
298 [ (scale, Map.singleton mstI msf)
299 | (scale, msf) <- scaleToMsf
300 ]
301 | (mstI, mst) <- initMSF & List.zip [1 :: Int ..]
302 , let scaleToMsf :: [(Scale, MaximalSpanningForest range cluster)] =
303 -- scanl :: (acc -> scale -> acc) -> acc -> [scale] -> [acc]
304 splitSimils
305 & Map.keys
306 & List.scanl'
307 (\(scale, msf) _minSimil -> (scale + 1, msf >>= mstSplit))
308 (1, [mst])
309 ]
310 where
311 splitSimils :: Similarity :-> () =
312 Map.unions
313 [ mst
314 & Tree.foldTree \node accs ->
315 Map.unionsWith
316 (\() () -> ())
317 (Map.singleton (mstNodeSimilarity node) () : accs)
318 | mst <- initMSF
319 ]
320
321 -- | @(`mstNodes` mst)@ returns the nodes of the given @(mst)@.
322 mstNodes ::
323 HasCallStack =>
324 Ord range =>
325 Ord cluster =>
326 MaximalSpanningTree range cluster ->
327 Set (range, cluster)
328 mstNodes = foldMap' (Set.singleton . mstNodeRangeCluster)
329
330 -- | The global quality of a `Phylomemy`.
331 --
332 -- DescriptionNote: `msfGlobalQuality` counts the `(range, Cluster)` nodes of a branch
333 -- but that a `(range, Cluster)` in itself can gather several documents.
334 msfGlobalQuality ::
335 HasCallStack =>
336 Ord range =>
337 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
338 roots ::: Set Root ->
339 mstToNodes ::: [Set (range, Cluster)] ->
340 Decimal.Arith Similarity
341 msfGlobalQuality (Named predictionMeasure) (Named roots) (Named mstToNodes) =
342 List.sum
343 [ probability (1 % cardinal roots)
344 * List.sum
345 [ probability (cardinal retrievedNodes % nodesTotal)
346 * predictionMeasure relevantNodes retrievedNodes
347 | retrievedNodes <- branches
348 ]
349 | root <- roots & toList
350 , -- CorrectnessNote: ignore branches not containing any requested `root`
351 let branches = mstToNodes & List.filter (any (\(_r, c) -> Set.member root c))
352 , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral
353 , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c))
354 ]
355
356 -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
357 -- prédictions de la classe pertinente (bonne précision)
358 -- en suffisamment grand nombre (bon rappel)
359 -- sur un jeu de données cible.
360 --
361 -- Tout comme la précision et le rappel,
362 -- la F-mesure varie de 0 (plus mauvaise valeur)
363 -- à 1 (meilleure valeur possible).
364 --
365 -- ExplanationNote: https://en.wikipedia.org/wiki/F-score
366 predictionMeasureF ::
367 HasCallStack =>
368 Ord range =>
369 {-lambda :::-}
370
371 -- | Trade-off between `precision` and `recall`.
372 -- See https://en.wikipedia.org/wiki/Precision_and_recall
373 --
374 -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
375 -- > (i.e., one large branch) or an archipelago of elements of knowledge
376 -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
377 -- > discretion in light of her own expertise and research questions, which makes
378 -- > any phylomemy an artifact of the researcher’s perception
379 --
380 -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
381 Similarity ->
382 Set (range, Cluster) ->
383 Set (range, Cluster) ->
384 Decimal.Arith Similarity
385 predictionMeasureF lambda relevantNodes retrievedNodes
386 | lambda == proba0 = probability precision
387 | lambda == proba1 = probability recall
388 | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision)
389 where
390 precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes
391 recall = cardinal relevantRetrievedNodes % cardinal relevantNodes
392 relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes
393 lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1
394 -- ExplanationNote: the `tan` is just to spread the `lambda ∈ [0,1]` from -∞ to +∞
395 -- Two commonly used values for β are:
396 -- - 2, which weighs recall higher than precision,
397 -- - and 0.5, which weighs recall lower than precision.
398 beta = tan (lambdaDouble * pi / 2)
399 betaSquare :: Rational
400 betaSquare = beta ^ (2 :: Int) & toRational