]> Git — Sourcephile - literate-phylomemy.git/blob - src/Phylomemy/TemporalMatching.hs
f26ebe8839e3a6c1ffafaf15b251b80a8b8f3949
[literate-phylomemy.git] / src / Phylomemy / TemporalMatching.hs
1 module Phylomemy.TemporalMatching where
2
3 -- import Data.Traversable (traverse)
4 -- import Debug.Pretty.Simple (pTraceShow, pTraceShowId)
5 import Control.Monad (Monad (..), foldM, forM_, unless)
6 import Control.Monad.ST qualified as ST
7 import Data.Bool (otherwise)
8 import Data.Either (fromLeft)
9 import Data.Eq (Eq (..))
10 import Data.Foldable (any, foldMap', toList)
11 import Data.Function (($), (&), (.))
12 import Data.Functor (Functor (..), (<$>), (<&>))
13 import Data.Int (Int)
14 import Data.List qualified as List
15 import Data.Map.Strict qualified as Map
16 import Data.Maybe (Maybe (..))
17 import Data.Ord (Ord (..))
18 import Data.Ord qualified as Ord
19 import Data.Ratio (Rational, (%))
20 import Data.Scientific (toBoundedRealFloat)
21 import Data.Semigroup (Semigroup (..))
22 import Data.Sequence (Seq)
23 import Data.Sequence qualified as Seq
24 import Data.Set (Set)
25 import Data.Set qualified as Set
26 import Data.Tree qualified as Tree
27 import Data.Tuple (uncurry)
28 import GHC.Stack (HasCallStack)
29 import Logic hiding ((/))
30 import Numeric.Decimal qualified as Decimal
31 import Numeric.Probability
32 import Text.Show (Show)
33 import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^))
34
35 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
36 import Clustering.UnionFind.ST qualified as UnionFind
37 import Phylomemy.Indexation
38
39 type Similarity = Probability
40
41 cardinal :: Num i => Set a -> i
42 cardinal = fromIntegral . Set.size
43
44 -- TODO: implement biased similarity from the paper
45 similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity
46 similarityJaccard x y =
47 cardinal (Set.intersection x y) % cardinal (Set.union x y)
48 & probability
49 & Decimal.arithError
50
51 -- | A `MaximalSpanningTree` (MST) is one (possibly out of many)
52 -- tree spanning accross all the given `(range, cluster)` nodes,
53 -- with the maximal sum of edges' `Similarity` between them.
54 --
55 -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree
56 --
57 -- Viewing a phylomemy as a `MaximalSpanningTree`
58 -- is the crux of understanding how it is computed:
59 --
60 -- - the `mstMinimalSimilarity` is the next `Similarity`
61 -- that will split the `MaximalSpanningTree` into two or more `MaximalSpanningForest`.
62 --
63 -- - it explains what the "scale of a phylomemy" is:
64 -- merging clusters of the same range and same `MaximalSpanningTree`
65 -- when they still belong to the same `MaximalSpanningTree`.
66 --
67 -- ImplementationNote: using a `Tree.Tree` to represent a `MaximalSpanningTree`
68 -- (instead of an adjacency edge map for instance)
69 -- is motivated by the need to implement `mstSplit`,
70 -- which needs to gather the `(range, cluster)` nodes
71 -- of the `MaximalSpanningForest` resulting from the cut,
72 -- which will then be filtered by `msfGlobalQuality`.
73 --
74 -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology"
75 -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/
76 --
77 -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph"
78 -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952
79 --
80 -- WarningNote: "The tree of one percent"
81 -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf
82 type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster)
83
84 data MSTNode range cluster = MSTNode
85 { mstNodeSimilarity :: Similarity
86 -- ^ The `similarity` of the parent edge of this node
87 -- ie. of the only edge getting closer to the root node of the `MaximalSpanningTree`.
88 --
89 -- ImplementationNote: `maximalSpanningForest` puts a `Similarity` of `1` for the root node.
90 -- and `mstSplit` leaves the `mstMinimalSimilarity` of the edge before splitting out the `MaximalSpanningTree`.
91 , mstNodeRangeCluster :: (range, cluster)
92 -- , nodeMSTMinSimilarity :: Min Similarity
93 }
94 deriving (Show)
95
96 -- | A (disjoint) forest of `MaximalSpanningTree`s,
97 -- indexed by the minimal @(range, cluster)@ node
98 -- of each `MaximalSpanningTree`.
99 type MaximalSpanningForest range cluster =
100 [MaximalSpanningTree range cluster]
101
102 -- | @(`maximalSpanningForest` allSimils)@
103 -- uses the Kruskal algorithm to find the maximal spanning trees
104 -- of the given `AllSimilarities`.
105 --
106 -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm
107 maximalSpanningForest ::
108 forall range cluster doc.
109 Ord range =>
110 Ord cluster =>
111 {-similarityMeasure :::-} (cluster -> cluster -> Similarity) ->
112 {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root doc) ->
113 MaximalSpanningForest range cluster
114 maximalSpanningForest similarity rangeToClusterToDocs = ST.runST do
115 -- DescriptionNote: create a `Point` for each given node
116 -- each Point containing a root node and a maximal spanning tree.
117 rangeToClusterToPoint :: range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster) <-
118 rangeToClusterToDocs
119 & Map.traverseWithKey \srcR ->
120 Map.traverseWithKey \srcC _docs ->
121 UnionFind.fresh $
122 Tree.Node
123 MSTNode
124 { mstNodeSimilarity = proba1
125 , mstNodeRangeCluster = (srcR, srcC)
126 }
127 []
128 -- DescriptionNote: iterate from the greatest to the lowest similarity edge.
129 forM_ (allEdges & Map.toDescList) \(simil, edges) -> do
130 -- Iterate through all the edges of that `Similarity`.
131 forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do
132 let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC
133 let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC
134 alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint
135 unless alreadyInSameMST do
136 -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`.
137 UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) ->
138 return $
139 Tree.Node srcRoot $
140 Tree.Node
141 MSTNode
142 { mstNodeSimilarity = simil
143 , mstNodeRangeCluster = mstNodeRangeCluster dstRoot
144 }
145 dstForest
146 : srcForest
147 rootForest :: [MaximalSpanningTree range cluster] <-
148 rangeToClusterToPoint
149 -- DescriptionNote: collect all the Points.
150 & Map.elems
151 & List.concatMap Map.elems
152 -- DescriptionNote: keep only the `MaximalSpanningTree`s
153 -- contained in non-redundant `Point`s.
154 & (`foldM` []) \acc p -> do
155 isRedundant <- UnionFind.redundant p
156 if isRedundant
157 then return acc
158 else UnionFind.descriptor p <&> (: acc)
159 return rootForest
160 where
161 -- Order `rangeToClusterToDocs` by `Similarity`,
162 -- which will enable to add edges to the spanning tree in decreasing `Similarity`
163 -- and hence build *a* maximal spanning tree.
164 allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster))
165 allEdges =
166 Map.unionsWith
167 (Seq.><)
168 [ Map.singleton (similarity srcC dstC) (Seq.singleton (src, dst))
169 | (srcR, srcClusterToDocuments) <- rangeToClusterToDocs & Map.toList
170 , (srcC, _docs) <- srcClusterToDocuments & Map.toList
171 , let src = (srcR, srcC)
172 , -- ExplanationNote: it does not matter whether lower or greater ranges are used for the destination nodes,
173 -- as they contain the very same undirected edges and thus `Similarity`,
174 -- hence would produce the same splitting `Similarity`s,
175 -- though not necessarily the same `MaximalSpanningTree`.
176 let (_, dstRangeToClusterToDocs) = Map.split srcR rangeToClusterToDocs
177 , (dstR, dstClusterToDocs) <- dstRangeToClusterToDocs & Map.toList
178 , (dstC, _docs) <- dstClusterToDocs & Map.toList
179 , let dst = (dstR, dstC)
180 ]
181
182 -- | "sea-level rise algorithm"
183 --
184 -- See: in `Phylomemy.References.RefDrawMeScience`,
185 -- « C.5 The sea-level rise algorithm and its implementation in Gargantext »
186 msfSplit ::
187 HasCallStack =>
188 forall range roots predictionMeasure.
189 Show range =>
190 Ord range =>
191 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
192 roots ::: Set Root ->
193 MaximalSpanningForest range Cluster ->
194 MaximalSpanningForest range Cluster
195 msfSplit predictionMeasure roots =
196 loop (return proba0) []
197 where
198 loop previousQuality doneBranches currentBranches =
199 -- pTraceShow (["msfSplit", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $
200 case currentBranches of
201 [] -> doneBranches
202 currentBranch : todoBranches ->
203 let splitBranches = mstSplit currentBranch
204 in -- pTraceShow (["msfSplit", "loop", "splitBranches"], ("size", Map.size splitBranches)) $
205 if List.length splitBranches <= 1
206 then loop previousQuality (doneBranches <> splitBranches) todoBranches
207 else letName (fmap mstNodes $ (doneBranches <> splitBranches) <> todoBranches) \mstToNodes ->
208 let splitQuality = msfGlobalQuality predictionMeasure roots mstToNodes
209 in if previousQuality < splitQuality
210 then loop splitQuality doneBranches (splitBranches <> todoBranches)
211 else loop previousQuality (doneBranches <> splitBranches) todoBranches
212
213 mstSplit ::
214 forall range cluster.
215 HasCallStack =>
216 Show range =>
217 Show cluster =>
218 Ord range =>
219 Ord cluster =>
220 MaximalSpanningTree range cluster ->
221 MaximalSpanningForest range cluster
222 mstSplit mst =
223 case mstMinimalSimilarity mst of
224 Nothing -> [mst]
225 Just minSimil -> cutMerge mst
226 where
227 cutMerge = uncurry (:) . cut
228 cut :: MaximalSpanningTree range cluster -> (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster])
229 cut (Tree.Node node children) =
230 let (keptChildren, cutChildren) =
231 children & List.partition \tree ->
232 minSimil < mstNodeSimilarity (Tree.rootLabel tree)
233 in let cutChildrenRoots = cutChildren & List.concatMap cutMerge
234 in let (keptChildrenForest, cutChildrenTree) = List.unzip (cut <$> keptChildren)
235 in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge
236 in ( Tree.Node node keptChildrenForest
237 , (cutChildrenRoots List.++ cutChildrenTreeRoots)
238 -- WarningNote: the root nodes of those new `MaximalSpanningTree`s
239 -- keep their `mstNodeSimilarity` to their cutting value,
240 -- which is lower than their `mstMinimalSimilarity`.
241 )
242
243 mstMinimalSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity
244 mstMinimalSimilarity (Tree.Node _rootNode rootBranches)
245 | List.null rootBranches = Nothing
246 | otherwise =
247 -- ExplanationNote: the root node of a `MaximalSpanningTree`,
248 -- being a root node, does not have parent,
249 -- hence its `mstNodeSimilarity` must be ignored.
250 Just $
251 List.minimum $
252 rootBranches
253 <&> Tree.foldTree \node accs ->
254 List.minimum $ (mstNodeSimilarity node) : accs
255
256 -- | @(`mstSplittingSimilarities` mst)@
257 -- returns the `Similarity`s causing the given `MaximalSpanningForest`
258 -- to split into further more `MaximalSpanningForest`.
259 mstSplittingSimilarities :: MaximalSpanningTree range cluster -> Set Similarity
260 mstSplittingSimilarities (Tree.Node _rootNode rootBranches)
261 | List.null rootBranches = Set.empty
262 | otherwise = rootBranches & foldMap' (Tree.foldTree \node accs -> Set.unions (Set.singleton (mstNodeSimilarity node) : accs))
263
264 -- | @(`mstNodes` branch)@ returns the nodes of the given @(branch)@.
265 mstNodes ::
266 HasCallStack =>
267 Ord range =>
268 Ord cluster =>
269 MaximalSpanningTree range cluster ->
270 Set (range, cluster)
271 mstNodes = foldMap' (Set.singleton . mstNodeRangeCluster)
272
273 -- | The global quality of a `Phylomemy`.
274 --
275 -- DescriptionNote: `msfGlobalQuality` counts the `(range, Cluster)` nodes of a branch
276 -- but that a `(range, Cluster)` in itself can gather several documents.
277 msfGlobalQuality ::
278 HasCallStack =>
279 Ord range =>
280 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
281 roots ::: Set Root ->
282 mstToNodes ::: [Set (range, Cluster)] ->
283 Decimal.Arith Similarity
284 msfGlobalQuality (Named predictionMeasure) (Named roots) (Named mstToNodes) =
285 List.sum
286 [ probability (1 % cardinal roots)
287 * List.sum
288 [ probability (cardinal retrievedNodes % nodesTotal)
289 * predictionMeasure relevantNodes retrievedNodes
290 | retrievedNodes <- branches
291 ]
292 | root <- roots & toList
293 , -- CorrectnessNote: ignore branches not containing any requested `root`
294 let branches = mstToNodes & List.filter (any (\(_r, c) -> Set.member root c))
295 , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral
296 , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c))
297 ]
298
299 -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
300 -- prédictions de la classe pertinente (bonne précision)
301 -- en suffisamment grand nombre (bon rappel)
302 -- sur un jeu de données cible.
303 --
304 -- Tout comme la précision et le rappel,
305 -- la F-mesure varie de 0 (plus mauvaise valeur)
306 -- à 1 (meilleure valeur possible).
307 --
308 -- ExplanationNote: https://en.wikipedia.org/wiki/F-score
309 predictionMeasureF ::
310 HasCallStack =>
311 Ord range =>
312 {-lambda :::-}
313
314 -- | Trade-off between `precision` and `recall`.
315 -- See https://en.wikipedia.org/wiki/Precision_and_recall
316 --
317 -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
318 -- > (i.e., one large branch) or an archipelago of elements of knowledge
319 -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
320 -- > discretion in light of her own expertise and research questions, which makes
321 -- > any phylomemy an artifact of the researcher’s perception
322 --
323 -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
324 Similarity ->
325 Set (range, Cluster) ->
326 Set (range, Cluster) ->
327 Decimal.Arith Similarity
328 predictionMeasureF lambda relevantNodes retrievedNodes
329 | lambda == proba0 = probability precision
330 | lambda == proba1 = probability recall
331 | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision)
332 where
333 precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes
334 recall = cardinal relevantRetrievedNodes % cardinal relevantNodes
335 relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes
336 lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1
337 -- ExplanationNote: the `tan` is just to spread `lambda`
338 -- Two commonly used values for β are:
339 -- - 2, which weighs recall higher than precision,
340 -- - and 0.5, which weighs recall lower than precision.
341 beta = tan (lambdaDouble * pi / 2)
342 betaSquare :: Rational
343 betaSquare = beta ^ (2 :: Int) & toRational