1 {-# LANGUAGE DeriveFunctor #-}
2 {-# LANGUAGE RankNTypes #-}
4 module Phylomemy.TemporalMatching where
6 -- import Data.Traversable (traverse)
7 -- import Debug.Pretty.Simple (pTraceShow, pTraceShowId)
8 import Control.Monad (Monad (..), foldM, forM_, unless)
9 import Control.Monad.ST qualified as ST
10 import Data.Bool (otherwise)
11 import Data.Either (fromLeft)
12 import Data.Eq (Eq (..))
13 import Data.Foldable (any, foldMap', toList)
14 import Data.Function (($), (&), (.))
15 import Data.Functor (Functor (..), (<$>), (<&>))
17 import Data.List qualified as List
18 import Data.Map.Strict qualified as Map
19 import Data.Maybe (Maybe (..))
20 import Data.Ord (Ord (..))
21 import Data.Ord qualified as Ord
22 import Data.Ratio (Rational, (%))
23 import Data.Scientific (toBoundedRealFloat)
24 import Data.Semigroup (Semigroup (..))
25 import Data.Sequence (Seq)
26 import Data.Sequence qualified as Seq
28 import Data.Set qualified as Set
29 import Data.Tree qualified as Tree
30 import Data.Tuple (uncurry)
31 import GHC.Stack (HasCallStack)
32 import Logic hiding ((/))
33 import Numeric.Decimal qualified as Decimal
34 import Numeric.Probability
35 import Text.Show (Show)
36 import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^))
38 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
39 import Clustering.UnionFind.ST qualified as UnionFind
40 import Phylomemy.Indexation
42 type Similarity = Probability
44 cardinal :: Num i => Set a -> i
45 cardinal = fromIntegral . Set.size
47 -- TODO: implement biased similarity from the paper
48 similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity
49 similarityJaccard x y =
50 cardinal (Set.intersection x y) % cardinal (Set.union x y)
54 -- | `AllSimilarities` maps each @(range, cluster)@ node
55 -- to *all* the other @(range, cluster)@ nodes from a different @(range)@.
57 -- Note that the destination nodes are first ordered by their range
58 -- which enable to find the closest from the source's range.
59 type AllSimilarities range cluster =
60 {-source-} range :-> cluster :-> {-destination-} Direction (range :-> Similarity :-> Seq cluster)
62 -- | A phylomemy can be viewed in two directions.
63 data Direction a = Direction
64 { directionUpstream :: a
65 -- ^ Lookup for the closest similar enough destinations
66 -- in ranges lower than its source's range.
67 , directionDownstream :: a
68 -- ^ Lookup for the closest similar enough destinations
69 -- in ranges greater than its source's range.
71 deriving (Functor, Show)
73 -- | @(`allSimilarities` similarityMeasure rangeToClusterToDocuments)@
74 -- computes all the temporal similarities for the given documents.
77 {-similarityMeasure :::-} (cluster -> cluster -> Similarity) ->
78 {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root (Document pos)) ->
79 AllSimilarities range cluster
80 allSimilarities similarity rangeToClusterToDocs =
82 & Map.mapWithKey \srcR ->
83 Map.mapWithKey \srcC _docs ->
85 (rangeToClusterToDocsLT, rangeToClusterToDocsGT) = Map.split srcR rangeToClusterToDocs
86 goStream = Map.mapMaybeWithKey \_rangeNE dstCD ->
90 [ (simil, Seq.singleton dstC)
91 | dstC <- dstCD & Map.keys
92 , let simil = similarity srcC dstC
97 { directionUpstream = goStream rangeToClusterToDocsLT
98 , directionDownstream = goStream rangeToClusterToDocsGT
102 | Map.null m = Nothing
105 -- -- | @(`similarities` phy)@
106 -- -- returns the set of `Similarity`s used by the given `Phylomemy`.
107 -- similaritiesLevels :: AllSimilarities range cluster -> Set Similarity
108 -- similaritiesLevels =
109 -- Map.foldMapWithKey \_ ->
110 -- Map.foldMapWithKey \_ dir ->
111 -- Map.foldMapWithKey (\_ -> Set.fromList . Map.keys) (directionDownstream dir)
113 -- -- | @(`phylomemyRaise` minSimil phy)@
114 -- -- filter-in only edges whose similarity is greater or equal to the given @(minSimil)@.
115 -- phylomemyRaise :: Similarity -> AllSimilarities range cluster -> AllSimilarities range cluster
116 -- phylomemyRaise minSimil =
121 -- (deleteEmptyMap .) $ \childSC ->
122 -- let (_lt, eqM, gt) = Map.splitLookup minSimil childSC
123 -- in maybe gt (\eq -> Map.insert minSimil eq gt) eqM
126 -- | Map.null m = Nothing
127 -- | otherwise = Just m
129 -- | A `MaximalSpanningTree` is one (possibly out of many)
130 -- maximal (similarity) spanning tree (MST),
131 -- mapping each source @(range, cluster)@ to many destination @(range, cluster)@,
132 -- from the lowest to the greatest `Similarity`.
134 -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree
136 -- Viewing a phylomemy as a MST is the crux of understanding how it is computed:
138 -- - it makes it easy to prune edges with low similarity
139 -- and know how many `MaximalSpanningTree`es this creates.
141 -- - it explains what the "scale of a phylomemy" is:
142 -- considering clusters of the same range and same MST as a single node.
144 -- The order in which edges may be deleted is known in advance (aka. "offline"):
145 -- it goes from the lowest similarity (0) to the highest similarity (1)
146 -- (tie breaking with an order on the @(range, cluster)@ nodes)
147 -- therefore computing a maximal spanning tree (MaximalSpanningTree) wrt. similarities
150 -- - the value of the next `Similarity` causing a split:
151 -- it is the minimal `Similarity` inside this `MaximalSpanningTree`.
153 -- - the resulting `MaximalSpanningTree`s after removing the edges
154 -- with that minimal `Similarity`.
156 -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology"
157 -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/
159 -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph"
160 -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952
162 -- WarningNote: "The tree of one percent"
163 -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf
164 type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster)
166 data MSTNode range cluster = MSTNode
167 { mstNodeSimilarity :: Similarity
168 , mstNodeRangeCluster :: (range, cluster)
169 -- , nodeMSTMinSimilarity :: Min Similarity
173 -- | A (disjoint) forest of `MaximalSpanningTree`s,
174 -- indexed by the minimal @(range, cluster)@ node
175 -- of each `MaximalSpanningTree`.
176 type MaximalSpanningTrees range cluster =
177 [MaximalSpanningTree range cluster]
179 -- | @(`maximalSpanningTrees` allSimils)@
180 -- uses the Kruskal algorithm to find the maximal spanning trees
181 -- of the given `AllSimilarities`.
183 -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm
184 maximalSpanningTrees ::
185 forall range cluster.
188 AllSimilarities range cluster ->
189 MaximalSpanningTrees range cluster
190 maximalSpanningTrees allSimils = ST.runST do
191 -- DescriptionNote: create a `Point` for each given node
192 -- each Point containing a root node and a maximal spanning tree.
193 rangeToClusterToPoint :: (range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster)) <-
195 & Map.traverseWithKey \srcR ->
196 Map.traverseWithKey \srcC _similToEdges ->
200 { mstNodeSimilarity = proba1
201 , mstNodeRangeCluster = (srcR, srcC)
204 -- DescriptionNote: iterate from the greatest to the lowest similarity edge.
205 forM_ (allEdges & Map.toDescList) \(simil, edges) -> do
206 -- Iterate through all the edges of that `Similarity`.
207 forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do
208 let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC
209 let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC
210 alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint
211 unless alreadyInSameMST do
212 -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`.
213 UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) ->
218 { mstNodeSimilarity = simil
219 , mstNodeRangeCluster = mstNodeRangeCluster dstRoot
223 rootTrees :: [MaximalSpanningTree range cluster] <-
224 rangeToClusterToPoint
225 -- DescriptionNote: collect all the Points.
227 & List.concatMap Map.elems
228 -- DescriptionNote: keep only the `MaximalSpanningTree`s
229 -- contained in non-redundant `Point`s.
230 & (`foldM` []) \acc p -> do
231 isRedundant <- UnionFind.redundant p
234 else UnionFind.descriptor p <&> (: acc)
237 -- Order `allSimils` by `Similarity`, ie.
238 -- from: {-source-} range :-> cluster :-> {-destination-} Direction (range :-> Similarity :-> Seq cluster)
239 -- to: Similarity :-> Seq ((range, cluster), (range, cluster))
240 -- which will enable to add edges to the spanning tree in decreasing `Similarity`
241 -- and hence build *a* maximal spanning tree.
242 allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster))
246 [ dstSCs & Map.map (fmap \dstC -> (src, (dstR, dstC)))
247 | (srcR, srcCRSC) <- allSimils & Map.toList
248 , (srcC, dstRSC) <- srcCRSC & Map.toList
249 , let src = (srcR, srcC)
250 , -- ExplanationNote: it does not matter whether `directionDownstream` or `directionUpstream` is used,
251 -- as they contain the very same edges and `Similarity`, just not in the same order,
252 -- hence using any `Direction` would produce the same splitting similarities.
253 (dstR, dstSCs) <- directionDownstream dstRSC & Map.toList
256 -- | "sea-level rise algorithm"
258 -- See: in `Phylomemy.References.RefDrawMeScience`,
259 -- « C.5 The sea-level rise algorithm and its implementation in Gargantext »
260 splitMaximalSpanningTrees ::
262 forall range roots predictionMeasure.
265 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
266 roots ::: Set Root ->
267 MaximalSpanningTrees range Cluster ->
268 MaximalSpanningTrees range Cluster
269 splitMaximalSpanningTrees predictionMeasure roots =
270 loop (return proba0) []
272 loop previousQuality doneBranches currentBranches =
273 -- pTraceShow (["splitMaximalSpanningTrees", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $
274 case currentBranches of
276 currentBranch : todoBranches ->
277 let splitBranches = splitMaximalSpanningTree currentBranch
278 in -- pTraceShow (["splitMaximalSpanningTrees", "loop", "splitBranches"], ("size", Map.size splitBranches)) $
279 if List.length splitBranches <= 1
280 then loop previousQuality (doneBranches <> splitBranches) todoBranches
281 else letName (fmap maximalSpanningTreeToNodes $ (doneBranches <> splitBranches) <> todoBranches) \branchToNodes ->
282 let splitQuality = globalQuality predictionMeasure roots branchToNodes
283 in if previousQuality < splitQuality
284 then loop splitQuality doneBranches (splitBranches <> todoBranches)
285 else loop previousQuality (doneBranches <> splitBranches) todoBranches
287 splitMaximalSpanningTree ::
288 forall range cluster.
294 MaximalSpanningTree range cluster ->
295 MaximalSpanningTrees range cluster
296 splitMaximalSpanningTree mst =
297 case minimumSimilarity mst of
299 Just minSimil -> cutMerge mst
301 cutMerge = uncurry (:) . cut
302 cut :: MaximalSpanningTree range cluster -> (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster])
303 cut (Tree.Node node children) =
304 let (keptChildren, cutChildren) =
305 children & List.partition \tree ->
306 minSimil < mstNodeSimilarity (Tree.rootLabel tree)
307 in let cutChildrenRoots = cutChildren & List.concatMap cutMerge
308 in let (keptChildrenTrees, cutChildrenTree) = List.unzip (cut <$> keptChildren)
309 in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge
310 in ( Tree.Node node keptChildrenTrees
311 , (cutChildrenRoots List.++ cutChildrenTreeRoots)
312 -- WarningNote: the root nodes of those new `MaximalSpanningTree`s
313 -- keep their `mstNodeSimilarity` to their cutting value,
314 -- which is lower than their `minimumSimilarity`.
317 minimumSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity
318 minimumSimilarity (Tree.Node _rootNode rootBranches)
319 | List.null rootBranches = Nothing
321 -- ExplanationNote: the root node of a `MaximalSpanningTree`,
322 -- being a root node, does not have parent,
323 -- hence its `mstNodeSimilarity` must be ignored.
327 <&> Tree.foldTree \node accs ->
328 List.minimum $ (mstNodeSimilarity node) : accs
330 -- | @(`maximalSpanningTreeToNodes` branch)@ returns the nodes of the given @(branch)@.
331 maximalSpanningTreeToNodes ::
335 MaximalSpanningTree range cluster ->
337 maximalSpanningTreeToNodes = foldMap' (Set.singleton . mstNodeRangeCluster)
339 -- | @(splittingSimilarities maxSpanTrees)@
340 -- returns the `Similarity`s causing the given `MaximalSpanningTrees`
341 -- to split into further more `MaximalSpanningTrees`.
342 -- splittingSimilarities :: HasCallStack => MaximalSpanningTrees range cluster -> Set Similarity
343 -- splittingSimilarities = foldMap' $ Map.foldMapWithKey \simil _ -> Set.singleton simil
345 -- | The global quality of a `Phylomemy`.
347 -- DescriptionNote: `globalQuality` counts the `(range, Cluster)` nodes of a branch
348 -- but that a `(range, Cluster)` in itself can gather several documents.
352 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
353 roots ::: Set Root ->
354 branchToNodes ::: [Set (range, Cluster)] ->
355 Decimal.Arith Similarity
356 globalQuality (Named predictionMeasure) (Named roots) (Named branchToNodes) =
358 [ probability (1 % cardinal roots)
360 [ probability (cardinal retrievedNodes % nodesTotal)
361 * predictionMeasure relevantNodes retrievedNodes
362 | retrievedNodes <- branches
364 | root <- roots & toList
365 , -- CorrectnessNote: ignore branches not containing any requested `root`
366 let branches = branchToNodes & List.filter (any (\(_r, c) -> Set.member root c))
367 , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral
368 , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c))
371 -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
372 -- prédictions de la classe pertinente (bonne précision)
373 -- en suffisamment grand nombre (bon rappel)
374 -- sur un jeu de données cible.
376 -- Tout comme la précision et le rappel,
377 -- la F-mesure varie de 0 (plus mauvaise valeur)
378 -- à 1 (meilleure valeur possible).
380 -- ExplanationNote: https://en.wikipedia.org/wiki/F-score
381 predictionMeasureF ::
386 -- | Trade-off between `precision` and `recall`.
387 -- See https://en.wikipedia.org/wiki/Precision_and_recall
389 -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
390 -- > (i.e., one large branch) or an archipelago of elements of knowledge
391 -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
392 -- > discretion in light of her own expertise and research questions, which makes
393 -- > any phylomemy an artifact of the researcher’s perception
395 -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
397 Set (range, Cluster) ->
398 Set (range, Cluster) ->
399 Decimal.Arith Similarity
400 predictionMeasureF lambda relevantNodes retrievedNodes
401 | lambda == proba0 = probability precision
402 | lambda == proba1 = probability recall
403 | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision)
405 precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes
406 recall = cardinal relevantRetrievedNodes % cardinal relevantNodes
407 relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes
408 lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1
409 -- ExplanationNote: the `tan` is just to spread `lambda`
410 -- Two commonly used values for β are:
411 -- - 2, which weighs recall higher than precision,
412 -- - and 0.5, which weighs recall lower than precision.
413 beta = tan (lambdaDouble * pi / 2)
414 betaSquare :: Rational
415 betaSquare = beta ^ (2 :: Int) & toRational