]> Git — Sourcephile - literate-phylomemy.git/blob - src/Phylomemy/TemporalMatching.hs
init
[literate-phylomemy.git] / src / Phylomemy / TemporalMatching.hs
1 {-# LANGUAGE DeriveFunctor #-}
2 {-# LANGUAGE RankNTypes #-}
3
4 module Phylomemy.TemporalMatching where
5
6 -- import Data.Traversable (traverse)
7 -- import Debug.Pretty.Simple (pTraceShow, pTraceShowId)
8 import Control.Monad (Monad (..), foldM, forM_, unless)
9 import Control.Monad.ST qualified as ST
10 import Data.Bool (otherwise)
11 import Data.Either (fromLeft)
12 import Data.Eq (Eq (..))
13 import Data.Foldable (any, foldMap', toList)
14 import Data.Function (($), (&), (.))
15 import Data.Functor (Functor (..), (<$>), (<&>))
16 import Data.Int (Int)
17 import Data.List qualified as List
18 import Data.Map.Strict qualified as Map
19 import Data.Maybe (Maybe (..))
20 import Data.Ord (Ord (..))
21 import Data.Ord qualified as Ord
22 import Data.Ratio (Rational, (%))
23 import Data.Scientific (toBoundedRealFloat)
24 import Data.Semigroup (Semigroup (..))
25 import Data.Sequence (Seq)
26 import Data.Sequence qualified as Seq
27 import Data.Set (Set)
28 import Data.Set qualified as Set
29 import Data.Tree qualified as Tree
30 import Data.Tuple (uncurry)
31 import GHC.Stack (HasCallStack)
32 import Logic hiding ((/))
33 import Numeric.Decimal qualified as Decimal
34 import Numeric.Probability
35 import Text.Show (Show)
36 import Prelude (Double, Num (..), fromIntegral, pi, tan, toRational, (/), (^))
37
38 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
39 import Clustering.UnionFind.ST qualified as UnionFind
40 import Phylomemy.Indexation
41
42 type Similarity = Probability
43
44 cardinal :: Num i => Set a -> i
45 cardinal = fromIntegral . Set.size
46
47 -- TODO: implement biased similarity from the paper
48 similarityJaccard :: Ord.Ord a => Set a -> Set a -> Similarity
49 similarityJaccard x y =
50 cardinal (Set.intersection x y) % cardinal (Set.union x y)
51 & probability
52 & Decimal.arithError
53
54 -- | `AllSimilarities` maps each @(range, cluster)@ node
55 -- to *all* the other @(range, cluster)@ nodes from a different @(range)@.
56 --
57 -- Note that the destination nodes are first ordered by their range
58 -- which enable to find the closest from the source's range.
59 type AllSimilarities range cluster =
60 {-source-} range :-> cluster :-> {-destination-} Direction (range :-> Similarity :-> Seq cluster)
61
62 -- | A phylomemy can be viewed in two directions.
63 data Direction a = Direction
64 { directionUpstream :: a
65 -- ^ Lookup for the closest similar enough destinations
66 -- in ranges lower than its source's range.
67 , directionDownstream :: a
68 -- ^ Lookup for the closest similar enough destinations
69 -- in ranges greater than its source's range.
70 }
71 deriving (Functor, Show)
72
73 -- | @(`allSimilarities` similarityMeasure rangeToClusterToDocuments)@
74 -- computes all the temporal similarities for the given documents.
75 allSimilarities ::
76 Ord range =>
77 {-similarityMeasure :::-} (cluster -> cluster -> Similarity) ->
78 {-clusters :::-} range :-> cluster :-> Seq (Clustering.Transaction Root (Document pos)) ->
79 AllSimilarities range cluster
80 allSimilarities similarity rangeToClusterToDocs =
81 rangeToClusterToDocs
82 & Map.mapWithKey \srcR ->
83 Map.mapWithKey \srcC _docs ->
84 let
85 (rangeToClusterToDocsLT, rangeToClusterToDocsGT) = Map.split srcR rangeToClusterToDocs
86 goStream = Map.mapMaybeWithKey \_rangeNE dstCD ->
87 deleteEmptyMap $
88 Map.fromListWith
89 (Seq.><)
90 [ (simil, Seq.singleton dstC)
91 | dstC <- dstCD & Map.keys
92 , let simil = similarity srcC dstC
93 , proba0 < simil
94 ]
95 in
96 Direction
97 { directionUpstream = goStream rangeToClusterToDocsLT
98 , directionDownstream = goStream rangeToClusterToDocsGT
99 }
100 where
101 deleteEmptyMap m
102 | Map.null m = Nothing
103 | otherwise = Just m
104
105 -- -- | @(`similarities` phy)@
106 -- -- returns the set of `Similarity`s used by the given `Phylomemy`.
107 -- similaritiesLevels :: AllSimilarities range cluster -> Set Similarity
108 -- similaritiesLevels =
109 -- Map.foldMapWithKey \_ ->
110 -- Map.foldMapWithKey \_ dir ->
111 -- Map.foldMapWithKey (\_ -> Set.fromList . Map.keys) (directionDownstream dir)
112
113 -- -- | @(`phylomemyRaise` minSimil phy)@
114 -- -- filter-in only edges whose similarity is greater or equal to the given @(minSimil)@.
115 -- phylomemyRaise :: Similarity -> AllSimilarities range cluster -> AllSimilarities range cluster
116 -- phylomemyRaise minSimil =
117 -- Map.map $
118 -- Map.map $
119 -- fmap @Direction $
120 -- Map.mapMaybe $
121 -- (deleteEmptyMap .) $ \childSC ->
122 -- let (_lt, eqM, gt) = Map.splitLookup minSimil childSC
123 -- in maybe gt (\eq -> Map.insert minSimil eq gt) eqM
124 -- where
125 -- deleteEmptyMap m
126 -- | Map.null m = Nothing
127 -- | otherwise = Just m
128
129 -- | A `MaximalSpanningTree` is one (possibly out of many)
130 -- maximal (similarity) spanning tree (MST),
131 -- mapping each source @(range, cluster)@ to many destination @(range, cluster)@,
132 -- from the lowest to the greatest `Similarity`.
133 --
134 -- ExplanationNote: https://en.wikipedia.org/wiki/Minimum_spanning_tree
135 --
136 -- Viewing a phylomemy as a MST is the crux of understanding how it is computed:
137 --
138 -- - it makes it easy to prune edges with low similarity
139 -- and know how many `MaximalSpanningTree`es this creates.
140 --
141 -- - it explains what the "scale of a phylomemy" is:
142 -- considering clusters of the same range and same MST as a single node.
143 --
144 -- The order in which edges may be deleted is known in advance (aka. "offline"):
145 -- it goes from the lowest similarity (0) to the highest similarity (1)
146 -- (tie breaking with an order on the @(range, cluster)@ nodes)
147 -- therefore computing a maximal spanning tree (MaximalSpanningTree) wrt. similarities
148 -- gives:
149 --
150 -- - the value of the next `Similarity` causing a split:
151 -- it is the minimal `Similarity` inside this `MaximalSpanningTree`.
152 --
153 -- - the resulting `MaximalSpanningTree`s after removing the edges
154 -- with that minimal `Similarity`.
155 --
156 -- TODO: "Inadequacies of Minimum Spanning Trees in Molecular Epidemiology"
157 -- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3187300/
158 --
159 -- TODO: "Divide-and-conquer based all spanning tree generation algorithm of a simple connected graph"
160 -- https://www.sciencedirect.com/science/article/abs/pii/S0304397521006952
161 --
162 -- WarningNote: "The tree of one percent"
163 -- https://link.springer.com/content/pdf/10.1186/gb-2006-7-10-118.pdf
164 type MaximalSpanningTree range cluster = Tree.Tree (MSTNode range cluster)
165
166 data MSTNode range cluster = MSTNode
167 { mstNodeSimilarity :: Similarity
168 , mstNodeRangeCluster :: (range, cluster)
169 -- , nodeMSTMinSimilarity :: Min Similarity
170 }
171 deriving (Show)
172
173 -- | A (disjoint) forest of `MaximalSpanningTree`s,
174 -- indexed by the minimal @(range, cluster)@ node
175 -- of each `MaximalSpanningTree`.
176 type MaximalSpanningTrees range cluster =
177 [MaximalSpanningTree range cluster]
178
179 -- | @(`maximalSpanningTrees` allSimils)@
180 -- uses the Kruskal algorithm to find the maximal spanning trees
181 -- of the given `AllSimilarities`.
182 --
183 -- ExplanationNote: https://en.wikipedia.org/wiki/Kruskal's_algorithm
184 maximalSpanningTrees ::
185 forall range cluster.
186 Ord range =>
187 Ord cluster =>
188 AllSimilarities range cluster ->
189 MaximalSpanningTrees range cluster
190 maximalSpanningTrees allSimils = ST.runST do
191 -- DescriptionNote: create a `Point` for each given node
192 -- each Point containing a root node and a maximal spanning tree.
193 rangeToClusterToPoint :: (range :-> cluster :-> UnionFind.Point s (MaximalSpanningTree range cluster)) <-
194 allSimils
195 & Map.traverseWithKey \srcR ->
196 Map.traverseWithKey \srcC _similToEdges ->
197 UnionFind.fresh $
198 Tree.Node
199 MSTNode
200 { mstNodeSimilarity = proba1
201 , mstNodeRangeCluster = (srcR, srcC)
202 }
203 []
204 -- DescriptionNote: iterate from the greatest to the lowest similarity edge.
205 forM_ (allEdges & Map.toDescList) \(simil, edges) -> do
206 -- Iterate through all the edges of that `Similarity`.
207 forM_ (edges & toList) \((srcR, srcC), (dstR, dstC)) -> do
208 let srcPoint = rangeToClusterToPoint Map.! srcR Map.! srcC
209 let dstPoint = rangeToClusterToPoint Map.! dstR Map.! dstC
210 alreadyInSameMST <- UnionFind.equivalent srcPoint dstPoint
211 unless alreadyInSameMST do
212 -- DescriptionNote: the nodes of this edge (src -> dst) belong to the same `MaximalSpanningTree`.
213 UnionFind.unionWith srcPoint dstPoint \(Tree.Node srcRoot srcForest) (Tree.Node dstRoot dstForest) ->
214 return $
215 Tree.Node srcRoot $
216 Tree.Node
217 MSTNode
218 { mstNodeSimilarity = simil
219 , mstNodeRangeCluster = mstNodeRangeCluster dstRoot
220 }
221 dstForest
222 : srcForest
223 rootTrees :: [MaximalSpanningTree range cluster] <-
224 rangeToClusterToPoint
225 -- DescriptionNote: collect all the Points.
226 & Map.elems
227 & List.concatMap Map.elems
228 -- DescriptionNote: keep only the `MaximalSpanningTree`s
229 -- contained in non-redundant `Point`s.
230 & (`foldM` []) \acc p -> do
231 isRedundant <- UnionFind.redundant p
232 if isRedundant
233 then return acc
234 else UnionFind.descriptor p <&> (: acc)
235 return rootTrees
236 where
237 -- Order `allSimils` by `Similarity`, ie.
238 -- from: {-source-} range :-> cluster :-> {-destination-} Direction (range :-> Similarity :-> Seq cluster)
239 -- to: Similarity :-> Seq ((range, cluster), (range, cluster))
240 -- which will enable to add edges to the spanning tree in decreasing `Similarity`
241 -- and hence build *a* maximal spanning tree.
242 allEdges :: Similarity :-> Seq ((range, cluster), (range, cluster))
243 allEdges =
244 Map.unionsWith
245 (Seq.><)
246 [ dstSCs & Map.map (fmap \dstC -> (src, (dstR, dstC)))
247 | (srcR, srcCRSC) <- allSimils & Map.toList
248 , (srcC, dstRSC) <- srcCRSC & Map.toList
249 , let src = (srcR, srcC)
250 , -- ExplanationNote: it does not matter whether `directionDownstream` or `directionUpstream` is used,
251 -- as they contain the very same edges and `Similarity`, just not in the same order,
252 -- hence using any `Direction` would produce the same splitting similarities.
253 (dstR, dstSCs) <- directionDownstream dstRSC & Map.toList
254 ]
255
256 -- | "sea-level rise algorithm"
257 --
258 -- See: in `Phylomemy.References.RefDrawMeScience`,
259 -- « C.5 The sea-level rise algorithm and its implementation in Gargantext »
260 splitMaximalSpanningTrees ::
261 HasCallStack =>
262 forall range roots predictionMeasure.
263 Show range =>
264 Ord range =>
265 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
266 roots ::: Set Root ->
267 MaximalSpanningTrees range Cluster ->
268 MaximalSpanningTrees range Cluster
269 splitMaximalSpanningTrees predictionMeasure roots =
270 loop (return proba0) []
271 where
272 loop previousQuality doneBranches currentBranches =
273 -- pTraceShow (["splitMaximalSpanningTrees", "loop"], ("previousQuality", previousQuality), ("doneBranches", List.length doneBranches), ("todoBranches", List.length currentBranches)) $
274 case currentBranches of
275 [] -> doneBranches
276 currentBranch : todoBranches ->
277 let splitBranches = splitMaximalSpanningTree currentBranch
278 in -- pTraceShow (["splitMaximalSpanningTrees", "loop", "splitBranches"], ("size", Map.size splitBranches)) $
279 if List.length splitBranches <= 1
280 then loop previousQuality (doneBranches <> splitBranches) todoBranches
281 else letName (fmap maximalSpanningTreeToNodes $ (doneBranches <> splitBranches) <> todoBranches) \branchToNodes ->
282 let splitQuality = globalQuality predictionMeasure roots branchToNodes
283 in if previousQuality < splitQuality
284 then loop splitQuality doneBranches (splitBranches <> todoBranches)
285 else loop previousQuality (doneBranches <> splitBranches) todoBranches
286
287 splitMaximalSpanningTree ::
288 forall range cluster.
289 HasCallStack =>
290 Show range =>
291 Show cluster =>
292 Ord range =>
293 Ord cluster =>
294 MaximalSpanningTree range cluster ->
295 MaximalSpanningTrees range cluster
296 splitMaximalSpanningTree mst =
297 case minimumSimilarity mst of
298 Nothing -> [mst]
299 Just minSimil -> cutMerge mst
300 where
301 cutMerge = uncurry (:) . cut
302 cut :: MaximalSpanningTree range cluster -> (MaximalSpanningTree range cluster, [MaximalSpanningTree range cluster])
303 cut (Tree.Node node children) =
304 let (keptChildren, cutChildren) =
305 children & List.partition \tree ->
306 minSimil < mstNodeSimilarity (Tree.rootLabel tree)
307 in let cutChildrenRoots = cutChildren & List.concatMap cutMerge
308 in let (keptChildrenTrees, cutChildrenTree) = List.unzip (cut <$> keptChildren)
309 in let cutChildrenTreeRoots = cutChildrenTree & List.concat & List.concatMap cutMerge
310 in ( Tree.Node node keptChildrenTrees
311 , (cutChildrenRoots List.++ cutChildrenTreeRoots)
312 -- WarningNote: the root nodes of those new `MaximalSpanningTree`s
313 -- keep their `mstNodeSimilarity` to their cutting value,
314 -- which is lower than their `minimumSimilarity`.
315 )
316
317 minimumSimilarity :: MaximalSpanningTree range cluster -> Maybe Similarity
318 minimumSimilarity (Tree.Node _rootNode rootBranches)
319 | List.null rootBranches = Nothing
320 | otherwise =
321 -- ExplanationNote: the root node of a `MaximalSpanningTree`,
322 -- being a root node, does not have parent,
323 -- hence its `mstNodeSimilarity` must be ignored.
324 Just $
325 List.minimum $
326 rootBranches
327 <&> Tree.foldTree \node accs ->
328 List.minimum $ (mstNodeSimilarity node) : accs
329
330 -- | @(`maximalSpanningTreeToNodes` branch)@ returns the nodes of the given @(branch)@.
331 maximalSpanningTreeToNodes ::
332 HasCallStack =>
333 Ord range =>
334 Ord cluster =>
335 MaximalSpanningTree range cluster ->
336 Set (range, cluster)
337 maximalSpanningTreeToNodes = foldMap' (Set.singleton . mstNodeRangeCluster)
338
339 -- | @(splittingSimilarities maxSpanTrees)@
340 -- returns the `Similarity`s causing the given `MaximalSpanningTrees`
341 -- to split into further more `MaximalSpanningTrees`.
342 -- splittingSimilarities :: HasCallStack => MaximalSpanningTrees range cluster -> Set Similarity
343 -- splittingSimilarities = foldMap' $ Map.foldMapWithKey \simil _ -> Set.singleton simil
344
345 -- | The global quality of a `Phylomemy`.
346 --
347 -- DescriptionNote: `globalQuality` counts the `(range, Cluster)` nodes of a branch
348 -- but that a `(range, Cluster)` in itself can gather several documents.
349 globalQuality ::
350 HasCallStack =>
351 Ord range =>
352 predictionMeasure ::: (Set (range, Cluster) -> Set (range, Cluster) -> Decimal.Arith Similarity) ->
353 roots ::: Set Root ->
354 branchToNodes ::: [Set (range, Cluster)] ->
355 Decimal.Arith Similarity
356 globalQuality (Named predictionMeasure) (Named roots) (Named branchToNodes) =
357 List.sum
358 [ probability (1 % cardinal roots)
359 * List.sum
360 [ probability (cardinal retrievedNodes % nodesTotal)
361 * predictionMeasure relevantNodes retrievedNodes
362 | retrievedNodes <- branches
363 ]
364 | root <- roots & toList
365 , -- CorrectnessNote: ignore branches not containing any requested `root`
366 let branches = branchToNodes & List.filter (any (\(_r, c) -> Set.member root c))
367 , let nodesTotal = branches & List.map (cardinal @Int) & List.sum & fromIntegral
368 , let relevantNodes = branches & foldMap' (Set.filter (\(_r, c) -> Set.member root c))
369 ]
370
371 -- | L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
372 -- prédictions de la classe pertinente (bonne précision)
373 -- en suffisamment grand nombre (bon rappel)
374 -- sur un jeu de données cible.
375 --
376 -- Tout comme la précision et le rappel,
377 -- la F-mesure varie de 0 (plus mauvaise valeur)
378 -- à 1 (meilleure valeur possible).
379 --
380 -- ExplanationNote: https://en.wikipedia.org/wiki/F-score
381 predictionMeasureF ::
382 HasCallStack =>
383 Ord range =>
384 {-lambda :::-}
385
386 -- | Trade-off between `precision` and `recall`.
387 -- See https://en.wikipedia.org/wiki/Precision_and_recall
388 --
389 -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
390 -- > (i.e., one large branch) or an archipelago of elements of knowledge
391 -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
392 -- > discretion in light of her own expertise and research questions, which makes
393 -- > any phylomemy an artifact of the researcher’s perception
394 --
395 -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
396 Similarity ->
397 Set (range, Cluster) ->
398 Set (range, Cluster) ->
399 Decimal.Arith Similarity
400 predictionMeasureF lambda relevantNodes retrievedNodes
401 | lambda == proba0 = probability precision
402 | lambda == proba1 = probability recall
403 | otherwise = probability $ precision * recall * (1 + betaSquare) / (recall + betaSquare * precision)
404 where
405 precision = cardinal relevantRetrievedNodes % cardinal retrievedNodes
406 recall = cardinal relevantRetrievedNodes % cardinal relevantNodes
407 relevantRetrievedNodes = Set.intersection relevantNodes retrievedNodes
408 lambdaDouble = lambda & Decimal.toScientificDecimal & toBoundedRealFloat @Double & fromLeft 1
409 -- ExplanationNote: the `tan` is just to spread `lambda`
410 -- Two commonly used values for β are:
411 -- - 2, which weighs recall higher than precision,
412 -- - and 0.5, which weighs recall lower than precision.
413 beta = tan (lambdaDouble * pi / 2)
414 betaSquare :: Rational
415 betaSquare = beta ^ (2 :: Int) & toRational