1 {-# LANGUAGE OverloadedStrings #-}
3 module Phylomemy.TemporalMatching where
5 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
6 import Clustering.UnionFind.ST qualified as UF
7 import Control.Monad (Monad (..), foldM, forM_, mapM_, unless, when, zipWithM_)
8 import Control.Monad.ST qualified as ST
9 import Control.Monad.Trans.Class qualified as MT
10 import Control.Monad.Trans.Reader qualified as MT
11 import Control.Monad.Trans.Writer.CPS qualified as MT
12 import Data.Bool (Bool (..), otherwise, (||))
13 import Data.ByteString.Builder qualified as BS
14 import Data.ByteString.Short qualified as BSh
15 import Data.Either (either)
16 import Data.Eq qualified as Eq
17 import Data.Foldable (any, foldMap, toList)
18 import Data.Function (id, ($), (&), (.))
19 import Data.Functor ((<$>), (<&>))
20 import Data.List qualified as List
21 import Data.Map.Strict qualified as Map
22 import Data.Maybe (Maybe (..), fromJust, maybe)
23 import Data.Monoid (Monoid (..))
24 import Data.Ord (Ord (..))
25 import Data.Ord qualified as Ord
26 import Data.Ratio ((%))
27 import Data.Scientific (toBoundedRealFloat)
28 import Data.Semigroup (Semigroup (..))
29 import Data.Sequence qualified as Seq
31 import Data.Set qualified as Set
32 import Data.String (IsString (..), String)
33 import Data.Text.Short qualified as TS
35 import Numeric (showFFloat)
36 import Numeric.Decimal (MonadThrow (..))
37 import Numeric.Decimal qualified as Decimal
38 import Numeric.Probability
39 import Phylomemy.Indexation
40 import Text.Show (Show (..))
41 import Prelude (Double, Num (..), mod, fromIntegral, fromRational, pi, tan, toRational, undefined, (/))
44 -- import Data.DisjointMap qualified as DisMap
46 type MST range = Probability :-> (range, Cluster) :-> Seq.Seq (range, Cluster)
48 -- | Kruskal algorithm to find the maximal spanning trees.
50 -- We know in advance the edges which will be deleted,
51 -- they go from the lower weights to the higher weights
52 -- (tie breaking with an order on the vertices)
53 -- hence we can compute a maximal spanning tree (MST) wrt. weights
54 -- such that the next weight causing a split
55 -- is the minimal weight inside this MST.
60 -- range :-> Cluster :-> Probability :-> Seq.Seq (range, Cluster) ->
61 -- Phy branches, each as a maximal spanning tree
62 ((range, Cluster) :-> MST range)
63 branchesMSF phy = ST.runST do
64 -- Create a Point for all given nodes
65 rangeToClusterToPoint :: (range :-> Cluster :-> UF.Point s ((range, Cluster), MST range)) <-
66 (`Map.traverseWithKey` phy) \range ->
67 Map.traverseWithKey \cluster _weightToEdges ->
68 UF.fresh ((range, cluster), Map.empty)
69 -- Iterate from the heavier to the lighter edge
70 forM_ (weightToEdges & Map.toDescList) \(weight, links) -> do
71 -- Iterate through all the links of that weight
72 forM_ (links & toList) \(src@(srcR, srcC), dst@(dstR, dstC)) -> do
73 let srcP = rangeToClusterToPoint Map.! srcR Map.! srcC
74 let dstP = rangeToClusterToPoint Map.! dstR Map.! dstC
75 alreadyInSameMST <- UF.equivalent srcP dstP
76 unless alreadyInSameMST do
77 -- This edge (src -> dst) belongs to an MST
78 UF.union' srcP dstP \(srcD, srcB) (dstD, dstB) -> do
79 let root = min srcD dstD
82 Map.insertWith (Map.unionWith (<>)) weight (Map.singleton src (Seq.singleton dst)) $
83 Map.unionWith (Map.unionWith (<>)) srcB dstB
85 [UF.Point s ((range, Cluster), MST range)] =
88 & List.concatMap Map.elems
89 rootTrees :: [((range, Cluster), MST range)] <-
92 UF.redundant p >>= \case
94 False -> UF.descriptor p <&> (: acc)
98 return $ Map.fromList rootTrees
100 weightToEdges :: Probability :-> Seq.Seq ((range, Cluster), (range, Cluster))
104 [ Map.map (\dst -> (\dstC -> (src, (dstR, dstC))) <$> dst) dstWC
105 | (srcR, srcCRWC) <- phy & Map.toList
106 , (srcC, dstRWC) <- srcCRWC & Map.toList
107 , let src = (srcR, srcC)
108 , (dstR, dstWC) <- dstRWC & Map.toList
111 nodeBranchIndex :: Eq.Eq range => ((range, Cluster) :-> MST range) -> (range, Cluster) -> Maybe Int
112 nodeBranchIndex rootToMST n =
118 any ( \(src, dsts) -> src Eq.== n || any (Eq.== n) dsts)
122 (rootToMST & Map.toList)
125 -- TODO: order clusters of a range by their similarity
126 -- An On-Line Edge-Deletion Problem
127 -- https://dl.acm.org/doi/pdf/10.1145/322234.322235
128 -- type Branches range = DisSet.DisjointSet (range, Cluster)
129 type Phylomemy range = range :-> Cluster :-> range :-> Probability :-> Seq.Seq Cluster
131 -- TODO: remove Transaction wrapping in favor of type class giving `itemSet`
135 {-similarityMeasure :::-} (Cluster -> Cluster -> Probability) ->
136 range :-> Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos)) ->
137 -- The second range is greater than the first.
139 phylomemy similarity rangeToClusterToDocs =
140 (`Map.mapWithKey` rangeToClusterToDocs) \range clusterToDocs ->
141 (`Map.mapWithKey` clusterToDocs) \cluster _txs ->
142 let (_, rangeToClusterToDocsGT) = Map.split range rangeToClusterToDocs
143 in (`Map.mapMaybeWithKey` rangeToClusterToDocsGT) \_rangeGT clusterToDocsGT ->
147 [ (weight, Seq.singleton clusterGT)
148 | clusterGT <- clusterToDocsGT & Map.keys
149 , let weight = similarity cluster clusterGT
152 in if Map.null childWC
156 type PhylomemyDownstream range =
163 :-> Seq.Seq (range, Cluster)
164 phylomemyDownstream ::
166 (Cluster -> Cluster -> Probability) ->
167 range :-> Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos)) ->
168 PhylomemyDownstream range
169 phylomemyDownstream similarity rangeToClusterToDocs =
170 (`Map.mapWithKey` rangeToClusterToDocs) \range clusterToDocs ->
171 (`Map.mapWithKey` clusterToDocs) \cluster _txs ->
173 let (_, rangeToClusterToDocsGT) = Map.split range rangeToClusterToDocs
176 [ (similarity cluster clusterGT, Seq.singleton (rangeGT, clusterGT))
177 | (rangeGT, clusterToDocsGT) <- rangeToClusterToDocsGT & Map.toList
178 , clusterGT <- clusterToDocsGT & Map.keys
182 -- `phylomemyWeights` phy
184 -- returns the set of weights used by the given `Phylomemy`.
185 phylomemyWeights :: Phylomemy range -> Set Probability
187 Map.foldMapWithKey \_ ->
188 Map.foldMapWithKey \_ ->
189 Map.foldMapWithKey \_ ->
190 Set.fromList . Map.keys
193 -- `phylomemyRaise` minWeight phy
195 -- TODO: remove clusters not linked to
196 phylomemyRaise :: Probability -> Phylomemy range -> Phylomemy range
197 phylomemyRaise minWeight =
198 Map.map $ Map.map $ Map.mapMaybe $ (deleteEmptyMap .) $ \childWC ->
199 let (_lt, eqM, gt) = Map.splitLookup minWeight childWC
200 in maybe gt (\eq -> Map.insert minWeight eq gt) eqM
203 | Map.null m = Nothing
207 -- `phylomemyDOT` phy
209 -- returns a graph of the given `Phylomemy` in [DOT](https://graphviz.org/doc/info/lang.html) format.
210 phylomemyDOT :: Ord range => Show range => Phylomemy range -> BS.Builder
211 phylomemyDOT parentRCchildRWC = run do
212 let rootToMST = branchesMSF parentRCchildRWC
213 let branchesNum = Map.size rootToMST
214 comments ["Num of branches: " <> fromString (show branchesNum)]
217 line "splines=\"ortho\""
218 -- line "splines=false"
220 (parentRCchildRWC & Map.toList)
221 \(parentR, parentCchildRWC) parentRI -> do
222 let parentRB = "r" <> BS.intDec parentRI
223 line $ "subgraph cluster_" <> parentRB
225 comments ["Create a node for the range " <> parentRB]
229 , ("label", builderQuotedString (show parentR))
231 , ("style", "filled")
232 , ("fillcolor", "gray")
237 comments ["Create cluster nodes within the range " <> parentRB]
239 (parentCchildRWC & Map.toList)
240 \(parentC, _childRWC) parentCI -> do
241 let parentCL = parentC & Set.toList <&> unNgram . rootLabel
242 let parentBI = fromJust $ nodeBranchIndex rootToMST (parentR, parentC)
244 (parentRB <> "c" <> BS.intDec parentCI)
245 [ ("label", builderQuotedString (mconcat (List.intersperse ", " (parentCL <&> TS.unpack))))
246 , -- , ("pos", BS.intDec parentCI <> "," <> BS.intDec parentRI <> "!"
248 , ("fillcolor", builderQuotedString (show ((parentBI + 1) `mod` 12)))
249 , ("colorscheme", "paired12")
251 comments ["Horizontally align nodes within the same range"]
252 when (1 < Map.size parentCchildRWC) do
254 [parentRB, parentRB <> "c" <> BS.intDec 1]
256 forM_ (List.zip [1 .. Map.size parentCchildRWC - 1] [2 .. Map.size parentCchildRWC]) \(i, j) ->
258 [ parentRB <> "c" <> BS.intDec i
259 , parentRB <> "c" <> BS.intDec j
264 -- let parentRCAlignment =
265 -- List.intersperse " -> " $
266 -- "r" <> BS.intDec parentRI :
267 -- [ "r" <> BS.intDec parentRI <> "c" <> BS.intDec parentCI <> ":e -> " <> ""
268 -- | (_parentC, _childRWC) <- parentCchildRWC & Map.toList -- TODO: use Map.keys
269 -- | parentCI <- [1 ..]
271 -- when (1 < List.length parentRCAlignment) do
272 -- line $ mconcat parentRCAlignment <> "[weight=10,style=dotted]"
274 [ "Create edges from clusters of the range " <> parentRB
275 , "to clusters within subsequent ranges"
278 (parentCchildRWC & Map.toList) -- TODO: use Map.keys
279 \(_parentC, childRWC) parentCI -> do
280 -- TODO: se what to do here, using lookupMin, or not
281 case Map.lookupMin childRWC of
283 Just (childR, childWC) -> do
284 let childRI = 1 + Map.findIndex childR parentRCchildRWC
285 forM_ (childWC & Map.toList) \(weight, childCs) -> do
286 forM_ childCs \childC -> do
287 let childCI = 1 + Map.findIndex childC (parentRCchildRWC Map.! childR)
288 let parentRCB = mconcat [parentRB, "c", BS.intDec parentCI]
289 let childRCB = mconcat ["r", BS.intDec childRI, "c", BS.intDec childCI]
290 let labelB = builderQuotedString (showFFloat @Double (Just 2) (fromRational (runProbability weight)) "")
291 let weightB = BS.doubleDec (fromRational (runProbability weight))
293 [parentRCB, childRCB]
294 [ ("weight", weightB)
296 , ("fontcolor", "gray60")
297 , ("constraint", "false")
299 comments ["Vertically align range nodes"]
301 [ "r" <> BS.intDec parentRI
302 | parentRI <- [1 .. Map.size parentRCchildRWC]
304 when (1 < List.length rangeLinks) do
305 edges rangeLinks [("weight", "10"), ("style", "invis")]
307 run :: MT.ReaderT BSh.ShortByteString (MT.Writer BS.Builder) () -> BS.Builder
308 run = MT.execWriter . (`MT.runReaderT` {-indent-} "")
311 () <- MT.withReaderT (" " <>) s
315 MT.lift $ MT.tell (BS.shortByteString indent <> s <> "\n")
316 indexFrom1M_ xs f = zipWithM_ f xs [1 ..]
317 comments = mapM_ \c -> line $ "// " <> c
318 edges :: [BS.Builder] -> [(BS.Builder, BS.Builder)] -> MT.ReaderT BSh.ShortByteString (MT.Writer BS.Builder) ()
319 edges names as = line $ mconcat (List.intersperse " -> " names) <> attrs as
320 node name as = line $ name <> attrs as
323 | otherwise = "[" <> mconcat (List.intersperse "," [k <> "=" <> v | (k, v) <- as]) <> "]"
326 builderQuotedString :: String -> BS.Builder
327 builderQuotedString cs = BS.charUtf8 '"' <> foldMap escape cs <> BS.charUtf8 '"'
329 escape '\\' = BS.charUtf8 '\\' <> BS.charUtf8 '\\'
330 escape '\"' = BS.charUtf8 '\\' <> BS.charUtf8 '\"'
331 escape c = BS.charUtf8 c
333 -- newtype Transaction pos = Transaction (Document pos)
334 -- instance Eq (Transaction pos) where
335 -- Transaction x == Transaction y =
336 -- documentRoots x == documentRoots y
337 -- instance Ord (Transaction pos) where
338 -- Transaction x `compare` Transaction y =
339 -- documentRoots x `compare` documentRoots y
341 data Foliation = Foliation
345 data Group date = Group
346 { groupRoots :: Roots
347 , groupPeriod :: (date, date)
349 type Branch date = [Group date]
351 jaccardSimilarity :: Ord.Ord a => Set a -> Set a -> Probability
352 jaccardSimilarity x y =
355 fromIntegral (Set.size (Set.intersection x y))
356 % fromIntegral (Set.size (Set.union x y))
358 -- data FScore = FScoreAxiom
360 -- | https://en.wikipedia.org/wiki/F-score
361 -- L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
362 -- prédictions de la classe pertinente (bonne précision) en suffisamment grand
363 -- nombre (bon rappel) sur un jeu de données cible. Tout comme la précision et
364 -- le rappel, la F-mesure varie de 0 (plus mauvaise valeur) à 1 (meilleure valeur possible).
366 -- @(`fScore` lambda)@
367 -- determines whether a given branch should continue to be divided or not at
368 -- the current value of @(delta)@. To that end, the quality score is set up by a
369 -- parameter @(lambda)@.
373 -- | Trade-off between `precision` and `recall`.
374 -- See https://en.wikipedia.org/wiki/Precision_and_recall
376 -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
377 -- > (i.e., one large branch) or an archipelago of elements of knowledge
378 -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
379 -- > discretion in light of her own expertise and research questions, which makes
380 -- > any phylomemy an artifact of the researcher’s perception
382 -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
383 lambda ::: Probability ->
387 -- | the set of all the fields of the branch bk
388 bk ::: [Group date] ->
391 fScore (Named lambda) (Named x) periods (Named bk) bx
392 | lambda Eq.== proba0 = precision x periods bk
393 | lambda Eq.== proba1 = recall x bk bx
395 reca <- runProbability <$> recall x bk bx
396 prec <- runProbability <$> precision x periods bk
398 denom = reca + fl * fl * prec
399 l = either undefined id $ toBoundedRealFloat @Double $ Decimal.toScientificDecimal lambda
400 fl = toRational (tan (l * pi Prelude./ 2))
403 else probability $ ((1 + fl * fl) * prec * reca) Prelude./ denom
405 -- | @(`precision` x ps bk)@ returns the probability to observe @(x)@
406 -- by choosing at random a cluster in @(bk)@ within the periods @(ps)@.
407 -- precision = TP / (TP+FP)
408 -- Precision is the fraction of true positive examples among the examples that
409 -- the model classified as positive.
410 -- In other words, the number of true positives divided by
411 -- the number of false positives plus true positives.
412 precision :: Eq.Eq date => MonadThrow m => Root -> [(date, date)] -> Branch date -> m Probability
413 precision x periods bk =
415 fromIntegral (List.length (List.filter (\g -> List.elem x (groupRoots g)) bk'))
416 % fromIntegral (List.length bk')
418 bk' = List.filter (\g -> List.elem (groupPeriod g) periods) bk
421 -- The recall 𝜌𝑘𝑥 = | | 𝑘 of 𝐵𝑘 against 𝑥. It is related to the probability to be in 𝐵𝑘 when choosing a cluster 𝑥 about 𝑥 at random in 𝜙.
422 -- recall = TP / (TP+FN)
423 -- Recall, also known as sensitivity, is the fraction of examples classified as positive,
424 -- among the total number of positive examples.
425 -- In other words, the number of true positives divided by
426 -- the number of true positives plus false negatives.
427 recall :: MonadThrow m => Root -> Branch date -> [Branch date] -> m Probability
430 fromIntegral (List.length (List.filter (\g -> Set.member x (groupRoots g)) bk))
431 % fromIntegral (List.length (List.filter (\g -> Set.member x (groupRoots g)) (List.concat bx)))