src/Phylomemy/TemporalMatching.hs

   1 {-# LANGUAGE OverloadedStrings #-}
   2
   3 module Phylomemy.TemporalMatching where
   4
   5 import Clustering.FrequentItemSet.BruteForce qualified as Clustering
   6 import Clustering.UnionFind.ST qualified as UF
   7 import Control.Monad (Monad (..), foldM, forM_, mapM_, unless, when, zipWithM_)
   8 import Control.Monad.ST qualified as ST
   9 import Control.Monad.Trans.Class qualified as MT
  10 import Control.Monad.Trans.Reader qualified as MT
  11 import Control.Monad.Trans.Writer.CPS qualified as MT
  12 import Data.Bool (Bool (..), otherwise, (||))
  13 import Data.ByteString.Builder qualified as BS
  14 import Data.ByteString.Short qualified as BSh
  15 import Data.Either (either)
  16 import Data.Eq qualified as Eq
  17 import Data.Foldable (any, foldMap, toList)
  18 import Data.Function (id, ($), (&), (.))
  19 import Data.Functor ((<$>), (<&>))
  20 import Data.List qualified as List
  21 import Data.Map.Strict qualified as Map
  22 import Data.Maybe (Maybe (..), fromJust, maybe)
  23 import Data.Monoid (Monoid (..))
  24 import Data.Ord (Ord (..))
  25 import Data.Ord qualified as Ord
  26 import Data.Ratio ((%))
  27 import Data.Scientific (toBoundedRealFloat)
  28 import Data.Semigroup (Semigroup (..))
  29 import Data.Sequence qualified as Seq
  30 import Data.Set (Set)
  31 import Data.Set qualified as Set
  32 import Data.String (IsString (..), String)
  33 import Data.Text.Short qualified as TS
  34 import Logic
  35 import Numeric (showFFloat)
  36 import Numeric.Decimal (MonadThrow (..))
  37 import Numeric.Decimal qualified as Decimal
  38 import Numeric.Probability
  39 import Phylomemy.Indexation
  40 import Text.Show (Show (..))
  41 import Prelude (Double, Num (..), mod, fromIntegral, fromRational, pi, tan, toRational, undefined, (/))
  42 import Data.Int (Int)
  43
  44 -- import Data.DisjointMap qualified as DisMap
  45
  46 type MST range = Probability :-> (range, Cluster) :-> Seq.Seq (range, Cluster)
  47
  48 -- | Kruskal algorithm to find the maximal spanning trees.
  49 --
  50 -- We know in advance the edges which will be deleted,
  51 -- they go from the lower weights to the higher weights
  52 -- (tie breaking with an order on the vertices)
  53 -- hence we can compute a maximal spanning tree (MST) wrt. weights
  54 -- such that the next weight causing a split
  55 -- is the minimal weight inside this MST.
  56 branchesMSF ::
  57   forall range.
  58   Ord range =>
  59   Phylomemy range ->
  60   -- range :-> Cluster :-> Probability :-> Seq.Seq (range, Cluster) ->
  61   -- Phy branches, each as a maximal spanning tree
  62   ((range, Cluster) :-> MST range)
  63 branchesMSF phy = ST.runST do
  64   -- Create a Point for all given nodes
  65   rangeToClusterToPoint :: (range :-> Cluster :-> UF.Point s ((range, Cluster), MST range)) <-
  66     (`Map.traverseWithKey` phy) \range ->
  67       Map.traverseWithKey \cluster _weightToEdges ->
  68         UF.fresh ((range, cluster), Map.empty)
  69   -- Iterate from the heavier to the lighter edge
  70   forM_ (weightToEdges & Map.toDescList) \(weight, links) -> do
  71     -- Iterate through all the links of that weight
  72     forM_ (links & toList) \(src@(srcR, srcC), dst@(dstR, dstC)) -> do
  73       let srcP = rangeToClusterToPoint Map.! srcR Map.! srcC
  74       let dstP = rangeToClusterToPoint Map.! dstR Map.! dstC
  75       alreadyInSameMST <- UF.equivalent srcP dstP
  76       unless alreadyInSameMST do
  77         -- This edge (src -> dst) belongs to an MST
  78         UF.union' srcP dstP \(srcD, srcB) (dstD, dstB) -> do
  79           let root = min srcD dstD
  80           return $
  81             (root,) $
  82               Map.insertWith (Map.unionWith (<>)) weight (Map.singleton src (Seq.singleton dst)) $
  83                 Map.unionWith (Map.unionWith (<>)) srcB dstB
  84   let points ::
  85         [UF.Point s ((range, Cluster), MST range)] =
  86           rangeToClusterToPoint
  87             & Map.elems
  88             & List.concatMap Map.elems
  89   rootTrees :: [((range, Cluster), MST range)] <-
  90     foldM
  91       ( \acc p ->
  92           UF.redundant p >>= \case
  93             True -> return acc
  94             False -> UF.descriptor p <&> (: acc)
  95       )
  96       []
  97       points
  98   return $ Map.fromList rootTrees
  99   where
 100     weightToEdges :: Probability :-> Seq.Seq ((range, Cluster), (range, Cluster))
 101     weightToEdges =
 102       Map.unionsWith
 103         (<>)
 104         [ Map.map (\dst -> (\dstC -> (src, (dstR, dstC))) <$> dst) dstWC
 105         | (srcR, srcCRWC) <- phy & Map.toList
 106         , (srcC, dstRWC) <- srcCRWC & Map.toList
 107         , let src = (srcR, srcC)
 108         , (dstR, dstWC) <- dstRWC & Map.toList
 109         ]
 110
 111 nodeBranchIndex :: Eq.Eq range => ((range, Cluster) :-> MST range) -> (range, Cluster) -> Maybe Int
 112 nodeBranchIndex rootToMST n =
 113   List.findIndex
 114     (\ (root, mst) ->
 115       root Eq.== n ||
 116       any
 117         ( \links ->
 118             any ( \(src, dsts) -> src Eq.== n || any (Eq.== n) dsts)
 119               (links & Map.toList)
 120         ) mst
 121     )
 122     (rootToMST & Map.toList)
 123
 124 -- TODO: branches
 125 -- TODO: order clusters of a range by their similarity
 126 -- An On-Line Edge-Deletion Problem
 127 -- https://dl.acm.org/doi/pdf/10.1145/322234.322235
 128 -- type Branches range = DisSet.DisjointSet (range, Cluster)
 129 type Phylomemy range = range :-> Cluster :-> range :-> Probability :-> Seq.Seq Cluster
 130
 131 -- TODO: remove Transaction wrapping in favor of type class giving `itemSet`
 132 -- TODO: threshold
 133 phylomemy ::
 134   Ord range =>
 135   {-similarityMeasure :::-} (Cluster -> Cluster -> Probability) ->
 136   range :-> Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos)) ->
 137   -- The second range is greater than the first.
 138   Phylomemy range
 139 phylomemy similarity rangeToClusterToDocs =
 140   (`Map.mapWithKey` rangeToClusterToDocs) \range clusterToDocs ->
 141     (`Map.mapWithKey` clusterToDocs) \cluster _txs ->
 142       let (_, rangeToClusterToDocsGT) = Map.split range rangeToClusterToDocs
 143       in (`Map.mapMaybeWithKey` rangeToClusterToDocsGT) \_rangeGT clusterToDocsGT ->
 144           let childWC =
 145                 Map.fromListWith
 146                   (<>)
 147                   [ (weight, Seq.singleton clusterGT)
 148                   | clusterGT <- clusterToDocsGT & Map.keys
 149                   , let weight = similarity cluster clusterGT
 150                   , proba0 < weight
 151                   ]
 152           in if Map.null childWC
 153               then Nothing
 154               else Just childWC
 155
 156 type PhylomemyDownstream range =
 157   -- everything
 158   range
 159     :-> Cluster
 160     :->
 161     -- children
 162     Probability
 163     :-> Seq.Seq (range, Cluster)
 164 phylomemyDownstream ::
 165   Ord range =>
 166   (Cluster -> Cluster -> Probability) ->
 167   range :-> Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos)) ->
 168   PhylomemyDownstream range
 169 phylomemyDownstream similarity rangeToClusterToDocs =
 170   (`Map.mapWithKey` rangeToClusterToDocs) \range clusterToDocs ->
 171     (`Map.mapWithKey` clusterToDocs) \cluster _txs ->
 172       -- TODO: avoid split
 173       let (_, rangeToClusterToDocsGT) = Map.split range rangeToClusterToDocs
 174       in Map.fromListWith
 175           (<>)
 176           [ (similarity cluster clusterGT, Seq.singleton (rangeGT, clusterGT))
 177           | (rangeGT, clusterToDocsGT) <- rangeToClusterToDocsGT & Map.toList
 178           , clusterGT <- clusterToDocsGT & Map.keys
 179           ]
 180
 181 -- | @
 182 -- `phylomemyWeights` phy
 183 -- @
 184 --  returns the set of weights used by the given `Phylomemy`.
 185 phylomemyWeights :: Phylomemy range -> Set Probability
 186 phylomemyWeights =
 187   Map.foldMapWithKey \_ ->
 188     Map.foldMapWithKey \_ ->
 189       Map.foldMapWithKey \_ ->
 190         Set.fromList . Map.keys
 191
 192 -- | @
 193 -- `phylomemyRaise` minWeight phy
 194 -- @
 195 -- TODO: remove clusters not linked to
 196 phylomemyRaise :: Probability -> Phylomemy range -> Phylomemy range
 197 phylomemyRaise minWeight =
 198   Map.map $ Map.map $ Map.mapMaybe $ (deleteEmptyMap .) $ \childWC ->
 199     let (_lt, eqM, gt) = Map.splitLookup minWeight childWC
 200     in maybe gt (\eq -> Map.insert minWeight eq gt) eqM
 201   where
 202     deleteEmptyMap m
 203       | Map.null m = Nothing
 204       | otherwise = Just m
 205
 206 -- | @
 207 -- `phylomemyDOT` phy
 208 -- @
 209 --  returns a graph of the given `Phylomemy` in [DOT](https://graphviz.org/doc/info/lang.html) format.
 210 phylomemyDOT :: Ord range => Show range => Phylomemy range -> BS.Builder
 211 phylomemyDOT parentRCchildRWC = run do
 212   let rootToMST = branchesMSF parentRCchildRWC
 213   let branchesNum = Map.size rootToMST
 214   comments ["Num of branches:  " <> fromString (show branchesNum)]
 215   line "digraph g"
 216   block do
 217     line "splines=\"ortho\""
 218     -- line "splines=false"
 219     indexFrom1M_
 220       (parentRCchildRWC & Map.toList)
 221       \(parentR, parentCchildRWC) parentRI -> do
 222         let parentRB = "r" <> BS.intDec parentRI
 223         line $ "subgraph cluster_" <> parentRB
 224         block do
 225           comments ["Create a node for the range " <> parentRB]
 226           node
 227             parentRB
 228             [ ("shape", "box")
 229             , ("label", builderQuotedString (show parentR))
 230             , ("color", "gray")
 231             , ("style", "filled")
 232             , ("fillcolor", "gray")
 233             ]
 234           line "color=gray"
 235           block do
 236             line "rank=same"
 237             comments ["Create cluster nodes within the range " <> parentRB]
 238             indexFrom1M_
 239               (parentCchildRWC & Map.toList)
 240               \(parentC, _childRWC) parentCI -> do
 241                 let parentCL = parentC & Set.toList <&> unNgram . rootLabel
 242                 let parentBI = fromJust $ nodeBranchIndex rootToMST (parentR, parentC)
 243                 node
 244                   (parentRB <> "c" <> BS.intDec parentCI)
 245                   [ ("label", builderQuotedString (mconcat (List.intersperse ", " (parentCL <&> TS.unpack))))
 246                   , -- , ("pos", BS.intDec parentCI <> "," <> BS.intDec parentRI <> "!"
 247                     ("style", "filled")
 248                   , ("fillcolor", builderQuotedString (show ((parentBI + 1) `mod` 12)))
 249                   , ("colorscheme", "paired12")
 250                   ]
 251             comments ["Horizontally align nodes within the same range"]
 252             when (1 < Map.size parentCchildRWC) do
 253               edges
 254                 [parentRB, parentRB <> "c" <> BS.intDec 1]
 255                 [("style", "invis")]
 256             forM_ (List.zip [1 .. Map.size parentCchildRWC - 1] [2 .. Map.size parentCchildRWC]) \(i, j) ->
 257               edges
 258                 [ parentRB <> "c" <> BS.intDec i
 259                 , parentRB <> "c" <> BS.intDec j
 260                 ]
 261                 [ ("weight", "10")
 262                 , ("style", "invis")
 263                 ]
 264         -- let parentRCAlignment =
 265         --      List.intersperse " -> " $
 266         --        "r" <> BS.intDec parentRI :
 267         --        [ "r" <> BS.intDec parentRI <> "c" <> BS.intDec parentCI <> ":e -> " <> ""
 268         --        | (_parentC, _childRWC) <- parentCchildRWC & Map.toList -- TODO: use Map.keys
 269         --        | parentCI <- [1 ..]
 270         --        ]
 271         -- when (1 < List.length parentRCAlignment) do
 272         --  line $ mconcat parentRCAlignment <> "[weight=10,style=dotted]"
 273         comments
 274           [ "Create edges from clusters of the range " <> parentRB
 275           , "to clusters within subsequent ranges"
 276           ]
 277         indexFrom1M_
 278           (parentCchildRWC & Map.toList) -- TODO: use Map.keys
 279           \(_parentC, childRWC) parentCI -> do
 280             -- TODO: se what to do here, using lookupMin, or not
 281             case Map.lookupMin childRWC of
 282               Nothing -> return ()
 283               Just (childR, childWC) -> do
 284                 let childRI = 1 + Map.findIndex childR parentRCchildRWC
 285                 forM_ (childWC & Map.toList) \(weight, childCs) -> do
 286                   forM_ childCs \childC -> do
 287                     let childCI = 1 + Map.findIndex childC (parentRCchildRWC Map.! childR)
 288                     let parentRCB = mconcat [parentRB, "c", BS.intDec parentCI]
 289                     let childRCB = mconcat ["r", BS.intDec childRI, "c", BS.intDec childCI]
 290                     let labelB = builderQuotedString (showFFloat @Double (Just 2) (fromRational (runProbability weight)) "")
 291                     let weightB = BS.doubleDec (fromRational (runProbability weight))
 292                     edges
 293                       [parentRCB, childRCB]
 294                       [ ("weight", weightB)
 295                       , ("label", labelB)
 296                       , ("fontcolor", "gray60")
 297                       , ("constraint", "false")
 298                       ]
 299     comments ["Vertically align range nodes"]
 300     let rangeLinks =
 301           [ "r" <> BS.intDec parentRI
 302           | parentRI <- [1 .. Map.size parentRCchildRWC]
 303           ]
 304     when (1 < List.length rangeLinks) do
 305       edges rangeLinks [("weight", "10"), ("style", "invis")]
 306   where
 307     run :: MT.ReaderT BSh.ShortByteString (MT.Writer BS.Builder) () -> BS.Builder
 308     run = MT.execWriter . (`MT.runReaderT` {-indent-} "")
 309     block s = do
 310       line "{"
 311       () <- MT.withReaderT ("  " <>) s
 312       line "}"
 313     line s = do
 314       indent <- MT.ask
 315       MT.lift $ MT.tell (BS.shortByteString indent <> s <> "\n")
 316     indexFrom1M_ xs f = zipWithM_ f xs [1 ..]
 317     comments = mapM_ \c -> line $ "// " <> c
 318     edges :: [BS.Builder] -> [(BS.Builder, BS.Builder)] -> MT.ReaderT BSh.ShortByteString (MT.Writer BS.Builder) ()
 319     edges names as = line $ mconcat (List.intersperse " -> " names) <> attrs as
 320     node name as = line $ name <> attrs as
 321     attrs as
 322       | List.null as = ""
 323       | otherwise = "[" <> mconcat (List.intersperse "," [k <> "=" <> v | (k, v) <- as]) <> "]"
 324
 325
 326 builderQuotedString :: String -> BS.Builder
 327 builderQuotedString cs = BS.charUtf8 '"' <> foldMap escape cs <> BS.charUtf8 '"'
 328   where
 329     escape '\\' = BS.charUtf8 '\\' <> BS.charUtf8 '\\'
 330     escape '\"' = BS.charUtf8 '\\' <> BS.charUtf8 '\"'
 331     escape c = BS.charUtf8 c
 332
 333 -- newtype Transaction pos = Transaction (Document pos)
 334 -- instance Eq (Transaction pos) where
 335 --  Transaction x == Transaction y =
 336 --    documentRoots x == documentRoots y
 337 -- instance Ord (Transaction pos) where
 338 --  Transaction x `compare` Transaction y =
 339 --    documentRoots x `compare` documentRoots y
 340
 341 data Foliation = Foliation
 342   {
 343   }
 344
 345 data Group date = Group
 346   { groupRoots :: Roots
 347   , groupPeriod :: (date, date)
 348   }
 349 type Branch date = [Group date]
 350
 351 jaccardSimilarity :: Ord.Ord a => Set a -> Set a -> Probability
 352 jaccardSimilarity x y =
 353   fromJust $
 354     probability $
 355       fromIntegral (Set.size (Set.intersection x y))
 356         % fromIntegral (Set.size (Set.union x y))
 357
 358 -- data FScore = FScoreAxiom
 359
 360 -- | https://en.wikipedia.org/wiki/F-score
 361 -- L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes
 362 -- prédictions de la classe pertinente (bonne précision) en suffisamment grand
 363 -- nombre (bon rappel) sur un jeu de données cible. Tout comme la précision et
 364 -- le rappel, la F-mesure varie de 0 (plus mauvaise valeur) à 1 (meilleure valeur possible).
 365 --
 366 -- @(`fScore` lambda)@
 367 -- determines whether a given branch should continue to be divided or not at
 368 -- the current value of @(delta)@. To that end, the quality score is set up by a
 369 -- parameter @(lambda)@.
 370 fScore ::
 371   Eq.Eq date =>
 372   MonadThrow m =>
 373   -- | Trade-off between `precision` and `recall`.
 374   -- See https://en.wikipedia.org/wiki/Precision_and_recall
 375   --
 376   -- > [It] predetermine[s] the desired shape of the phylomemy: a continent
 377   -- > (i.e., one large branch) or an archipelago of elements of knowledge
 378   -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s
 379   -- > discretion in light of her own expertise and research questions, which makes
 380   -- > any phylomemy an artifact of the researcher’s perception
 381   --
 382   -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts.
 383   lambda ::: Probability ->
 384   x ::: Root ->
 385   -- | periods
 386   [(date, date)] ->
 387   -- | the set of all the fields of the branch bk
 388   bk ::: [Group date] ->
 389   [[Group date]] ->
 390   m Probability
 391 fScore (Named lambda) (Named x) periods (Named bk) bx
 392   | lambda Eq.== proba0 = precision x periods bk
 393   | lambda Eq.== proba1 = recall x bk bx
 394   | otherwise = do
 395       reca <- runProbability <$> recall x bk bx
 396       prec <- runProbability <$> precision x periods bk
 397       let
 398         denom = reca + fl * fl * prec
 399         l = either undefined id $ toBoundedRealFloat @Double $ Decimal.toScientificDecimal lambda
 400         fl = toRational (tan (l * pi Prelude./ 2))
 401       if denom Eq.== 0
 402         then return proba0
 403         else probability $ ((1 + fl * fl) * prec * reca) Prelude./ denom
 404
 405 -- | @(`precision` x ps bk)@ returns the probability to observe @(x)@
 406 -- by choosing at random a cluster in @(bk)@ within the periods @(ps)@.
 407 -- precision = TP / (TP+FP)
 408 -- Precision is the fraction of true positive examples among the examples that
 409 -- the model classified as positive.
 410 -- In other words, the number of true positives divided by
 411 -- the number of false positives plus true positives.
 412 precision :: Eq.Eq date => MonadThrow m => Root -> [(date, date)] -> Branch date -> m Probability
 413 precision x periods bk =
 414   probability $
 415     fromIntegral (List.length (List.filter (\g -> List.elem x (groupRoots g)) bk'))
 416       % fromIntegral (List.length bk')
 417   where
 418     bk' = List.filter (\g -> List.elem (groupPeriod g) periods) bk
 419
 420 -- | sensitivity.
 421 -- The recall 𝜌𝑘𝑥 = | | 𝑘 of 𝐵𝑘 against 𝑥. It is related to the probability to be in 𝐵𝑘 when choosing a cluster 𝑥 about 𝑥 at random in 𝜙.
 422 -- recall = TP / (TP+FN)
 423 -- Recall, also known as sensitivity, is the fraction of examples classified as positive,
 424 -- among the total number of positive examples.
 425 -- In other words, the number of true positives divided by
 426 -- the number of true positives plus false negatives.
 427 recall :: MonadThrow m => Root -> Branch date -> [Branch date] -> m Probability
 428 recall x bk bx =
 429   probability $
 430     fromIntegral (List.length (List.filter (\g -> Set.member x (groupRoots g)) bk))
 431       % fromIntegral (List.length (List.filter (\g -> Set.member x (groupRoots g)) (List.concat bx)))