{-# LANGUAGE OverloadedStrings #-} module Phylomemy.TemporalMatching where import Clustering.FrequentItemSet.BruteForce qualified as Clustering import Clustering.UnionFind.ST qualified as UF import Control.Monad (Monad (..), foldM, forM_, mapM_, unless, when, zipWithM_) import Control.Monad.ST qualified as ST import Control.Monad.Trans.Class qualified as MT import Control.Monad.Trans.Reader qualified as MT import Control.Monad.Trans.Writer.CPS qualified as MT import Data.Bool (Bool (..), otherwise, (||)) import Data.ByteString.Builder qualified as BS import Data.ByteString.Short qualified as BSh import Data.Either (either) import Data.Eq qualified as Eq import Data.Foldable (any, foldMap, toList) import Data.Function (id, ($), (&), (.)) import Data.Functor ((<$>), (<&>)) import Data.List qualified as List import Data.Map.Strict qualified as Map import Data.Maybe (Maybe (..), fromJust, maybe) import Data.Monoid (Monoid (..)) import Data.Ord (Ord (..)) import Data.Ord qualified as Ord import Data.Ratio ((%)) import Data.Scientific (toBoundedRealFloat) import Data.Semigroup (Semigroup (..)) import Data.Sequence qualified as Seq import Data.Set (Set) import Data.Set qualified as Set import Data.String (IsString (..), String) import Data.Text.Short qualified as TS import Logic import Numeric (showFFloat) import Numeric.Decimal (MonadThrow (..)) import Numeric.Decimal qualified as Decimal import Numeric.Probability import Phylomemy.Indexation import Text.Show (Show (..)) import Prelude (Double, Num (..), mod, fromIntegral, fromRational, pi, tan, toRational, undefined, (/)) import Data.Int (Int) -- import Data.DisjointMap qualified as DisMap type MST range = Probability :-> (range, Cluster) :-> Seq.Seq (range, Cluster) -- | Kruskal algorithm to find the maximal spanning trees. -- -- We know in advance the edges which will be deleted, -- they go from the lower weights to the higher weights -- (tie breaking with an order on the vertices) -- hence we can compute a maximal spanning tree (MST) wrt. weights -- such that the next weight causing a split -- is the minimal weight inside this MST. branchesMSF :: forall range. Ord range => Phylomemy range -> -- range :-> Cluster :-> Probability :-> Seq.Seq (range, Cluster) -> -- Phy branches, each as a maximal spanning tree ((range, Cluster) :-> MST range) branchesMSF phy = ST.runST do -- Create a Point for all given nodes rangeToClusterToPoint :: (range :-> Cluster :-> UF.Point s ((range, Cluster), MST range)) <- (`Map.traverseWithKey` phy) \range -> Map.traverseWithKey \cluster _weightToEdges -> UF.fresh ((range, cluster), Map.empty) -- Iterate from the heavier to the lighter edge forM_ (weightToEdges & Map.toDescList) \(weight, links) -> do -- Iterate through all the links of that weight forM_ (links & toList) \(src@(srcR, srcC), dst@(dstR, dstC)) -> do let srcP = rangeToClusterToPoint Map.! srcR Map.! srcC let dstP = rangeToClusterToPoint Map.! dstR Map.! dstC alreadyInSameMST <- UF.equivalent srcP dstP unless alreadyInSameMST do -- This edge (src -> dst) belongs to an MST UF.union' srcP dstP \(srcD, srcB) (dstD, dstB) -> do let root = min srcD dstD return $ (root,) $ Map.insertWith (Map.unionWith (<>)) weight (Map.singleton src (Seq.singleton dst)) $ Map.unionWith (Map.unionWith (<>)) srcB dstB let points :: [UF.Point s ((range, Cluster), MST range)] = rangeToClusterToPoint & Map.elems & List.concatMap Map.elems rootTrees :: [((range, Cluster), MST range)] <- foldM ( \acc p -> UF.redundant p >>= \case True -> return acc False -> UF.descriptor p <&> (: acc) ) [] points return $ Map.fromList rootTrees where weightToEdges :: Probability :-> Seq.Seq ((range, Cluster), (range, Cluster)) weightToEdges = Map.unionsWith (<>) [ Map.map (\dst -> (\dstC -> (src, (dstR, dstC))) <$> dst) dstWC | (srcR, srcCRWC) <- phy & Map.toList , (srcC, dstRWC) <- srcCRWC & Map.toList , let src = (srcR, srcC) , (dstR, dstWC) <- dstRWC & Map.toList ] nodeBranchIndex :: Eq.Eq range => ((range, Cluster) :-> MST range) -> (range, Cluster) -> Maybe Int nodeBranchIndex rootToMST n = List.findIndex (\ (root, mst) -> root Eq.== n || any ( \links -> any ( \(src, dsts) -> src Eq.== n || any (Eq.== n) dsts) (links & Map.toList) ) mst ) (rootToMST & Map.toList) -- TODO: branches -- TODO: order clusters of a range by their similarity -- An On-Line Edge-Deletion Problem -- https://dl.acm.org/doi/pdf/10.1145/322234.322235 -- type Branches range = DisSet.DisjointSet (range, Cluster) type Phylomemy range = range :-> Cluster :-> range :-> Probability :-> Seq.Seq Cluster -- TODO: remove Transaction wrapping in favor of type class giving `itemSet` -- TODO: threshold phylomemy :: Ord range => {-similarityMeasure :::-} (Cluster -> Cluster -> Probability) -> range :-> Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos)) -> -- The second range is greater than the first. Phylomemy range phylomemy similarity rangeToClusterToDocs = (`Map.mapWithKey` rangeToClusterToDocs) \range clusterToDocs -> (`Map.mapWithKey` clusterToDocs) \cluster _txs -> let (_, rangeToClusterToDocsGT) = Map.split range rangeToClusterToDocs in (`Map.mapMaybeWithKey` rangeToClusterToDocsGT) \_rangeGT clusterToDocsGT -> let childWC = Map.fromListWith (<>) [ (weight, Seq.singleton clusterGT) | clusterGT <- clusterToDocsGT & Map.keys , let weight = similarity cluster clusterGT , proba0 < weight ] in if Map.null childWC then Nothing else Just childWC type PhylomemyDownstream range = -- everything range :-> Cluster :-> -- children Probability :-> Seq.Seq (range, Cluster) phylomemyDownstream :: Ord range => (Cluster -> Cluster -> Probability) -> range :-> Cluster :-> Seq.Seq (Clustering.Transaction Root (Document pos)) -> PhylomemyDownstream range phylomemyDownstream similarity rangeToClusterToDocs = (`Map.mapWithKey` rangeToClusterToDocs) \range clusterToDocs -> (`Map.mapWithKey` clusterToDocs) \cluster _txs -> -- TODO: avoid split let (_, rangeToClusterToDocsGT) = Map.split range rangeToClusterToDocs in Map.fromListWith (<>) [ (similarity cluster clusterGT, Seq.singleton (rangeGT, clusterGT)) | (rangeGT, clusterToDocsGT) <- rangeToClusterToDocsGT & Map.toList , clusterGT <- clusterToDocsGT & Map.keys ] -- | @ -- `phylomemyWeights` phy -- @ -- returns the set of weights used by the given `Phylomemy`. phylomemyWeights :: Phylomemy range -> Set Probability phylomemyWeights = Map.foldMapWithKey \_ -> Map.foldMapWithKey \_ -> Map.foldMapWithKey \_ -> Set.fromList . Map.keys -- | @ -- `phylomemyRaise` minWeight phy -- @ -- TODO: remove clusters not linked to phylomemyRaise :: Probability -> Phylomemy range -> Phylomemy range phylomemyRaise minWeight = Map.map $ Map.map $ Map.mapMaybe $ (deleteEmptyMap .) $ \childWC -> let (_lt, eqM, gt) = Map.splitLookup minWeight childWC in maybe gt (\eq -> Map.insert minWeight eq gt) eqM where deleteEmptyMap m | Map.null m = Nothing | otherwise = Just m -- | @ -- `phylomemyDOT` phy -- @ -- returns a graph of the given `Phylomemy` in [DOT](https://graphviz.org/doc/info/lang.html) format. phylomemyDOT :: Ord range => Show range => Phylomemy range -> BS.Builder phylomemyDOT parentRCchildRWC = run do let rootToMST = branchesMSF parentRCchildRWC let branchesNum = Map.size rootToMST comments ["Num of branches: " <> fromString (show branchesNum)] line "digraph g" block do line "splines=\"ortho\"" -- line "splines=false" indexFrom1M_ (parentRCchildRWC & Map.toList) \(parentR, parentCchildRWC) parentRI -> do let parentRB = "r" <> BS.intDec parentRI line $ "subgraph cluster_" <> parentRB block do comments ["Create a node for the range " <> parentRB] node parentRB [ ("shape", "box") , ("label", builderQuotedString (show parentR)) , ("color", "gray") , ("style", "filled") , ("fillcolor", "gray") ] line "color=gray" block do line "rank=same" comments ["Create cluster nodes within the range " <> parentRB] indexFrom1M_ (parentCchildRWC & Map.toList) \(parentC, _childRWC) parentCI -> do let parentCL = parentC & Set.toList <&> unNgram . rootLabel let parentBI = fromJust $ nodeBranchIndex rootToMST (parentR, parentC) node (parentRB <> "c" <> BS.intDec parentCI) [ ("label", builderQuotedString (mconcat (List.intersperse ", " (parentCL <&> TS.unpack)))) , -- , ("pos", BS.intDec parentCI <> "," <> BS.intDec parentRI <> "!" ("style", "filled") , ("fillcolor", builderQuotedString (show ((parentBI + 1) `mod` 12))) , ("colorscheme", "paired12") ] comments ["Horizontally align nodes within the same range"] when (1 < Map.size parentCchildRWC) do edges [parentRB, parentRB <> "c" <> BS.intDec 1] [("style", "invis")] forM_ (List.zip [1 .. Map.size parentCchildRWC - 1] [2 .. Map.size parentCchildRWC]) \(i, j) -> edges [ parentRB <> "c" <> BS.intDec i , parentRB <> "c" <> BS.intDec j ] [ ("weight", "10") , ("style", "invis") ] -- let parentRCAlignment = -- List.intersperse " -> " $ -- "r" <> BS.intDec parentRI : -- [ "r" <> BS.intDec parentRI <> "c" <> BS.intDec parentCI <> ":e -> " <> "" -- | (_parentC, _childRWC) <- parentCchildRWC & Map.toList -- TODO: use Map.keys -- | parentCI <- [1 ..] -- ] -- when (1 < List.length parentRCAlignment) do -- line $ mconcat parentRCAlignment <> "[weight=10,style=dotted]" comments [ "Create edges from clusters of the range " <> parentRB , "to clusters within subsequent ranges" ] indexFrom1M_ (parentCchildRWC & Map.toList) -- TODO: use Map.keys \(_parentC, childRWC) parentCI -> do -- TODO: se what to do here, using lookupMin, or not case Map.lookupMin childRWC of Nothing -> return () Just (childR, childWC) -> do let childRI = 1 + Map.findIndex childR parentRCchildRWC forM_ (childWC & Map.toList) \(weight, childCs) -> do forM_ childCs \childC -> do let childCI = 1 + Map.findIndex childC (parentRCchildRWC Map.! childR) let parentRCB = mconcat [parentRB, "c", BS.intDec parentCI] let childRCB = mconcat ["r", BS.intDec childRI, "c", BS.intDec childCI] let labelB = builderQuotedString (showFFloat @Double (Just 2) (fromRational (runProbability weight)) "") let weightB = BS.doubleDec (fromRational (runProbability weight)) edges [parentRCB, childRCB] [ ("weight", weightB) , ("label", labelB) , ("fontcolor", "gray60") , ("constraint", "false") ] comments ["Vertically align range nodes"] let rangeLinks = [ "r" <> BS.intDec parentRI | parentRI <- [1 .. Map.size parentRCchildRWC] ] when (1 < List.length rangeLinks) do edges rangeLinks [("weight", "10"), ("style", "invis")] where run :: MT.ReaderT BSh.ShortByteString (MT.Writer BS.Builder) () -> BS.Builder run = MT.execWriter . (`MT.runReaderT` {-indent-} "") block s = do line "{" () <- MT.withReaderT (" " <>) s line "}" line s = do indent <- MT.ask MT.lift $ MT.tell (BS.shortByteString indent <> s <> "\n") indexFrom1M_ xs f = zipWithM_ f xs [1 ..] comments = mapM_ \c -> line $ "// " <> c edges :: [BS.Builder] -> [(BS.Builder, BS.Builder)] -> MT.ReaderT BSh.ShortByteString (MT.Writer BS.Builder) () edges names as = line $ mconcat (List.intersperse " -> " names) <> attrs as node name as = line $ name <> attrs as attrs as | List.null as = "" | otherwise = "[" <> mconcat (List.intersperse "," [k <> "=" <> v | (k, v) <- as]) <> "]" builderQuotedString :: String -> BS.Builder builderQuotedString cs = BS.charUtf8 '"' <> foldMap escape cs <> BS.charUtf8 '"' where escape '\\' = BS.charUtf8 '\\' <> BS.charUtf8 '\\' escape '\"' = BS.charUtf8 '\\' <> BS.charUtf8 '\"' escape c = BS.charUtf8 c -- newtype Transaction pos = Transaction (Document pos) -- instance Eq (Transaction pos) where -- Transaction x == Transaction y = -- documentRoots x == documentRoots y -- instance Ord (Transaction pos) where -- Transaction x `compare` Transaction y = -- documentRoots x `compare` documentRoots y data Foliation = Foliation { } data Group date = Group { groupRoots :: Roots , groupPeriod :: (date, date) } type Branch date = [Group date] jaccardSimilarity :: Ord.Ord a => Set a -> Set a -> Probability jaccardSimilarity x y = fromJust $ probability $ fromIntegral (Set.size (Set.intersection x y)) % fromIntegral (Set.size (Set.union x y)) -- data FScore = FScoreAxiom -- | https://en.wikipedia.org/wiki/F-score -- L'idée de la F-mesure est de s'assurer qu'un classificateur fait de bonnes -- prédictions de la classe pertinente (bonne précision) en suffisamment grand -- nombre (bon rappel) sur un jeu de données cible. Tout comme la précision et -- le rappel, la F-mesure varie de 0 (plus mauvaise valeur) à 1 (meilleure valeur possible). -- -- @(`fScore` lambda)@ -- determines whether a given branch should continue to be divided or not at -- the current value of @(delta)@. To that end, the quality score is set up by a -- parameter @(lambda)@. fScore :: Eq.Eq date => MonadThrow m => -- | Trade-off between `precision` and `recall`. -- See https://en.wikipedia.org/wiki/Precision_and_recall -- -- > [It] predetermine[s] the desired shape of the phylomemy: a continent -- > (i.e., one large branch) or an archipelago of elements of knowledge -- > (i.e., many small branches). The estimation of `lambda` is left to the researcher’s -- > discretion in light of her own expertise and research questions, which makes -- > any phylomemy an artifact of the researcher’s perception -- -- For @(lambda == 0)@, only the `precision` counts, whereas for @(lambda == 1)@, only the `recall` counts. lambda ::: Probability -> x ::: Root -> -- | periods [(date, date)] -> -- | the set of all the fields of the branch bk bk ::: [Group date] -> [[Group date]] -> m Probability fScore (Named lambda) (Named x) periods (Named bk) bx | lambda Eq.== proba0 = precision x periods bk | lambda Eq.== proba1 = recall x bk bx | otherwise = do reca <- runProbability <$> recall x bk bx prec <- runProbability <$> precision x periods bk let denom = reca + fl * fl * prec l = either undefined id $ toBoundedRealFloat @Double $ Decimal.toScientificDecimal lambda fl = toRational (tan (l * pi Prelude./ 2)) if denom Eq.== 0 then return proba0 else probability $ ((1 + fl * fl) * prec * reca) Prelude./ denom -- | @(`precision` x ps bk)@ returns the probability to observe @(x)@ -- by choosing at random a cluster in @(bk)@ within the periods @(ps)@. -- precision = TP / (TP+FP) -- Precision is the fraction of true positive examples among the examples that -- the model classified as positive. -- In other words, the number of true positives divided by -- the number of false positives plus true positives. precision :: Eq.Eq date => MonadThrow m => Root -> [(date, date)] -> Branch date -> m Probability precision x periods bk = probability $ fromIntegral (List.length (List.filter (\g -> List.elem x (groupRoots g)) bk')) % fromIntegral (List.length bk') where bk' = List.filter (\g -> List.elem (groupPeriod g) periods) bk -- | sensitivity. -- The recall 𝜌𝑘𝑥 = | | 𝑘 of 𝐵𝑘 against 𝑥. It is related to the probability to be in 𝐵𝑘 when choosing a cluster 𝑥 about 𝑥 at random in 𝜙. -- recall = TP / (TP+FN) -- Recall, also known as sensitivity, is the fraction of examples classified as positive, -- among the total number of positive examples. -- In other words, the number of true positives divided by -- the number of true positives plus false negatives. recall :: MonadThrow m => Root -> Branch date -> [Branch date] -> m Probability recall x bk bx = probability $ fromIntegral (List.length (List.filter (\g -> Set.member x (groupRoots g)) bk)) % fromIntegral (List.length (List.filter (\g -> Set.member x (groupRoots g)) (List.concat bx)))