{-# LANGUAGE RankNTypes #-} {-# OPTIONS_GHC -Wno-deprecations #-} -- | Brute-force algorithms related to mining frequent item sets. -- -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`: -- -- > Given a set of transactions D, the problem of mining -- > association rules is to generate all association rules -- > that have support and confidence greater than the -- > user-specified minimum support (called minsup) and -- > minimum confidence (called minconf) respectively. module Clustering.FrequentItemSet.BruteForce ( type ItemSet, type Transactions, type Support (), type FrequentItemSet (), frequentItemSets, type AllItems (), allFrequentItemSets, type AssociationRule (), type AssociationConfidence (), type Association (..), associationRules, type ClosedFrequentItemSet (), closedFrequentItemSets, allClosedFrequentItemSets, ) where import Data.Bool import Data.Eq (Eq (..)) import Data.Foldable (fold) import Data.Function ((&)) import Data.Int (Int) import Data.List qualified as List import Data.Map.Strict qualified as Map import Data.Ord qualified as Ord import Data.Ratio ((%)) import Data.Set (Set) import Data.Set qualified as Set import Logic import Logic.Theory.Arithmetic (Zero) import Logic.Theory.Ord import Numeric.Probability import Text.Show (Show (..)) import Prelude (fromIntegral, (+)) -- import Debug.Trace (traceShow) import Data.Semigroup (Semigroup (..)) import Data.Sequence qualified as Seq import Data.Tuple (snd) import GHC.IsList (toList) type ItemSet = Set type Transactions item = [ItemSet item] data Support itemSet db = SupportAxiom instance Axiom (Support itemSet db ::: Int >= Zero) -- | Return the number of occurrences of @(itemSet)@ in @(db)@. support :: Ord.Ord item => itemSet ::: ItemSet item -> db ::: Transactions item -> Support itemSet db ::: Int support (Named itemSet) (Named ts) = SupportAxiom ... List.length (List.filter (itemSet `Set.isSubsetOf`) ts) data FrequentItemSet items db minSupp = FrequentItemSetAxiom -- | Determine the frequent itemsets. frequentItemSets :: Ord.Ord item => items ::: ItemSet item -> db ::: Transactions item -> minSupp ::: Int / minSupp > Zero -> [FrequentItemSet items db minSupp ::: ItemSet item] frequentItemSets (Named items) (Named ts) (Named minSupp) = [ FrequentItemSetAxiom ... sub | (sub, occ) <- occBySubSet ts , occ Ord.>= minSupp ] where -- TimeEfficiencyWarning: iterate only once through the given `(db)` -- to count the occurence of each subset of the given `(items)`. -- occBySubSet :: Transactions item -> [(ItemSet item, Int)] occBySubSet = List.foldr (\t -> List.map (\(sub, occ) -> (sub, if sub `Set.isSubsetOf` t then occ + 1 else occ))) [(sub, 0) | sub <- Set.powerSet items & Set.toList] data AllItems db = AllItemsAxiom -- | @ -- `allFrequentItemSets` db minSupp = `frequentItemSets` is db minSupp -- @ -- where `(is)` gathers all the items present in the given `(db)`. allFrequentItemSets :: Ord.Ord item => db ::: Transactions item -> minSupp ::: Int / minSupp > Zero -> [ FrequentItemSet (AllItems db) db minSupp ::: ItemSet item ] allFrequentItemSets ts = frequentItemSets (AllItemsAxiom ... fold (unName ts)) ts -- | An association rule. -- -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`: -- -- > An association rule is an implication of the form -- > X ⇒ Y, where X ⊂ I, Y ⊂ I, and X ∩ Y = ∅. -- > -- > The rule X ⇒ Y holds in the transaction set D with confidence c -- > if c% of transactions in D that contain X also contain Y. -- > -- > The rule X ⇒ Y has support s in the transaction set D -- > if s% of transactions in D contain X ∪ Y. data AssociationRule items db minSupp = AssociationRuleAxiom data Association items db minSupp minConf item = Association { associationCause :: FrequentItemSet items db minSupp ::: ItemSet item -- ^ CorrectnessProperty: `associationCause` is a `FrequentItemSet`. -- Because `freqSet` is frequent, and `causeSet` ⊂ `freqSet`. , associationConfidence :: AssociationConfidence items db minSupp minConf ::: Probability , -- CorrectnessProperty: `associationConfidence` is a `Probability`. -- Because P(consequenceSet | causeSet) = P(causeSet ∩ consequenceSet) / P(causeSet) associationConsequence :: FrequentItemSet items db minSupp ::: ItemSet item -- ^ CorrectnessProperty: `associationConsequence` is a `FrequentItemSet`. -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`. } deriving (Show, Eq) data AssociationConfidence items db minSupp minConf = AssociationConfidenceAxiom -- | By construction in `associationRules`. instance Axiom (AssociationConfidence items db minSupp minConf >= minConf) -- | @ -- `associationRules` freqSet db minConf -- @ -- generates association rules from the given `FrequentItemSet` `(freqSet)` -- in the given `(db)`, -- with a 'Confidence' greater or equal to the given `(minConf)`. -- -- Algorithm: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`, -- section 1.1 "Problem Decomposition and Paper Organization", point 2: -- -- For a given `FrequentItemSet` @(freqSet)@, -- For every @(causeSet)@ non-empty subset of @(freqSet)@. -- output a rule of the form @(causeSet => (freqSet - causeSet))@ -- if in the given @(db)@, -- the ratio @(`support` freqSet)@ -- over @(`support` causeSet)@ -- is greater or egal to the given @(minConf)@. -- -- CorrectnessNote: -- > The association rules we consider are probabilistic in nature. -- > The presence of a rule X → A does not necessarily mean -- > that X + Y → A also holds because the latter may -- > not have minimum support. -- > Similarly, the presence of rules X → Y and Y → Z -- > does not necessarily mean > that X → Z holds -- > because the latter may not have minimum confidence. -- -- PerformanceTimeWarning: traverses the given `(db)` -- once for each non-empty subset of the given `(freqSet)`. associationRules :: Ord.Ord item => FrequentItemSet items db minSupp ::: ItemSet item -> db ::: Transactions item -> minConf ::: Probability -> [ AssociationRule items db minSupp ::: Association items db minSupp minConf item ] associationRules freqSet ts minConf = [ AssociationRuleAxiom ... Association { associationCause = FrequentItemSetAxiom ... causeSet , associationConfidence = AssociationConfidenceAxiom ... confidence , -- CorrectnessProperty: `consequenceSet` is frequent. -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`. associationConsequence = FrequentItemSetAxiom ... consequenceSet } | causeSet <- Set.powerSet (unName freqSet) & Set.toList , not (Set.null causeSet) , let consequenceSet = unName freqSet `Set.difference` causeSet , not (Set.null consequenceSet) , let Named causeOcc = support (() ... causeSet) ts , confidence <- probability (fromIntegral freqSetOcc % fromIntegral causeOcc) , unName minConf Ord.<= confidence ] where Named freqSetOcc = support freqSet ts data ClosedFrequentItemSet items db minSupp minSize = ClosedFrequentItemSetAxiom closedFrequentItemSets :: forall item db minSize minSupp items. Ord.Ord item => items ::: ItemSet item -> db ::: Transactions item -> minSupp ::: Int / minSupp > Zero -> minSize ::: Int / minSize > Zero -> [ ClosedFrequentItemSet items db minSupp minSize ::: (Set item, Seq.Seq (ItemSet item)) ] closedFrequentItemSets (Named items) (Named db) (Named minSupp) (Named minSize) = loop Set.empty items db where loop pathFIS pathGT pathDB | Set.null pathGT = [] | otherwise = List.concat [ [ ClosedFrequentItemSetAxiom ... (cfis, txs) | fromIntegral (Set.size cfis) Ord.>= minSize ] <> loop cfis (snd (Set.split itm pathGT)) (toList txs) | (itm, txs) <- gtDB , let cfis = Set.insert itm pathFIS , let supp = Seq.length txs , fromIntegral supp Ord.>= minSupp ] where -- Map each itm of the remaining pathGT -- to the remaining transactions pathDB -- containing itm. gtDB = Map.fromListWith (<>) [ (itm, Seq.singleton tx) | tx <- pathDB , itm <- Set.intersection pathGT tx & Set.toList ] & Map.toList allClosedFrequentItemSets :: Ord.Ord item => db ::: Transactions item -> minSupp ::: Int / minSupp > Zero -> minSize ::: Int / minSize > Zero -> [ ClosedFrequentItemSet (AllItems db) db minSupp minSize ::: (Set item, Seq.Seq (ItemSet item)) ] allClosedFrequentItemSets ts = closedFrequentItemSets (AllItemsAxiom ... fold (unName ts)) ts