{-# LANGUAGE RankNTypes #-} {-# OPTIONS_GHC -Wno-deprecations #-} -- | Brute-force algorithms related to mining frequent item sets. -- -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`: -- -- > Given a set of transactions D, the problem of mining -- > association rules is to generate all association rules -- > that have support and confidence greater than the -- > user-specified minimum support (called minsup) and -- > minimum confidence (called minconf) respectively. module Clustering.FrequentItemSet.BruteForce ( type ItemSet, type Transactions, type Support (), type FrequentItemSet (), frequentItemSets, type AllItems (), allFrequentItemSets, type AssociationRule (), type AssociationConfidence (), type Association (..), associationRules, type ClosedFrequentItemSet (), closedFrequentItemSets, allClosedFrequentItemSets, ) where import Data.Bool import Data.Eq (Eq (..)) import Data.Foldable (fold) import Data.Function ((&)) import Data.Int (Int) import Data.List qualified as List import Data.Map.Strict qualified as Map import Data.Ord qualified as Ord import Data.Ratio ((%)) import Data.Semigroup (Semigroup (..)) import Data.Sequence qualified as Seq import Data.Set (Set) import Data.Set qualified as Set import Data.Tuple (snd) import GHC.IsList (toList) import Logic import Logic.Theory.Arithmetic (Zero) import Logic.Theory.Ord import Numeric.Probability import Text.Show (Show (..)) import Prelude (fromIntegral, (+)) type ItemSet = Set type Transactions item = [ItemSet item] data Support itemSet db = SupportAxiom instance Axiom (Support itemSet db ::: Int >= Zero) -- | Return the number of occurrences of @(itemSet)@ in @(db)@. support :: Ord.Ord item => itemSet ::: ItemSet item -> db ::: Transactions item -> Support itemSet db ::: Int support (Named itemSet) (Named db) = SupportAxiom ... List.length (List.filter (itemSet `Set.isSubsetOf`) db) data FrequentItemSet items db minSupp = FrequentItemSetAxiom -- | Determine the frequent itemsets. frequentItemSets :: Ord.Ord item => items ::: ItemSet item -> db ::: Transactions item -> minSupp ::: Int / minSupp > Zero -> [FrequentItemSet items db minSupp ::: ItemSet item] frequentItemSets (Named items) (Named db) (Named minSupp) = [ FrequentItemSetAxiom ... sub | (sub, occ) <- occBySubSet db , occ Ord.>= minSupp ] where -- TimeEfficiencyWarning: iterate only once through the given `(db)` -- to count the occurence of each subset of the given `(items)`. -- occBySubSet :: Transactions item -> [(ItemSet item, Int)] occBySubSet = List.foldr (\t -> List.map (\(sub, occ) -> (sub, if sub `Set.isSubsetOf` t then occ + 1 else occ))) [(sub, 0) | sub <- Set.powerSet items & Set.toList] data AllItems db = AllItemsAxiom -- | @ -- `allFrequentItemSets` db minSupp = `frequentItemSets` is db minSupp -- @ -- where `(is)` gathers all the items present in the given `(db)`. allFrequentItemSets :: Ord.Ord item => db ::: Transactions item -> minSupp ::: Int / minSupp > Zero -> [ FrequentItemSet (AllItems db) db minSupp ::: ItemSet item ] allFrequentItemSets db = frequentItemSets (AllItemsAxiom ... fold (unName db)) db -- | An association rule. -- -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`: -- -- > An association rule is an implication of the form -- > X ⇒ Y, where X ⊂ I, Y ⊂ I, and X ∩ Y = ∅. -- > -- > The rule X ⇒ Y holds in the transaction set D with confidence c -- > if c% of transactions in D that contain X also contain Y. -- > -- > The rule X ⇒ Y has support s in the transaction set D -- > if s% of transactions in D contain X ∪ Y. data AssociationRule items db minSupp = AssociationRuleAxiom data Association items db minSupp minConf item = Association { associationCause :: FrequentItemSet items db minSupp ::: ItemSet item -- ^ CorrectnessProperty: `associationCause` is a `FrequentItemSet`. -- Because `freqSet` is frequent, and `causeSet` ⊂ `freqSet`. , associationConfidence :: AssociationConfidence items db minSupp minConf ::: Probability , -- CorrectnessProperty: `associationConfidence` is a `Probability`. -- Because P(consequenceSet | causeSet) = P(causeSet ∩ consequenceSet) / P(causeSet) associationConsequence :: FrequentItemSet items db minSupp ::: ItemSet item -- ^ CorrectnessProperty: `associationConsequence` is a `FrequentItemSet`. -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`. } deriving (Show, Eq) data AssociationConfidence items db minSupp minConf = AssociationConfidenceAxiom -- | By construction in `associationRules`. instance Axiom (AssociationConfidence items db minSupp minConf >= minConf) -- | @ -- `associationRules` freqSet db minConf -- @ -- generates association rules from the given `FrequentItemSet` `(freqSet)` -- in the given `(db)`, -- with a 'Confidence' greater or equal to the given `(minConf)`. -- -- Algorithm: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`, -- section 1.1 "Problem Decomposition and Paper Organization", point 2: -- -- For a given `FrequentItemSet` @(freqSet)@, -- For every @(causeSet)@ non-empty subset of @(freqSet)@. -- output a rule of the form @(causeSet => (freqSet - causeSet))@ -- if in the given @(db)@, -- the ratio @(`support` freqSet)@ -- over @(`support` causeSet)@ -- is greater or egal to the given @(minConf)@. -- -- CorrectnessNote: -- > The association rules we consider are probabilistic in nature. -- > The presence of a rule X → A does not necessarily mean -- > that X + Y → A also holds because the latter may -- > not have minimum support. -- > Similarly, the presence of rules X → Y and Y → Z -- > does not necessarily mean > that X → Z holds -- > because the latter may not have minimum confidence. -- -- PerformanceTimeWarning: traverses the given `(db)` -- once for each non-empty subset of the given `(freqSet)`. associationRules :: Ord.Ord item => FrequentItemSet items db minSupp ::: ItemSet item -> db ::: Transactions item -> minConf ::: Probability -> [ AssociationRule items db minSupp ::: Association items db minSupp minConf item ] associationRules freqSet db minConf = [ AssociationRuleAxiom ... Association { associationCause = FrequentItemSetAxiom ... causeSet , associationConfidence = AssociationConfidenceAxiom ... confidence , -- CorrectnessProperty: `consequenceSet` is frequent. -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`. associationConsequence = FrequentItemSetAxiom ... consequenceSet } | causeSet <- Set.powerSet (unName freqSet) & Set.toList , not (Set.null causeSet) , let consequenceSet = unName freqSet `Set.difference` causeSet , not (Set.null consequenceSet) , let Named causeOcc = support (() ... causeSet) db , confidence <- probability (fromIntegral freqSetOcc % fromIntegral causeOcc) , unName minConf Ord.<= confidence ] where Named freqSetOcc = support freqSet db data ClosedFrequentItemSet items db minSupp minSize = ClosedFrequentItemSetAxiom -- | -- @ -- `closedFrequentItemSets` minSupp minSize items db -- @ -- returns a list of the closed frequent itemsets of -- the transactions @(db)@ restricted to the specified @(items)@, -- and such that the number of transactions containing them is greater or equal to @(minSupp)@, -- and such that the size of those transactions is greater or equal to @(minSize)@. -- Each closed frequent itemset is coupled with a sequence of the transactions containing them. closedFrequentItemSets :: forall item db minSize minSupp items. Ord.Ord item => minSupp ::: Int / minSupp > Zero -> minSize ::: Int / minSize > Zero -> items ::: ItemSet item -> db ::: Transactions item -> [ ClosedFrequentItemSet items db minSupp minSize ::: (ItemSet item, Seq.Seq (ItemSet item)) ] closedFrequentItemSets (Named minSupp) (Named minSize) (Named items) (Named db) = loop Set.empty items db where loop previousCFIS nextItems remainingDB | Set.null nextItems = [] | otherwise = List.concat [ -- Keep only the cfis which are big enough -- and recurse with the items of nextItems greater than itm, -- and with the transactions which include itm. [ ClosedFrequentItemSetAxiom ... (nextCFIS, nextDB) | fromIntegral (Set.size nextCFIS) Ord.>= minSize ] <> loop nextCFIS (snd (Set.split itm nextItems)) (toList nextDB) | (itm, nextDB) <- nextItemToDB , let nextCFIS = Set.insert itm previousCFIS , let supp = Seq.length nextDB , fromIntegral supp Ord.>= minSupp ] where -- Map each itm of the remaining nextItems -- to the transactions of remainingDB containing it. nextItemToDB = Map.fromListWith (<>) [ (itm, Seq.singleton tx) | tx <- remainingDB , itm <- Set.intersection nextItems tx & Set.toList ] & Map.toList -- | `closedFrequentItemSets` applied to all the items of the given transactions. allClosedFrequentItemSets :: Ord.Ord item => minSupp ::: Int / minSupp > Zero -> minSize ::: Int / minSize > Zero -> db ::: Transactions item -> [ ClosedFrequentItemSet (AllItems db) db minSupp minSize ::: (ItemSet item, Seq.Seq (ItemSet item)) ] allClosedFrequentItemSets minSupp minSize db = closedFrequentItemSets minSupp minSize (AllItemsAxiom ... fold (unName db)) db