1 {-# LANGUAGE RankNTypes #-}
2 {-# OPTIONS_GHC -Wno-deprecations #-}
4 -- | Brute-force algorithms related to mining frequent item sets.
6 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
8 -- > Given a set of transactions D, the problem of mining
9 -- > association rules is to generate all association rules
10 -- > that have support and confidence greater than the
11 -- > user-specified minimum support (called minsup) and
12 -- > minimum confidence (called minconf) respectively.
13 module Clustering.FrequentItemSet.BruteForce (
17 type FrequentItemSet (),
21 type AssociationRule (),
22 type AssociationConfidence (),
23 type Association (..),
25 type ClosedFrequentItemSet (),
26 closedFrequentItemSets,
27 allClosedFrequentItemSets,
31 import Data.Eq (Eq (..))
32 import Data.Foldable (fold)
33 import Data.Function ((&))
35 import Data.List qualified as List
36 import Data.Map.Strict qualified as Map
37 import Data.Ord qualified as Ord
38 import Data.Ratio ((%))
39 import Data.Semigroup (Semigroup (..))
40 import Data.Sequence qualified as Seq
42 import Data.Set qualified as Set
43 import Data.Tuple (snd)
44 import GHC.IsList (toList)
46 import Logic.Theory.Arithmetic (Zero)
47 import Logic.Theory.Ord
48 import Numeric.Probability
49 import Text.Show (Show (..))
50 import Prelude (fromIntegral, (+))
53 type Transactions item = [ItemSet item]
55 data Support itemSet db = SupportAxiom
56 instance Axiom (Support itemSet db ::: Int >= Zero)
58 -- | Return the number of occurrences of @(itemSet)@ in @(db)@.
61 itemSet ::: ItemSet item ->
62 db ::: Transactions item ->
63 Support itemSet db ::: Int
64 support (Named itemSet) (Named db) =
65 SupportAxiom ... List.length (List.filter (itemSet `Set.isSubsetOf`) db)
67 data FrequentItemSet items db minSupp = FrequentItemSetAxiom
69 -- | Determine the frequent itemsets.
72 items ::: ItemSet item ->
73 db ::: Transactions item ->
74 minSupp ::: Int / minSupp > Zero ->
75 [FrequentItemSet items db minSupp ::: ItemSet item]
76 frequentItemSets (Named items) (Named db) (Named minSupp) =
77 [ FrequentItemSetAxiom ... sub
78 | (sub, occ) <- occBySubSet db
82 -- TimeEfficiencyWarning: iterate only once through the given `(db)`
83 -- to count the occurence of each subset of the given `(items)`.
84 -- occBySubSet :: Transactions item -> [(ItemSet item, Int)]
87 (\t -> List.map (\(sub, occ) -> (sub, if sub `Set.isSubsetOf` t then occ + 1 else occ)))
88 [(sub, 0) | sub <- Set.powerSet items & Set.toList]
90 data AllItems db = AllItemsAxiom
93 -- `allFrequentItemSets` db minSupp = `frequentItemSets` is db minSupp
95 -- where `(is)` gathers all the items present in the given `(db)`.
96 allFrequentItemSets ::
98 db ::: Transactions item ->
99 minSupp ::: Int / minSupp > Zero ->
100 [ FrequentItemSet (AllItems db) db minSupp
103 allFrequentItemSets db = frequentItemSets (AllItemsAxiom ... fold (unName db)) db
105 -- | An association rule.
107 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
109 -- > An association rule is an implication of the form
110 -- > X ⇒ Y, where X ⊂ I, Y ⊂ I, and X ∩ Y = ∅.
112 -- > The rule X ⇒ Y holds in the transaction set D with confidence c
113 -- > if c% of transactions in D that contain X also contain Y.
115 -- > The rule X ⇒ Y has support s in the transaction set D
116 -- > if s% of transactions in D contain X ∪ Y.
117 data AssociationRule items db minSupp = AssociationRuleAxiom
119 data Association items db minSupp minConf item = Association
120 { associationCause :: FrequentItemSet items db minSupp ::: ItemSet item
121 -- ^ CorrectnessProperty: `associationCause` is a `FrequentItemSet`.
122 -- Because `freqSet` is frequent, and `causeSet` ⊂ `freqSet`.
123 , associationConfidence :: AssociationConfidence items db minSupp minConf ::: Probability
124 , -- CorrectnessProperty: `associationConfidence` is a `Probability`.
125 -- Because P(consequenceSet | causeSet) = P(causeSet ∩ consequenceSet) / P(causeSet)
126 associationConsequence :: FrequentItemSet items db minSupp ::: ItemSet item
127 -- ^ CorrectnessProperty: `associationConsequence` is a `FrequentItemSet`.
128 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
132 data AssociationConfidence items db minSupp minConf = AssociationConfidenceAxiom
134 -- | By construction in `associationRules`.
135 instance Axiom (AssociationConfidence items db minSupp minConf >= minConf)
138 -- `associationRules` freqSet db minConf
140 -- generates association rules from the given `FrequentItemSet` `(freqSet)`
141 -- in the given `(db)`,
142 -- with a 'Confidence' greater or equal to the given `(minConf)`.
144 -- Algorithm: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`,
145 -- section 1.1 "Problem Decomposition and Paper Organization", point 2:
147 -- For a given `FrequentItemSet` @(freqSet)@,
148 -- For every @(causeSet)@ non-empty subset of @(freqSet)@.
149 -- output a rule of the form @(causeSet => (freqSet - causeSet))@
150 -- if in the given @(db)@,
151 -- the ratio @(`support` freqSet)@
152 -- over @(`support` causeSet)@
153 -- is greater or egal to the given @(minConf)@.
156 -- > The association rules we consider are probabilistic in nature.
157 -- > The presence of a rule X → A does not necessarily mean
158 -- > that X + Y → A also holds because the latter may
159 -- > not have minimum support.
160 -- > Similarly, the presence of rules X → Y and Y → Z
161 -- > does not necessarily mean > that X → Z holds
162 -- > because the latter may not have minimum confidence.
164 -- PerformanceTimeWarning: traverses the given `(db)`
165 -- once for each non-empty subset of the given `(freqSet)`.
168 FrequentItemSet items db minSupp ::: ItemSet item ->
169 db ::: Transactions item ->
170 minConf ::: Probability ->
171 [ AssociationRule items db minSupp
172 ::: Association items db minSupp minConf item
174 associationRules freqSet db minConf =
175 [ AssociationRuleAxiom
177 { associationCause = FrequentItemSetAxiom ... causeSet
178 , associationConfidence = AssociationConfidenceAxiom ... confidence
179 , -- CorrectnessProperty: `consequenceSet` is frequent.
180 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
181 associationConsequence = FrequentItemSetAxiom ... consequenceSet
183 | causeSet <- Set.powerSet (unName freqSet) & Set.toList
184 , not (Set.null causeSet)
185 , let consequenceSet = unName freqSet `Set.difference` causeSet
186 , not (Set.null consequenceSet)
187 , let Named causeOcc = support (() ... causeSet) db
188 , confidence <- probability (fromIntegral freqSetOcc % fromIntegral causeOcc)
189 , unName minConf Ord.<= confidence
192 Named freqSetOcc = support freqSet db
194 data ClosedFrequentItemSet items db minSupp minSize = ClosedFrequentItemSetAxiom
198 -- `closedFrequentItemSets` minSupp minSize items db
200 -- returns a list of the closed frequent itemsets of
201 -- the transactions @(db)@ restricted to the specified @(items)@,
202 -- and such that the number of transactions containing them is greater or equal to @(minSupp)@,
203 -- and such that the size of those transactions is greater or equal to @(minSize)@.
204 -- Each closed frequent itemset is coupled with a sequence of the transactions containing them.
205 closedFrequentItemSets ::
206 forall item db minSize minSupp items.
208 minSupp ::: Int / minSupp > Zero ->
209 minSize ::: Int / minSize > Zero ->
210 items ::: ItemSet item ->
211 db ::: Transactions item ->
212 [ ClosedFrequentItemSet items db minSupp minSize
213 ::: (ItemSet item, Seq.Seq (ItemSet item))
215 closedFrequentItemSets (Named minSupp) (Named minSize) (Named items) (Named db) =
216 loop Set.empty items db
218 loop previousCFIS nextItems remainingDB
219 | Set.null nextItems = []
222 [ -- Keep only the cfis which are big enough
223 -- and recurse with the items of nextItems greater than itm,
224 -- and with the transactions which include itm.
225 [ ClosedFrequentItemSetAxiom ... (nextCFIS, nextDB)
226 | fromIntegral (Set.size nextCFIS) Ord.>= minSize
228 <> loop nextCFIS (snd (Set.split itm nextItems)) (toList nextDB)
229 | (itm, nextDB) <- nextItemToDB
230 , let nextCFIS = Set.insert itm previousCFIS
231 , let supp = Seq.length nextDB
232 , fromIntegral supp Ord.>= minSupp
235 -- Map each itm of the remaining nextItems
236 -- to the transactions of remainingDB containing it.
240 [ (itm, Seq.singleton tx)
242 , itm <- Set.intersection nextItems tx & Set.toList
246 -- | `closedFrequentItemSets` applied to all the items of the given transactions.
247 allClosedFrequentItemSets ::
249 minSupp ::: Int / minSupp > Zero ->
250 minSize ::: Int / minSize > Zero ->
251 db ::: Transactions item ->
252 [ ClosedFrequentItemSet (AllItems db) db minSupp minSize
253 ::: (ItemSet item, Seq.Seq (ItemSet item))
255 allClosedFrequentItemSets minSupp minSize db = closedFrequentItemSets minSupp minSize (AllItemsAxiom ... fold (unName db)) db