1 {-# LANGUAGE RankNTypes #-}
2 {-# OPTIONS_GHC -Wno-deprecations #-}
4 -- | Brute-force algorithms related to mining frequent item sets.
6 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
8 -- > Given a set of transactions D, the problem of mining
9 -- > association rules is to generate all association rules
10 -- > that have support and confidence greater than the
11 -- > user-specified minimum support (called minsup) and
12 -- > minimum confidence (called minconf) respectively.
13 module Clustering.FrequentItemSet.BruteForce (
17 type FrequentItemSet (),
21 type AssociationRule (),
22 type AssociationConfidence (),
23 type Association (..),
25 type ClosedFrequentItemSet (),
26 closedFrequentItemSets,
27 allClosedFrequentItemSets,
31 import Data.Eq (Eq (..))
32 import Data.Foldable (fold)
33 import Data.Function ((&))
35 import Data.List qualified as List
36 import Data.Map.Strict qualified as Map
37 import Data.Ord qualified as Ord
38 import Data.Ratio ((%))
40 import Data.Set qualified as Set
42 import Logic.Theory.Arithmetic (Zero)
43 import Logic.Theory.Ord
44 import Numeric.Probability
45 import Text.Show (Show (..))
46 import Prelude (fromIntegral, (+))
48 -- import Debug.Trace (traceShow)
49 import Data.Semigroup (Semigroup (..))
50 import Data.Sequence qualified as Seq
51 import Data.Tuple (snd)
52 import GHC.IsList (toList)
55 type Transactions item = [ItemSet item]
57 data Support itemSet db = SupportAxiom
58 instance Axiom (Support itemSet db ::: Int >= Zero)
60 -- | Return the number of occurrences of @(itemSet)@ in @(db)@.
63 itemSet ::: ItemSet item ->
64 db ::: Transactions item ->
65 Support itemSet db ::: Int
66 support (Named itemSet) (Named ts) =
67 SupportAxiom ... List.length (List.filter (itemSet `Set.isSubsetOf`) ts)
69 data FrequentItemSet items db minSupp = FrequentItemSetAxiom
71 -- | Determine the frequent itemsets.
74 items ::: ItemSet item ->
75 db ::: Transactions item ->
76 minSupp ::: Int / minSupp > Zero ->
77 [FrequentItemSet items db minSupp ::: ItemSet item]
78 frequentItemSets (Named items) (Named ts) (Named minSupp) =
79 [ FrequentItemSetAxiom ... sub
80 | (sub, occ) <- occBySubSet ts
84 -- TimeEfficiencyWarning: iterate only once through the given `(db)`
85 -- to count the occurence of each subset of the given `(items)`.
86 -- occBySubSet :: Transactions item -> [(ItemSet item, Int)]
89 (\t -> List.map (\(sub, occ) -> (sub, if sub `Set.isSubsetOf` t then occ + 1 else occ)))
90 [(sub, 0) | sub <- Set.powerSet items & Set.toList]
92 data AllItems db = AllItemsAxiom
95 -- `allFrequentItemSets` db minSupp = `frequentItemSets` is db minSupp
97 -- where `(is)` gathers all the items present in the given `(db)`.
98 allFrequentItemSets ::
100 db ::: Transactions item ->
101 minSupp ::: Int / minSupp > Zero ->
102 [ FrequentItemSet (AllItems db) db minSupp
105 allFrequentItemSets ts = frequentItemSets (AllItemsAxiom ... fold (unName ts)) ts
107 -- | An association rule.
109 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
111 -- > An association rule is an implication of the form
112 -- > X ⇒ Y, where X ⊂ I, Y ⊂ I, and X ∩ Y = ∅.
114 -- > The rule X ⇒ Y holds in the transaction set D with confidence c
115 -- > if c% of transactions in D that contain X also contain Y.
117 -- > The rule X ⇒ Y has support s in the transaction set D
118 -- > if s% of transactions in D contain X ∪ Y.
119 data AssociationRule items db minSupp = AssociationRuleAxiom
121 data Association items db minSupp minConf item = Association
122 { associationCause :: FrequentItemSet items db minSupp ::: ItemSet item
123 -- ^ CorrectnessProperty: `associationCause` is a `FrequentItemSet`.
124 -- Because `freqSet` is frequent, and `causeSet` ⊂ `freqSet`.
125 , associationConfidence :: AssociationConfidence items db minSupp minConf ::: Probability
126 , -- CorrectnessProperty: `associationConfidence` is a `Probability`.
127 -- Because P(consequenceSet | causeSet) = P(causeSet ∩ consequenceSet) / P(causeSet)
128 associationConsequence :: FrequentItemSet items db minSupp ::: ItemSet item
129 -- ^ CorrectnessProperty: `associationConsequence` is a `FrequentItemSet`.
130 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
134 data AssociationConfidence items db minSupp minConf = AssociationConfidenceAxiom
136 -- | By construction in `associationRules`.
137 instance Axiom (AssociationConfidence items db minSupp minConf >= minConf)
140 -- `associationRules` freqSet db minConf
142 -- generates association rules from the given `FrequentItemSet` `(freqSet)`
143 -- in the given `(db)`,
144 -- with a 'Confidence' greater or equal to the given `(minConf)`.
146 -- Algorithm: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`,
147 -- section 1.1 "Problem Decomposition and Paper Organization", point 2:
149 -- For a given `FrequentItemSet` @(freqSet)@,
150 -- For every @(causeSet)@ non-empty subset of @(freqSet)@.
151 -- output a rule of the form @(causeSet => (freqSet - causeSet))@
152 -- if in the given @(db)@,
153 -- the ratio @(`support` freqSet)@
154 -- over @(`support` causeSet)@
155 -- is greater or egal to the given @(minConf)@.
158 -- > The association rules we consider are probabilistic in nature.
159 -- > The presence of a rule X → A does not necessarily mean
160 -- > that X + Y → A also holds because the latter may
161 -- > not have minimum support.
162 -- > Similarly, the presence of rules X → Y and Y → Z
163 -- > does not necessarily mean > that X → Z holds
164 -- > because the latter may not have minimum confidence.
166 -- PerformanceTimeWarning: traverses the given `(db)`
167 -- once for each non-empty subset of the given `(freqSet)`.
170 FrequentItemSet items db minSupp ::: ItemSet item ->
171 db ::: Transactions item ->
172 minConf ::: Probability ->
173 [ AssociationRule items db minSupp
174 ::: Association items db minSupp minConf item
176 associationRules freqSet ts minConf =
177 [ AssociationRuleAxiom
179 { associationCause = FrequentItemSetAxiom ... causeSet
180 , associationConfidence = AssociationConfidenceAxiom ... confidence
181 , -- CorrectnessProperty: `consequenceSet` is frequent.
182 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
183 associationConsequence = FrequentItemSetAxiom ... consequenceSet
185 | causeSet <- Set.powerSet (unName freqSet) & Set.toList
186 , not (Set.null causeSet)
187 , let consequenceSet = unName freqSet `Set.difference` causeSet
188 , not (Set.null consequenceSet)
189 , let Named causeOcc = support (() ... causeSet) ts
190 , confidence <- probability (fromIntegral freqSetOcc % fromIntegral causeOcc)
191 , unName minConf Ord.<= confidence
194 Named freqSetOcc = support freqSet ts
196 data ClosedFrequentItemSet items db minSupp minSize = ClosedFrequentItemSetAxiom
198 closedFrequentItemSets ::
199 forall item db minSize minSupp items.
201 items ::: ItemSet item ->
202 db ::: Transactions item ->
203 minSupp ::: Int / minSupp > Zero ->
204 minSize ::: Int / minSize > Zero ->
205 [ ClosedFrequentItemSet items db minSupp minSize
206 ::: (Set item, Seq.Seq (ItemSet item))
208 closedFrequentItemSets (Named items) (Named db) (Named minSupp) (Named minSize) =
209 loop Set.empty items db
211 loop pathFIS pathGT pathDB
212 | Set.null pathGT = []
215 [ [ ClosedFrequentItemSetAxiom ... (cfis, txs)
216 | fromIntegral (Set.size cfis) Ord.>= minSize
218 <> loop cfis (snd (Set.split itm pathGT)) (toList txs)
220 , let cfis = Set.insert itm pathFIS
221 , let supp = Seq.length txs
222 , fromIntegral supp Ord.>= minSupp
225 -- Map each itm of the remaining pathGT
226 -- to the remaining transactions pathDB
231 [ (itm, Seq.singleton tx)
233 , itm <- Set.intersection pathGT tx & Set.toList
237 allClosedFrequentItemSets ::
239 db ::: Transactions item ->
240 minSupp ::: Int / minSupp > Zero ->
241 minSize ::: Int / minSize > Zero ->
242 [ ClosedFrequentItemSet (AllItems db) db minSupp minSize
243 ::: (Set item, Seq.Seq (ItemSet item))
245 allClosedFrequentItemSets ts = closedFrequentItemSets (AllItemsAxiom ... fold (unName ts)) ts