]> Git — Sourcephile - literate-phylomemy.git/blob - src/Clustering/FrequentItemSet/BruteForce.hs
init
[literate-phylomemy.git] / src / Clustering / FrequentItemSet / BruteForce.hs
1 {-# LANGUAGE RankNTypes #-}
2 {-# OPTIONS_GHC -Wno-deprecations #-}
3
4 -- | Brute-force algorithms related to mining frequent item sets.
5 --
6 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
7 --
8 -- > Given a set of transactions D, the problem of mining
9 -- > association rules is to generate all association rules
10 -- > that have support and confidence greater than the
11 -- > user-specified minimum support (called minsup) and
12 -- > minimum confidence (called minconf) respectively.
13 module Clustering.FrequentItemSet.BruteForce (
14 type ItemSet,
15 type Transactions,
16 type Support (),
17 type FrequentItemSet (),
18 frequentItemSets,
19 type AllItems (),
20 allFrequentItemSets,
21 type AssociationRule (),
22 type AssociationConfidence (),
23 type Association (..),
24 associationRules,
25 type ClosedFrequentItemSet (),
26 closedFrequentItemSets,
27 allClosedFrequentItemSets,
28 ) where
29
30 import Data.Bool
31 import Data.Eq (Eq (..))
32 import Data.Foldable (fold)
33 import Data.Function ((&))
34 import Data.Int (Int)
35 import Data.List qualified as List
36 import Data.Map.Strict qualified as Map
37 import Data.Ord qualified as Ord
38 import Data.Ratio ((%))
39 import Data.Semigroup (Semigroup (..))
40 import Data.Sequence qualified as Seq
41 import Data.Set (Set)
42 import Data.Set qualified as Set
43 import Data.Tuple (snd)
44 import GHC.IsList (toList)
45 import Logic
46 import Logic.Theory.Arithmetic (Zero)
47 import Logic.Theory.Ord
48 import Numeric.Probability
49 import Text.Show (Show (..))
50 import Prelude (fromIntegral, (+))
51
52 type ItemSet = Set
53 type Transactions item = [ItemSet item]
54
55 data Support itemSet db = SupportAxiom
56 instance Axiom (Support itemSet db ::: Int >= Zero)
57
58 -- | Return the number of occurrences of @(itemSet)@ in @(db)@.
59 support ::
60 Ord.Ord item =>
61 itemSet ::: ItemSet item ->
62 db ::: Transactions item ->
63 Support itemSet db ::: Int
64 support (Named itemSet) (Named db) =
65 SupportAxiom ... List.length (List.filter (itemSet `Set.isSubsetOf`) db)
66
67 data FrequentItemSet items db minSupp = FrequentItemSetAxiom
68
69 -- | Determine the frequent itemsets.
70 frequentItemSets ::
71 Ord.Ord item =>
72 items ::: ItemSet item ->
73 db ::: Transactions item ->
74 minSupp ::: Int / minSupp > Zero ->
75 [FrequentItemSet items db minSupp ::: ItemSet item]
76 frequentItemSets (Named items) (Named db) (Named minSupp) =
77 [ FrequentItemSetAxiom ... sub
78 | (sub, occ) <- occBySubSet db
79 , occ Ord.>= minSupp
80 ]
81 where
82 -- TimeEfficiencyWarning: iterate only once through the given `(db)`
83 -- to count the occurence of each subset of the given `(items)`.
84 -- occBySubSet :: Transactions item -> [(ItemSet item, Int)]
85 occBySubSet =
86 List.foldr
87 (\t -> List.map (\(sub, occ) -> (sub, if sub `Set.isSubsetOf` t then occ + 1 else occ)))
88 [(sub, 0) | sub <- Set.powerSet items & Set.toList]
89
90 data AllItems db = AllItemsAxiom
91
92 -- | @
93 -- `allFrequentItemSets` db minSupp = `frequentItemSets` is db minSupp
94 -- @
95 -- where `(is)` gathers all the items present in the given `(db)`.
96 allFrequentItemSets ::
97 Ord.Ord item =>
98 db ::: Transactions item ->
99 minSupp ::: Int / minSupp > Zero ->
100 [ FrequentItemSet (AllItems db) db minSupp
101 ::: ItemSet item
102 ]
103 allFrequentItemSets db = frequentItemSets (AllItemsAxiom ... fold (unName db)) db
104
105 -- | An association rule.
106 --
107 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
108 --
109 -- > An association rule is an implication of the form
110 -- > X ⇒ Y, where X ⊂ I, Y ⊂ I, and X ∩ Y = ∅.
111 -- >
112 -- > The rule X ⇒ Y holds in the transaction set D with confidence c
113 -- > if c% of transactions in D that contain X also contain Y.
114 -- >
115 -- > The rule X ⇒ Y has support s in the transaction set D
116 -- > if s% of transactions in D contain X ∪ Y.
117 data AssociationRule items db minSupp = AssociationRuleAxiom
118
119 data Association items db minSupp minConf item = Association
120 { associationCause :: FrequentItemSet items db minSupp ::: ItemSet item
121 -- ^ CorrectnessProperty: `associationCause` is a `FrequentItemSet`.
122 -- Because `freqSet` is frequent, and `causeSet` ⊂ `freqSet`.
123 , associationConfidence :: AssociationConfidence items db minSupp minConf ::: Probability
124 , -- CorrectnessProperty: `associationConfidence` is a `Probability`.
125 -- Because P(consequenceSet | causeSet) = P(causeSet ∩ consequenceSet) / P(causeSet)
126 associationConsequence :: FrequentItemSet items db minSupp ::: ItemSet item
127 -- ^ CorrectnessProperty: `associationConsequence` is a `FrequentItemSet`.
128 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
129 }
130 deriving (Show, Eq)
131
132 data AssociationConfidence items db minSupp minConf = AssociationConfidenceAxiom
133
134 -- | By construction in `associationRules`.
135 instance Axiom (AssociationConfidence items db minSupp minConf >= minConf)
136
137 -- | @
138 -- `associationRules` freqSet db minConf
139 -- @
140 -- generates association rules from the given `FrequentItemSet` `(freqSet)`
141 -- in the given `(db)`,
142 -- with a 'Confidence' greater or equal to the given `(minConf)`.
143 --
144 -- Algorithm: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`,
145 -- section 1.1 "Problem Decomposition and Paper Organization", point 2:
146 --
147 -- For a given `FrequentItemSet` @(freqSet)@,
148 -- For every @(causeSet)@ non-empty subset of @(freqSet)@.
149 -- output a rule of the form @(causeSet => (freqSet - causeSet))@
150 -- if in the given @(db)@,
151 -- the ratio @(`support` freqSet)@
152 -- over @(`support` causeSet)@
153 -- is greater or egal to the given @(minConf)@.
154 --
155 -- CorrectnessNote:
156 -- > The association rules we consider are probabilistic in nature.
157 -- > The presence of a rule X → A does not necessarily mean
158 -- > that X + Y → A also holds because the latter may
159 -- > not have minimum support.
160 -- > Similarly, the presence of rules X → Y and Y → Z
161 -- > does not necessarily mean > that X → Z holds
162 -- > because the latter may not have minimum confidence.
163 --
164 -- PerformanceTimeWarning: traverses the given `(db)`
165 -- once for each non-empty subset of the given `(freqSet)`.
166 associationRules ::
167 Ord.Ord item =>
168 FrequentItemSet items db minSupp ::: ItemSet item ->
169 db ::: Transactions item ->
170 minConf ::: Probability ->
171 [ AssociationRule items db minSupp
172 ::: Association items db minSupp minConf item
173 ]
174 associationRules freqSet db minConf =
175 [ AssociationRuleAxiom
176 ... Association
177 { associationCause = FrequentItemSetAxiom ... causeSet
178 , associationConfidence = AssociationConfidenceAxiom ... confidence
179 , -- CorrectnessProperty: `consequenceSet` is frequent.
180 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
181 associationConsequence = FrequentItemSetAxiom ... consequenceSet
182 }
183 | causeSet <- Set.powerSet (unName freqSet) & Set.toList
184 , not (Set.null causeSet)
185 , let consequenceSet = unName freqSet `Set.difference` causeSet
186 , not (Set.null consequenceSet)
187 , let Named causeOcc = support (() ... causeSet) db
188 , confidence <- probability (fromIntegral freqSetOcc % fromIntegral causeOcc)
189 , unName minConf Ord.<= confidence
190 ]
191 where
192 Named freqSetOcc = support freqSet db
193
194 data ClosedFrequentItemSet items db minSupp minSize = ClosedFrequentItemSetAxiom
195
196 -- |
197 -- @
198 -- `closedFrequentItemSets` minSupp minSize items db
199 -- @
200 -- returns a list of the closed frequent itemsets of
201 -- the transactions @(db)@ restricted to the specified @(items)@,
202 -- and such that the number of transactions containing them is greater or equal to @(minSupp)@,
203 -- and such that the size of those transactions is greater or equal to @(minSize)@.
204 -- Each closed frequent itemset is coupled with a sequence of the transactions containing them.
205 closedFrequentItemSets ::
206 forall item db minSize minSupp items.
207 Ord.Ord item =>
208 minSupp ::: Int / minSupp > Zero ->
209 minSize ::: Int / minSize > Zero ->
210 items ::: ItemSet item ->
211 db ::: Transactions item ->
212 [ ClosedFrequentItemSet items db minSupp minSize
213 ::: (ItemSet item, Seq.Seq (ItemSet item))
214 ]
215 closedFrequentItemSets (Named minSupp) (Named minSize) (Named items) (Named db) =
216 loop Set.empty items db
217 where
218 loop previousCFIS nextItems remainingDB
219 | Set.null nextItems = []
220 | otherwise =
221 List.concat
222 [ -- Keep only the cfis which are big enough
223 -- and recurse with the items of nextItems greater than itm,
224 -- and with the transactions which include itm.
225 [ ClosedFrequentItemSetAxiom ... (nextCFIS, nextDB)
226 | fromIntegral (Set.size nextCFIS) Ord.>= minSize
227 ]
228 <> loop nextCFIS (snd (Set.split itm nextItems)) (toList nextDB)
229 | (itm, nextDB) <- nextItemToDB
230 , let nextCFIS = Set.insert itm previousCFIS
231 , let supp = Seq.length nextDB
232 , fromIntegral supp Ord.>= minSupp
233 ]
234 where
235 -- Map each itm of the remaining nextItems
236 -- to the transactions of remainingDB containing it.
237 nextItemToDB =
238 Map.fromListWith
239 (<>)
240 [ (itm, Seq.singleton tx)
241 | tx <- remainingDB
242 , itm <- Set.intersection nextItems tx & Set.toList
243 ]
244 & Map.toList
245
246 -- | `closedFrequentItemSets` applied to all the items of the given transactions.
247 allClosedFrequentItemSets ::
248 Ord.Ord item =>
249 minSupp ::: Int / minSupp > Zero ->
250 minSize ::: Int / minSize > Zero ->
251 db ::: Transactions item ->
252 [ ClosedFrequentItemSet (AllItems db) db minSupp minSize
253 ::: (ItemSet item, Seq.Seq (ItemSet item))
254 ]
255 allClosedFrequentItemSets minSupp minSize db = closedFrequentItemSets minSupp minSize (AllItemsAxiom ... fold (unName db)) db