]> Git — Sourcephile - literate-phylomemy.git/blob - src/Clustering/FrequentItemSet/BruteForce.hs
init
[literate-phylomemy.git] / src / Clustering / FrequentItemSet / BruteForce.hs
1 {-# LANGUAGE RankNTypes #-}
2 {-# OPTIONS_GHC -Wno-deprecations #-}
3
4 -- | Brute-force algorithms related to mining frequent item sets.
5 --
6 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
7 --
8 -- > Given a set of transactions D, the problem of mining
9 -- > association rules is to generate all association rules
10 -- > that have support and confidence greater than the
11 -- > user-specified minimum support (called minsup) and
12 -- > minimum confidence (called minconf) respectively.
13 module Clustering.FrequentItemSet.BruteForce (
14 type ItemSet,
15 type Transactions,
16 type Support (),
17 type FrequentItemSet (),
18 frequentItemSets,
19 type AllItems (),
20 allFrequentItemSets,
21 type AssociationRule (),
22 type AssociationConfidence (),
23 type Association (..),
24 associationRules,
25 type ClosedFrequentItemSet (),
26 closedFrequentItemSets,
27 allClosedFrequentItemSets,
28 ) where
29
30 import Data.Bool
31 import Data.Eq (Eq (..))
32 import Data.Foldable (fold)
33 import Data.Function ((&))
34 import Data.Int (Int)
35 import Data.List qualified as List
36 import Data.Map.Strict qualified as Map
37 import Data.Ord qualified as Ord
38 import Data.Ratio ((%))
39 import Data.Set (Set)
40 import Data.Set qualified as Set
41 import Logic
42 import Logic.Theory.Arithmetic (Zero)
43 import Logic.Theory.Ord
44 import Numeric.Probability
45 import Text.Show (Show (..))
46 import Prelude (fromIntegral, (+))
47
48 -- import Debug.Trace (traceShow)
49 import Data.Semigroup (Semigroup (..))
50 import Data.Sequence qualified as Seq
51 import Data.Tuple (snd)
52 import GHC.IsList (toList)
53
54 type ItemSet = Set
55 type Transactions item = [ItemSet item]
56
57 data Support itemSet db = SupportAxiom
58 instance Axiom (Support itemSet db ::: Int >= Zero)
59
60 -- | Return the number of occurrences of @(itemSet)@ in @(db)@.
61 support ::
62 Ord.Ord item =>
63 itemSet ::: ItemSet item ->
64 db ::: Transactions item ->
65 Support itemSet db ::: Int
66 support (Named itemSet) (Named ts) =
67 SupportAxiom ... List.length (List.filter (itemSet `Set.isSubsetOf`) ts)
68
69 data FrequentItemSet items db minSupp = FrequentItemSetAxiom
70
71 -- | Determine the frequent itemsets.
72 frequentItemSets ::
73 Ord.Ord item =>
74 items ::: ItemSet item ->
75 db ::: Transactions item ->
76 minSupp ::: Int / minSupp > Zero ->
77 [FrequentItemSet items db minSupp ::: ItemSet item]
78 frequentItemSets (Named items) (Named ts) (Named minSupp) =
79 [ FrequentItemSetAxiom ... sub
80 | (sub, occ) <- occBySubSet ts
81 , occ Ord.>= minSupp
82 ]
83 where
84 -- TimeEfficiencyWarning: iterate only once through the given `(db)`
85 -- to count the occurence of each subset of the given `(items)`.
86 -- occBySubSet :: Transactions item -> [(ItemSet item, Int)]
87 occBySubSet =
88 List.foldr
89 (\t -> List.map (\(sub, occ) -> (sub, if sub `Set.isSubsetOf` t then occ + 1 else occ)))
90 [(sub, 0) | sub <- Set.powerSet items & Set.toList]
91
92 data AllItems db = AllItemsAxiom
93
94 -- | @
95 -- `allFrequentItemSets` db minSupp = `frequentItemSets` is db minSupp
96 -- @
97 -- where `(is)` gathers all the items present in the given `(db)`.
98 allFrequentItemSets ::
99 Ord.Ord item =>
100 db ::: Transactions item ->
101 minSupp ::: Int / minSupp > Zero ->
102 [ FrequentItemSet (AllItems db) db minSupp
103 ::: ItemSet item
104 ]
105 allFrequentItemSets ts = frequentItemSets (AllItemsAxiom ... fold (unName ts)) ts
106
107 -- | An association rule.
108 --
109 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
110 --
111 -- > An association rule is an implication of the form
112 -- > X ⇒ Y, where X ⊂ I, Y ⊂ I, and X ∩ Y = ∅.
113 -- >
114 -- > The rule X ⇒ Y holds in the transaction set D with confidence c
115 -- > if c% of transactions in D that contain X also contain Y.
116 -- >
117 -- > The rule X ⇒ Y has support s in the transaction set D
118 -- > if s% of transactions in D contain X ∪ Y.
119 data AssociationRule items db minSupp = AssociationRuleAxiom
120
121 data Association items db minSupp minConf item = Association
122 { associationCause :: FrequentItemSet items db minSupp ::: ItemSet item
123 -- ^ CorrectnessProperty: `associationCause` is a `FrequentItemSet`.
124 -- Because `freqSet` is frequent, and `causeSet` ⊂ `freqSet`.
125 , associationConfidence :: AssociationConfidence items db minSupp minConf ::: Probability
126 , -- CorrectnessProperty: `associationConfidence` is a `Probability`.
127 -- Because P(consequenceSet | causeSet) = P(causeSet ∩ consequenceSet) / P(causeSet)
128 associationConsequence :: FrequentItemSet items db minSupp ::: ItemSet item
129 -- ^ CorrectnessProperty: `associationConsequence` is a `FrequentItemSet`.
130 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
131 }
132 deriving (Show, Eq)
133
134 data AssociationConfidence items db minSupp minConf = AssociationConfidenceAxiom
135
136 -- | By construction in `associationRules`.
137 instance Axiom (AssociationConfidence items db minSupp minConf >= minConf)
138
139 -- | @
140 -- `associationRules` freqSet db minConf
141 -- @
142 -- generates association rules from the given `FrequentItemSet` `(freqSet)`
143 -- in the given `(db)`,
144 -- with a 'Confidence' greater or equal to the given `(minConf)`.
145 --
146 -- Algorithm: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`,
147 -- section 1.1 "Problem Decomposition and Paper Organization", point 2:
148 --
149 -- For a given `FrequentItemSet` @(freqSet)@,
150 -- For every @(causeSet)@ non-empty subset of @(freqSet)@.
151 -- output a rule of the form @(causeSet => (freqSet - causeSet))@
152 -- if in the given @(db)@,
153 -- the ratio @(`support` freqSet)@
154 -- over @(`support` causeSet)@
155 -- is greater or egal to the given @(minConf)@.
156 --
157 -- CorrectnessNote:
158 -- > The association rules we consider are probabilistic in nature.
159 -- > The presence of a rule X → A does not necessarily mean
160 -- > that X + Y → A also holds because the latter may
161 -- > not have minimum support.
162 -- > Similarly, the presence of rules X → Y and Y → Z
163 -- > does not necessarily mean > that X → Z holds
164 -- > because the latter may not have minimum confidence.
165 --
166 -- PerformanceTimeWarning: traverses the given `(db)`
167 -- once for each non-empty subset of the given `(freqSet)`.
168 associationRules ::
169 Ord.Ord item =>
170 FrequentItemSet items db minSupp ::: ItemSet item ->
171 db ::: Transactions item ->
172 minConf ::: Probability ->
173 [ AssociationRule items db minSupp
174 ::: Association items db minSupp minConf item
175 ]
176 associationRules freqSet ts minConf =
177 [ AssociationRuleAxiom
178 ... Association
179 { associationCause = FrequentItemSetAxiom ... causeSet
180 , associationConfidence = AssociationConfidenceAxiom ... confidence
181 , -- CorrectnessProperty: `consequenceSet` is frequent.
182 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
183 associationConsequence = FrequentItemSetAxiom ... consequenceSet
184 }
185 | causeSet <- Set.powerSet (unName freqSet) & Set.toList
186 , not (Set.null causeSet)
187 , let consequenceSet = unName freqSet `Set.difference` causeSet
188 , not (Set.null consequenceSet)
189 , let Named causeOcc = support (() ... causeSet) ts
190 , confidence <- probability (fromIntegral freqSetOcc % fromIntegral causeOcc)
191 , unName minConf Ord.<= confidence
192 ]
193 where
194 Named freqSetOcc = support freqSet ts
195
196 data ClosedFrequentItemSet items db minSupp minSize = ClosedFrequentItemSetAxiom
197
198 closedFrequentItemSets ::
199 forall item db minSize minSupp items.
200 Ord.Ord item =>
201 items ::: ItemSet item ->
202 db ::: Transactions item ->
203 minSupp ::: Int / minSupp > Zero ->
204 minSize ::: Int / minSize > Zero ->
205 [ ClosedFrequentItemSet items db minSupp minSize
206 ::: (Set item, Seq.Seq (ItemSet item))
207 ]
208 closedFrequentItemSets (Named items) (Named db) (Named minSupp) (Named minSize) =
209 loop Set.empty items db
210 where
211 loop pathFIS pathGT pathDB
212 | Set.null pathGT = []
213 | otherwise =
214 List.concat
215 [ [ ClosedFrequentItemSetAxiom ... (cfis, txs)
216 | fromIntegral (Set.size cfis) Ord.>= minSize
217 ]
218 <> loop cfis (snd (Set.split itm pathGT)) (toList txs)
219 | (itm, txs) <- gtDB
220 , let cfis = Set.insert itm pathFIS
221 , let supp = Seq.length txs
222 , fromIntegral supp Ord.>= minSupp
223 ]
224 where
225 -- Map each itm of the remaining pathGT
226 -- to the remaining transactions pathDB
227 -- containing itm.
228 gtDB =
229 Map.fromListWith
230 (<>)
231 [ (itm, Seq.singleton tx)
232 | tx <- pathDB
233 , itm <- Set.intersection pathGT tx & Set.toList
234 ]
235 & Map.toList
236
237 allClosedFrequentItemSets ::
238 Ord.Ord item =>
239 db ::: Transactions item ->
240 minSupp ::: Int / minSupp > Zero ->
241 minSize ::: Int / minSize > Zero ->
242 [ ClosedFrequentItemSet (AllItems db) db minSupp minSize
243 ::: (Set item, Seq.Seq (ItemSet item))
244 ]
245 allClosedFrequentItemSets ts = closedFrequentItemSets (AllItemsAxiom ... fold (unName ts)) ts