]> Git — Sourcephile - literate-phylomemy.git/blob - src/Clustering/FrequentItemSet/BruteForce.hs
init
[literate-phylomemy.git] / src / Clustering / FrequentItemSet / BruteForce.hs
1 {-# LANGUAGE RankNTypes #-}
2 {-# OPTIONS_GHC -Wno-deprecations #-}
3
4 -- | Brute-force algorithms related to mining frequent item sets.
5 --
6 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
7 --
8 -- > Given a set of transactions D, the problem of mining
9 -- > association rules is to generate all association rules
10 -- > that have support and confidence greater than the
11 -- > user-specified minimum support (called minsup) and
12 -- > minimum confidence (called minconf) respectively.
13 module Clustering.FrequentItemSet.BruteForce (
14 type ItemSet,
15 Transaction (..),
16 type Support (),
17 type FrequentItemSet (),
18 frequentItemSets,
19 type AllItems (),
20 allFrequentItemSets,
21 type AssociationRule (),
22 type AssociationConfidence (),
23 type Association (..),
24 associationRules,
25 -- type Clusters,
26 -- type ClosedFrequentItemSets (),
27 -- closedFrequentItemSets,
28 -- allClosedFrequentItemSets,
29 ) where
30
31 import Data.Bool
32 import Data.Eq (Eq (..))
33 import Data.Foldable (foldMap)
34 import Data.Function ((&))
35 import Data.Functor ((<&>))
36 import Data.Int (Int)
37 import Data.List qualified as List
38 import Data.Monoid (Monoid (..))
39 import Data.Ord qualified as Ord
40 import Data.Ratio ((%))
41 import Data.Set (Set)
42 import Data.Set qualified as Set
43 import Data.Validity (Validity (..), delve)
44 import GHC.Generics (Generic)
45 import Logic
46 import Logic.Theory.Arithmetic (Zero)
47 import Logic.Theory.Ord
48 import Numeric.Probability
49 import Prelude (fromIntegral, (+))
50 import Text.Show (Show (..))
51
52 type ItemSet = Set
53
54 data Transaction item a = Transaction
55 { transactionData :: a
56 , transactionItems :: ItemSet item
57 }
58 deriving (Show, Generic)
59 instance (Validity (ItemSet item), Validity a) => Validity (Transaction item a) where
60 validate x =
61 mconcat
62 [ delve "transactionData" (transactionData x)
63 , delve "transactionItems" (transactionItems x)
64 ]
65
66 data Support itemSet db = SupportAxiom
67 instance Axiom (Support itemSet db ::: Int >= Zero)
68
69 -- | Return the number of occurrences of @(itemSet)@ in @(db)@.
70 support ::
71 Ord.Ord item =>
72 items ::: ItemSet item ->
73 db ::: [Transaction item a] ->
74 Support itemSet db ::: Int
75 support (Named items) (Named db) =
76 SupportAxiom ... List.length (List.filter (items `Set.isSubsetOf`) (db <&> transactionItems))
77
78 data FrequentItemSet items db minSupp = FrequentItemSetAxiom
79
80 -- | Determine the frequent itemsets.
81 frequentItemSets ::
82 Ord.Ord item =>
83 items ::: ItemSet item ->
84 db ::: [Transaction item a] ->
85 minSupp ::: Int / minSupp > Zero ->
86 [FrequentItemSet items db minSupp ::: ItemSet item]
87 frequentItemSets (Named items) (Named db) (Named minSupp) =
88 [ FrequentItemSetAxiom ... sub
89 | (sub, occ) <- occBySubSet (db <&> transactionItems)
90 , occ Ord.>= minSupp
91 ]
92 where
93 -- TimeEfficiencyWarning: iterate only once through the given `(db)`
94 -- to count the occurence of each subset of the given `(items)`.
95 -- occBySubSet :: [Transaction item a] -> [(ItemSet item, Int)]
96 occBySubSet =
97 List.foldr
98 (\t -> List.map (\(sub, occ) -> (sub, if sub `Set.isSubsetOf` t then occ + 1 else occ)))
99 [(sub, 0) | sub <- Set.powerSet items & Set.toList]
100
101 data AllItems db = AllItemsAxiom
102
103 -- | @
104 -- `allFrequentItemSets` db minSupp = `frequentItemSets` is db minSupp
105 -- @
106 -- where `(is)` gathers all the items present in the given `(db)`.
107 allFrequentItemSets ::
108 Ord.Ord item =>
109 db ::: [Transaction item a] ->
110 minSupp ::: Int / minSupp > Zero ->
111 [ FrequentItemSet (AllItems db) db minSupp
112 ::: ItemSet item
113 ]
114 allFrequentItemSets db =
115 frequentItemSets (AllItemsAxiom ... foldMap transactionItems (unName db)) db
116
117 -- | An association rule.
118 --
119 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
120 --
121 -- > An association rule is an implication of the form
122 -- > X ⇒ Y, where X ⊂ I, Y ⊂ I, and X ∩ Y = ∅.
123 -- >
124 -- > The rule X ⇒ Y holds in the transaction set D with confidence c
125 -- > if c% of transactions in D that contain X also contain Y.
126 -- >
127 -- > The rule X ⇒ Y has support s in the transaction set D
128 -- > if s% of transactions in D contain X ∪ Y.
129 data AssociationRule items db minSupp = AssociationRuleAxiom
130
131 data Association items db minSupp minConf item = Association
132 { associationCause :: FrequentItemSet items db minSupp ::: ItemSet item
133 -- ^ CorrectnessProperty: `associationCause` is a `FrequentItemSet`.
134 -- Because `freqSet` is frequent, and `causeSet` ⊂ `freqSet`.
135 , associationConfidence :: AssociationConfidence items db minSupp minConf ::: Probability
136 , -- CorrectnessProperty: `associationConfidence` is a `Probability`.
137 -- Because P(consequenceSet | causeSet) = P(causeSet ∩ consequenceSet) / P(causeSet)
138 associationConsequence :: FrequentItemSet items db minSupp ::: ItemSet item
139 -- ^ CorrectnessProperty: `associationConsequence` is a `FrequentItemSet`.
140 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
141 }
142 deriving (Show, Eq)
143
144 data AssociationConfidence items db minSupp minConf = AssociationConfidenceAxiom
145
146 -- | By construction in `associationRules`.
147 instance Axiom (AssociationConfidence items db minSupp minConf >= minConf)
148
149 -- | @
150 -- `associationRules` freqSet db minConf
151 -- @
152 -- generates association rules from the given `FrequentItemSet` `(freqSet)`
153 -- in the given `(db)`,
154 -- with a 'Confidence' greater or equal to the given `(minConf)`.
155 --
156 -- Algorithm: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`,
157 -- section 1.1 "Problem Decomposition and Paper Organization", point 2:
158 --
159 -- For a given `FrequentItemSet` @(freqSet)@,
160 -- For every @(causeSet)@ non-empty subset of @(freqSet)@.
161 -- output a rule of the form @(causeSet => (freqSet - causeSet))@
162 -- if in the given @(db)@,
163 -- the ratio @(`support` freqSet)@
164 -- over @(`support` causeSet)@
165 -- is greater or egal to the given @(minConf)@.
166 --
167 -- CorrectnessNote:
168 -- > The association rules we consider are probabilistic in nature.
169 -- > The presence of a rule X → A does not necessarily mean
170 -- > that X + Y → A also holds because the latter may
171 -- > not have minimum support.
172 -- > Similarly, the presence of rules X → Y and Y → Z
173 -- > does not necessarily mean > that X → Z holds
174 -- > because the latter may not have minimum confidence.
175 --
176 -- PerformanceTimeWarning: traverses the given `(db)`
177 -- once for each non-empty subset of the given `(freqSet)`.
178 associationRules ::
179 Ord.Ord item =>
180 FrequentItemSet items db minSupp ::: ItemSet item ->
181 db ::: [Transaction item a] ->
182 minConf ::: Probability ->
183 [ AssociationRule items db minSupp
184 ::: Association items db minSupp minConf item
185 ]
186 associationRules freqSet db minConf =
187 [ AssociationRuleAxiom
188 ... Association
189 { associationCause = FrequentItemSetAxiom ... causeSet
190 , associationConfidence = AssociationConfidenceAxiom ... confidence
191 , -- CorrectnessProperty: `consequenceSet` is frequent.
192 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
193 associationConsequence = FrequentItemSetAxiom ... consequenceSet
194 }
195 | causeSet <- Set.powerSet (unName freqSet) & Set.toList
196 , not (Set.null causeSet)
197 , let consequenceSet = unName freqSet `Set.difference` causeSet
198 , not (Set.null consequenceSet)
199 , let Named causeOcc = support (() ... causeSet) db
200 , confidence <- probability (fromIntegral freqSetOcc % fromIntegral causeOcc)
201 , unName minConf Ord.<= confidence
202 ]
203 where
204 Named freqSetOcc = support freqSet db