]> Git — Sourcephile - literate-phylomemy.git/blob - src/Clustering/FrequentItemSet/BruteForce.hs
init
[literate-phylomemy.git] / src / Clustering / FrequentItemSet / BruteForce.hs
1 {-# LANGUAGE RankNTypes #-}
2 {-# OPTIONS_GHC -Wno-deprecations #-}
3
4 -- | Brute-force algorithms related to mining frequent item sets.
5 --
6 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
7 --
8 -- > Given a set of transactions D, the problem of mining
9 -- > association rules is to generate all association rules
10 -- > that have support and confidence greater than the
11 -- > user-specified minimum support (called minsup) and
12 -- > minimum confidence (called minconf) respectively.
13 module Clustering.FrequentItemSet.BruteForce (
14 type ItemSet,
15 type Transactions,
16 type Support (),
17 type FrequentItemSet (),
18 frequentItemSets,
19 type AllItems (),
20 allFrequentItemSets,
21 type AssociationRule (),
22 type AssociationConfidence (),
23 type Association (..),
24 associationRules,
25 ) where
26
27 import Data.Bool
28 import Data.Eq (Eq (..))
29 import Data.Foldable (fold)
30 import Data.Function ((&))
31 import Data.Int (Int)
32 import Data.List qualified as List
33 import Data.Ord qualified as Ord
34 import Data.Ratio ((%))
35 import Data.Set (Set)
36 import Data.Set qualified as Set
37 import Logic
38 import Logic.Theory.Arithmetic (Zero)
39 import Logic.Theory.Ord
40
41 -- import Numeric.Natural (Natural)
42 import Numeric.Probability
43 import Text.Show (Show (..))
44 import Prelude (fromIntegral, (+))
45
46 type ItemSet = Set
47 type Transactions item = [ItemSet item]
48
49 data Support itemSet db = SupportAxiom
50 instance Axiom (Support itemSet db ::: Int >= Zero)
51
52 -- | Return the number of occurrences of @(itemSet)@ in @(db)@.
53 support ::
54 Ord.Ord item =>
55 itemSet ::: ItemSet item ->
56 db ::: Transactions item ->
57 Support itemSet db ::: Int
58 support (Named itemSet) (Named ts) =
59 SupportAxiom ... List.length (List.filter (itemSet `Set.isSubsetOf`) ts)
60
61 data FrequentItemSet items db minSupp = FrequentItemSetAxiom
62
63 -- | Determine the frequent itemsets.
64 frequentItemSets ::
65 Ord.Ord item =>
66 items ::: ItemSet item ->
67 db ::: Transactions item ->
68 minSupp ::: Int / minSupp > Zero ->
69 [FrequentItemSet items db minSupp ::: ItemSet item]
70 frequentItemSets (Named items) (Named ts) (Named minSupp) =
71 [ FrequentItemSetAxiom ... sub
72 | (sub, occ) <- occBySubSet ts
73 , occ Ord.>= minSupp
74 ]
75 where
76 -- TimeEfficiencyWarning: iterate only once through the given `(db)`
77 -- to count the occurence of each subset of the given `(items)`.
78 -- occBySubSet :: Transactions item -> [(ItemSet item, Int)]
79 occBySubSet =
80 List.foldr
81 (\t -> List.map (\(sub, occ) -> (sub, if sub `Set.isSubsetOf` t then occ + 1 else occ)))
82 [(sub, 0) | sub <- Set.powerSet items & Set.toList]
83
84 data AllItems db = AllItemsAxiom
85
86 -- | @
87 -- `allFrequentItemSets` db minSupp = `frequentItemSets` is db minSupp
88 -- @
89 -- where `(is)` gathers all the items present in the given `(db)`.
90 allFrequentItemSets ::
91 Ord.Ord item =>
92 db ::: Transactions item ->
93 minSupp ::: Int / minSupp > Zero ->
94 [ FrequentItemSet (AllItems db) db minSupp
95 ::: ItemSet item
96 ]
97 allFrequentItemSets ts = frequentItemSets (AllItemsAxiom ... fold (unName ts)) ts
98
99 -- | An association rule.
100 --
101 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
102 --
103 -- > An association rule is an implication of the form
104 -- > X ⇒ Y, where X ⊂ I, Y ⊂ I, and X ∩ Y = ∅.
105 -- >
106 -- > The rule X ⇒ Y holds in the transaction set D with confidence c
107 -- > if c% of transactions in D that contain X also contain Y.
108 -- >
109 -- > The rule X ⇒ Y has support s in the transaction set D
110 -- > if s% of transactions in D contain X ∪ Y.
111 data AssociationRule items db minSupp = AssociationRuleAxiom
112
113 data Association items db minSupp minConf item = Association
114 { associationCause :: FrequentItemSet items db minSupp ::: ItemSet item
115 -- ^ CorrectnessProperty: `associationCause` is a `FrequentItemSet`.
116 -- Because `freqSet` is frequent, and `causeSet` ⊂ `freqSet`.
117 , associationConfidence :: AssociationConfidence items db minSupp minConf ::: Probability
118 , -- CorrectnessProperty: `associationConfidence` is a `Probability`.
119 -- Because P(consequenceSet | causeSet) = P(causeSet ∩ consequenceSet) / P(causeSet)
120 associationConsequence :: FrequentItemSet items db minSupp ::: ItemSet item
121 -- ^ CorrectnessProperty: `associationConsequence` is a `FrequentItemSet`.
122 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
123 }
124 deriving (Show, Eq)
125
126 data AssociationConfidence items db minSupp minConf = AssociationConfidenceAxiom
127
128 -- | By construction in `associationRules`.
129 instance Axiom (AssociationConfidence items db minSupp minConf <= minConf)
130
131 -- | @
132 -- `associationRules` freqSet db minConf
133 -- @
134 -- generates association rules from the given `FrequentItemSet` `(freqSet)`
135 -- in the given `(db)`,
136 -- with a 'Confidence' greater or equal to the given `(minConf)`.
137 --
138 -- Algorithm: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`,
139 -- section 1.1 "Problem Decomposition and Paper Organization", point 2:
140 --
141 -- For a given `FrequentItemSet` @(freqSet)@,
142 -- For every @(causeSet)@ non-empty subset of @(freqSet)@.
143 -- output a rule of the form @(causeSet => (freqSet - causeSet))@
144 -- if in the given @(db)@,
145 -- the ratio @(`support` freqSet)@
146 -- over @(`support` causeSet)@
147 -- is greater or egal to the given @(minConf)@.
148 --
149 -- CorrectnessNote:
150 -- > The association rules we consider are probabilistic in nature.
151 -- > The presence of a rule X → A does not necessarily mean
152 -- > that X + Y → A also holds because the latter may
153 -- > not have minimum support.
154 -- > Similarly, the presence of rules X → Y and Y → Z
155 -- > does not necessarily mean > that X → Z holds
156 -- > because the latter may not have minimum confidence.
157 --
158 -- PerformanceTimeWarning: traverses the given `(db)`
159 -- once for each non-empty subset of the given `(freqSet)`.
160 associationRules ::
161 Ord.Ord item =>
162 FrequentItemSet items db minSupp ::: ItemSet item ->
163 db ::: Transactions item ->
164 minConf ::: Probability ->
165 [ AssociationRule items db minSupp
166 ::: Association items db minSupp minConf item
167 ]
168 associationRules freqSet ts minConf =
169 [ AssociationRuleAxiom
170 ... Association
171 { associationCause = FrequentItemSetAxiom ... causeSet
172 , associationConfidence = AssociationConfidenceAxiom ... confidence
173 , -- CorrectnessProperty: `consequenceSet` is frequent.
174 -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
175 associationConsequence = FrequentItemSetAxiom ... consequenceSet
176 }
177 | causeSet <- Set.powerSet (unName freqSet) & Set.toList
178 , not (List.null causeSet)
179 , let consequenceSet = unName freqSet `Set.difference` causeSet
180 , not (List.null consequenceSet)
181 , let Named causeOcc = support (() ... causeSet) ts
182 , confidence <- probability (fromIntegral freqSetOcc % fromIntegral causeOcc)
183 , -- CorrectnessProperty: `AxiomAssociationConfidenceGEminConf`
184 unName minConf Ord.<= confidence
185 ]
186 where
187 Named freqSetOcc = support freqSet ts