1 {-# LANGUAGE RankNTypes #-}
 
   2 {-# OPTIONS_GHC -Wno-deprecations #-}
 
   4 -- | Brute-force algorithms related to mining frequent item sets.
 
   6 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
 
   8 -- > Given a set of transactions D, the problem of mining
 
   9 -- > association rules is to generate all association rules
 
  10 -- > that have support and confidence greater than the
 
  11 -- > user-specified minimum support (called minsup) and
 
  12 -- > minimum confidence (called minconf) respectively.
 
  13 module Clustering.FrequentItemSet.BruteForce (
 
  17   type FrequentItemSet (),
 
  21   type AssociationRule (),
 
  22   type AssociationConfidence (),
 
  23   type Association (..),
 
  25   type ClosedFrequentItemSet (),
 
  26   closedFrequentItemSets,
 
  27   allClosedFrequentItemSets,
 
  31 import Data.Eq (Eq (..))
 
  32 import Data.Foldable (fold)
 
  33 import Data.Function ((&))
 
  35 import Data.List qualified as List
 
  36 import Data.Map.Strict qualified as Map
 
  37 import Data.Ord qualified as Ord
 
  38 import Data.Ratio ((%))
 
  40 import Data.Set qualified as Set
 
  42 import Logic.Theory.Arithmetic (Zero)
 
  43 import Logic.Theory.Ord
 
  44 import Numeric.Probability
 
  45 import Text.Show (Show (..))
 
  46 import Prelude (fromIntegral, (+))
 
  48 -- import Debug.Trace (traceShow)
 
  49 import Data.Semigroup (Semigroup (..))
 
  50 import Data.Sequence qualified as Seq
 
  51 import Data.Tuple (snd)
 
  52 import GHC.IsList (toList)
 
  55 type Transactions item = [ItemSet item]
 
  57 data Support itemSet db = SupportAxiom
 
  58 instance Axiom (Support itemSet db ::: Int >= Zero)
 
  60 -- | Return the number of occurrences of @(itemSet)@ in @(db)@.
 
  63   itemSet ::: ItemSet item ->
 
  64   db ::: Transactions item ->
 
  65   Support itemSet db ::: Int
 
  66 support (Named itemSet) (Named ts) =
 
  67   SupportAxiom ... List.length (List.filter (itemSet `Set.isSubsetOf`) ts)
 
  69 data FrequentItemSet items db minSupp = FrequentItemSetAxiom
 
  71 -- | Determine the frequent itemsets.
 
  74   items ::: ItemSet item ->
 
  75   db ::: Transactions item ->
 
  76   minSupp ::: Int / minSupp > Zero ->
 
  77   [FrequentItemSet items db minSupp ::: ItemSet item]
 
  78 frequentItemSets (Named items) (Named ts) (Named minSupp) =
 
  79   [ FrequentItemSetAxiom ... sub
 
  80   | (sub, occ) <- occBySubSet ts
 
  84     -- TimeEfficiencyWarning: iterate only once through the given `(db)`
 
  85     -- to count the occurence of each subset of the given `(items)`.
 
  86     -- occBySubSet :: Transactions item -> [(ItemSet item, Int)]
 
  89         (\t -> List.map (\(sub, occ) -> (sub, if sub `Set.isSubsetOf` t then occ + 1 else occ)))
 
  90         [(sub, 0) | sub <- Set.powerSet items & Set.toList]
 
  92 data AllItems db = AllItemsAxiom
 
  95 -- `allFrequentItemSets` db minSupp = `frequentItemSets` is db minSupp
 
  97 -- where `(is)` gathers all the items present in the given `(db)`.
 
  98 allFrequentItemSets ::
 
 100   db ::: Transactions item ->
 
 101   minSupp ::: Int / minSupp > Zero ->
 
 102   [ FrequentItemSet (AllItems db) db minSupp
 
 105 allFrequentItemSets ts = frequentItemSets (AllItemsAxiom ... fold (unName ts)) ts
 
 107 -- | An association rule.
 
 109 -- Definition: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`:
 
 111 -- > An association rule is an implication of the form
 
 112 -- > X ⇒ Y, where X ⊂ I, Y ⊂ I, and X ∩ Y = ∅.
 
 114 -- > The rule X ⇒ Y holds in the transaction set D with confidence c
 
 115 -- > if c% of transactions in D that contain X also contain Y.
 
 117 -- > The rule X ⇒ Y has support s in the transaction set D
 
 118 -- > if s% of transactions in D contain X ∪ Y.
 
 119 data AssociationRule items db minSupp = AssociationRuleAxiom
 
 121 data Association items db minSupp minConf item = Association
 
 122   { associationCause :: FrequentItemSet items db minSupp ::: ItemSet item
 
 123   -- ^ CorrectnessProperty: `associationCause` is a `FrequentItemSet`.
 
 124   -- Because `freqSet` is frequent, and `causeSet` ⊂ `freqSet`.
 
 125   , associationConfidence :: AssociationConfidence items db minSupp minConf ::: Probability
 
 126   , -- CorrectnessProperty: `associationConfidence` is a `Probability`.
 
 127     -- Because P(consequenceSet | causeSet) = P(causeSet ∩ consequenceSet) / P(causeSet)
 
 128     associationConsequence :: FrequentItemSet items db minSupp ::: ItemSet item
 
 129   -- ^ CorrectnessProperty: `associationConsequence` is a `FrequentItemSet`.
 
 130   -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
 
 134 data AssociationConfidence items db minSupp minConf = AssociationConfidenceAxiom
 
 136 -- | By construction in `associationRules`.
 
 137 instance Axiom (AssociationConfidence items db minSupp minConf >= minConf)
 
 140 -- `associationRules` freqSet db minConf
 
 142 -- generates association rules from the given `FrequentItemSet` `(freqSet)`
 
 143 -- in the given `(db)`,
 
 144 -- with a 'Confidence' greater or equal to the given `(minConf)`.
 
 146 -- Algorithm: in `Clustering.FrequentItemSet.References.RefAgrawalSrikantApriori`,
 
 147 -- section 1.1 "Problem Decomposition and Paper Organization", point 2:
 
 149 -- For a given `FrequentItemSet` @(freqSet)@,
 
 150 -- For every @(causeSet)@ non-empty subset of @(freqSet)@.
 
 151 -- output a rule of the form @(causeSet => (freqSet - causeSet))@
 
 152 -- if in the given @(db)@,
 
 153 -- the ratio @(`support` freqSet)@
 
 154 -- over @(`support` causeSet)@
 
 155 -- is greater or egal to the given @(minConf)@.
 
 158 -- > The association rules we consider are probabilistic in nature.
 
 159 -- > The presence of a rule X → A does not necessarily mean
 
 160 -- > that X + Y → A also holds because the latter may
 
 161 -- > not have minimum support.
 
 162 -- > Similarly, the presence of rules X → Y and Y → Z
 
 163 -- > does not necessarily mean > that X → Z holds
 
 164 -- > because the latter may not have minimum confidence.
 
 166 -- PerformanceTimeWarning: traverses the given `(db)`
 
 167 -- once for each non-empty subset of the given `(freqSet)`.
 
 170   FrequentItemSet items db minSupp ::: ItemSet item ->
 
 171   db ::: Transactions item ->
 
 172   minConf ::: Probability ->
 
 173   [ AssociationRule items db minSupp
 
 174     ::: Association items db minSupp minConf item
 
 176 associationRules freqSet ts minConf =
 
 177   [ AssociationRuleAxiom
 
 179       { associationCause = FrequentItemSetAxiom ... causeSet
 
 180       , associationConfidence = AssociationConfidenceAxiom ... confidence
 
 181       , -- CorrectnessProperty: `consequenceSet` is frequent.
 
 182         -- Because `freqSet` is frequent, and `consequenceSet` ⊂ `freqSet`.
 
 183         associationConsequence = FrequentItemSetAxiom ... consequenceSet
 
 185   | causeSet <- Set.powerSet (unName freqSet) & Set.toList
 
 186   , not (Set.null causeSet)
 
 187   , let consequenceSet = unName freqSet `Set.difference` causeSet
 
 188   , not (Set.null consequenceSet)
 
 189   , let Named causeOcc = support (() ... causeSet) ts
 
 190   , confidence <- probability (fromIntegral freqSetOcc % fromIntegral causeOcc)
 
 191   , unName minConf Ord.<= confidence
 
 194     Named freqSetOcc = support freqSet ts
 
 196 data ClosedFrequentItemSet items db minSupp minSize = ClosedFrequentItemSetAxiom
 
 198 closedFrequentItemSets ::
 
 199   forall item db minSize minSupp items.
 
 201   items ::: ItemSet item ->
 
 202   db ::: Transactions item ->
 
 203   minSupp ::: Int / minSupp > Zero ->
 
 204   minSize ::: Int / minSize > Zero ->
 
 205   [ ClosedFrequentItemSet items db minSupp minSize
 
 206     ::: (Set item, Seq.Seq (ItemSet item))
 
 208 closedFrequentItemSets (Named items) (Named db) (Named minSupp) (Named minSize) =
 
 209   loop Set.empty items db
 
 211     loop pathFIS pathGT pathDB
 
 212       | Set.null pathGT = []
 
 215             [ [ ClosedFrequentItemSetAxiom ... (cfis, txs)
 
 216               | fromIntegral (Set.size cfis) Ord.>= minSize
 
 218               <> loop cfis (snd (Set.split itm pathGT)) (toList txs)
 
 220             , let cfis = Set.insert itm pathFIS
 
 221             , let supp = Seq.length txs
 
 222             , fromIntegral supp Ord.>= minSupp
 
 225         -- Map each itm of the remaining pathGT
 
 226         -- to the remaining transactions pathDB
 
 231             [ (itm, Seq.singleton tx)
 
 233             , itm <- Set.intersection pathGT tx & Set.toList
 
 237 allClosedFrequentItemSets ::
 
 239   db ::: Transactions item ->
 
 240   minSupp ::: Int / minSupp > Zero ->
 
 241   minSize ::: Int / minSize > Zero ->
 
 242   [ ClosedFrequentItemSet (AllItems db) db minSupp minSize
 
 243     ::: (Set item, Seq.Seq (ItemSet item))
 
 245 allClosedFrequentItemSets ts = closedFrequentItemSets (AllItemsAxiom ... fold (unName ts)) ts