-- SPDX-License-Identifier: BSD-3-Clause
-- SPDX-FileCopyrightText: 2010 Alexandre Termier
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnboxedTuples #-}
{-# OPTIONS_GHC -Wno-unused-imports #-}

-- | Original LCM algorithm from:
--
-- \"An efficient algorithm for enumerating closed patterns in transaction databases\".
-- Takeaki Uno, Tatsuya Asai, Yuzo Uchida, and Hiroki Arimura.
-- In Discovery Science, pages 16–31, 2004.
-- https://research.nii.ac.jp/~uno/papers/lcm_ds04.pdf
-- https://www.annas-archive.org/scidb/10.1007/978-3-540-30214-8_2
--
-- Code adapted from HLCM:
--
-- \"HLCM: a first experiment on parallel data mining with Haskell\".
-- Alexandre Termier & Benjamin Négrevergne & Simon Marlow & Satnam Singh.
-- https://lig-membres.imag.fr/termier/HLCM/hlcm.pdf
-- https://hackage.haskell.org/package/hlcm
--
-- Possible future work:
--
-- \"Discovering closed frequent itemsets on multicore:
-- parallelizing computations and optimizing memory accesses\".
-- Benjamin Négrevergne & Alexandre Termier & Jean-François Méhaut & Takeaki Uno.
-- https://people.irisa.fr/Alexandre.Termier/publis/2010_negrevergne_hppddm.pdf
module Clustering.FrequentItemSet.LCM (
  type ItemSet,
  Transaction (..),
  type FrequentItemSets (),
  frequentItemSets,
  type AllItems (),
  allFrequentItemSets,
  type Clusters,
  type ClosedFrequentItemSets (),
  closedFrequentItemSets,
  allClosedFrequentItemSets,
  itemToSupport,
  runLCM,
  type ItemSupport,
) where

import Clustering.FrequentItemSet.BruteForce (ItemSet, Support (), Transaction (..))
import Data.Bifunctor (second)
import Data.Bool
import Data.Eq (Eq (..))
import Data.Foldable (fold, foldMap)
import Data.Function (id, ($), (&), (.))
import Data.Functor ((<$>), (<&>))
import Data.Int (Int)
import Data.Monoid (Monoid (..))
import Data.Ord (Down, Ord (..), comparing)
import Data.Ratio ((%))
import Data.Semigroup (Semigroup (..))
import Data.Sequence qualified as Seq
import Data.Tuple (fst, snd)
import Data.Validity (Validity (..), delve)
import Debug.Pretty.Simple (pTraceShow)
import Debug.Trace
import GHC.Generics (Generic)
import GHC.IsList (toList)
import GHC.Stack (HasCallStack)
import Logic
import Logic.Theory.Arithmetic (Zero)
import Logic.Theory.Ord qualified as Logic.Ord
import Numeric.Probability
import Text.Show (Show (..))
import Prelude (Enum, Num, error, fromIntegral, (+), (-))

import Control.Exception (evaluate)
import Control.Monad
import Control.Monad.ST
import Control.Parallel
import Control.Parallel.Strategies
import Data.Array.Base
import Data.Array.ST
import Data.Array.Unboxed
import Data.ByteString.Char8 qualified as L
import Data.List
import Data.List qualified as List
import Data.Map.Strict (Map)
import Data.Map.Strict qualified as Map
import Data.Maybe (catMaybes, fromJust, isNothing, maybeToList)
import Data.Set (Set)
import Data.Set qualified as Set
import Data.Vector qualified as V
import GHC.Exts hiding (Item)
import GHC.ST

-- | Phantom-typed proof token tying a result of `frequentItemSets`
-- to the @items@, @db@, @minSupp@ and @minSize@ it was computed from.
data FrequentItemSets items db minSupp minSize = FrequentItemSetsAxiom

-- |
-- @
-- `frequentItemSets` minSupp minSize items db
-- @
--
-- returns a list of the closed frequent itemsets of
-- the transactions @(db)@ restricted to the specified @(items)@,
-- and such that the number of transactions containing them is greater or equal to @(minSupp)@,
-- and such that the size of those transactions is greater or equal to @(minSize)@.
-- Each closed frequent itemset is coupled with the sequence of the transactions containing them.
frequentItemSets ::
  forall a item db minSize minSupp items.
  Ord item =>
  Show item =>
  Show a =>
  minSupp ::: Int / minSupp Logic.Ord.> Zero ->
  minSize ::: Int / minSize Logic.Ord.> Zero ->
  items ::: ItemSet item ->
  db ::: [Transaction item a] ->
  FrequentItemSets items db minSupp minSize ::: Clusters item a
frequentItemSets (Named minSupp) (Named minSize) (Named items) (Named db) =
  FrequentItemSetsAxiom ...
    Map.fromDistinctAscList (loop Set.empty items db)
  where
    -- Lexicographic depth-first search for frequent item sets.
    --
    -- `previousFIS`: the itemset accumulated so far on this branch.
    -- `nextItems`: the candidate items still allowed to extend it.
    -- `previousTxns`: the transactions supporting `previousFIS`.
    loop previousFIS nextItems previousTxns
      | Set.null nextItems = []
      | otherwise =
          -- pTraceShow (["LCM", "frequentItemSets", "loop"], (("previousFIS", previousFIS), ("nextItems", nextItems), ("previousTxns", previousTxns))) $
          -- Map each item of `nextItems` to its transactions in `previousTxns`.
          let nextItemToTxns = itemToTxns nextItems previousTxns
           in (`List.concatMap` Map.toList nextItemToTxns) \(nextItem, nextTxns) ->
                -- Keep only the itemsets which are supported by enough transactions.
                if minSupp <= fromIntegral (Seq.length nextTxns)
                  then
                    let nextFIS = Set.insert nextItem previousFIS
                     in -- Keep only the itemsets which have already gathered enough items.
                        [ (nextFIS, nextTxns)
                        | minSize <= fromIntegral (Set.size nextFIS)
                        ]
                          <>
                          -- Recurse with each item of `nextItems` greater than `nextItem`
                          -- (to traverse the frequent item sets as a tree instead of a poset lattice),
                          -- and with the transactions containing `nextItem`.
                          loop nextFIS (Set.split nextItem nextItems & snd) (nextTxns & toList)
                  else []

-- | @
-- `itemToTxns` items db
-- @
-- maps each item of `items` to the transactions of `db` containing it.
--
-- This maps from an "horizontal" representation to a "vertical" one,
-- itself mapping to an "horizontal" one.
-- See p.8 of https://www.lirmm.fr/~lazaar/imagina/NL-DM-IMAGINA1819-part1.pdf
--
-- It's a kind of occurrence deliver.
-- p.4 of http://osdm.uantwerpen.be/papers/p77-uno.pdf
itemToTxns ::
  Ord item =>
  ItemSet item ->
  [Transaction item a] ->
  Map.Map item (Seq.Seq (Transaction item a))
itemToTxns itms txs =
  Map.fromListWith
    (<>)
    [ (itm, Seq.singleton tx)
    | tx <- txs
    , -- Only items from `itms` are indexed; other items of the
      -- transaction are ignored.
      itm <- Set.intersection itms (transactionItems tx) & Set.toList
    ]

-- | Phantom-typed proof token: "all the items of database @db@".
data AllItems db = AllItemsAxiom

-- | `frequentItemSets` applied to all the items of the given transactions.
allFrequentItemSets ::
  Ord item =>
  Show item =>
  Show a =>
  minSupp ::: Int / minSupp Logic.Ord.> Zero ->
  minSize ::: Int / minSize Logic.Ord.> Zero ->
  db ::: [Transaction item a] ->
  FrequentItemSets (AllItems db) db minSupp minSize ::: Clusters item a
allFrequentItemSets minSupp minSize db =
  frequentItemSets
    minSupp
    minSize
    (AllItemsAxiom ... foldMap transactionItems (unName db))
    db

-- | Phantom-typed proof token for results of `closedFrequentItemSets`.
data ClosedFrequentItemSets items db minSupp minSize = ClosedFrequentItemSetsAxiom

-- | Result type of `frequentItemSets`:
-- each frequent itemset mapped to the transactions containing it.
type Clusters item a =
  Map.Map
    (ItemSet item)
    (Seq.Seq (Transaction item a))

-- | Closed-frequent-itemset mining (LCM algorithm) over the
-- transactions @(db)@ restricted to @(items)@.
-- Returns each closed frequent itemset together with its support.
closedFrequentItemSets ::
  forall item db minSize minSupp items.
  HasCallStack =>
  Ord item =>
  Show item =>
  minSupp ::: Int / minSupp Logic.Ord.> Zero ->
  minSize ::: Int / minSize Logic.Ord.> Zero ->
  items ::: ItemSet item ->
  db ::: [Set item] ->
  ClosedFrequentItemSets items db minSupp minSize ::: [(ItemSupport, ItemSet item)]
closedFrequentItemSets (Named minSupp) (Named minSize) (Named items) (Named db) =
  ClosedFrequentItemSetsAxiom ...
    runLCM items minSupp minSize db

-- | `closedFrequentItemSets` applied to all the items of the given transactions.
allClosedFrequentItemSets ::
  HasCallStack =>
  Ord item =>
  Show item =>
  minSupp ::: Int / minSupp Logic.Ord.> Zero ->
  minSize ::: Int / minSize Logic.Ord.> Zero ->
  db ::: [Set item] ->
  ClosedFrequentItemSets (AllItems db) db minSupp minSize ::: [(ItemSupport, ItemSet item)]
allClosedFrequentItemSets minSupp minSize db =
  closedFrequentItemSets
    minSupp
    minSize
    (AllItemsAxiom ... fold (unName db))
    db

-- |
--
-- Library for using the LCM algorithm in order to compute closed frequent pattern.
-- Input must be a transaction database, either in text format (as a ByteString)
-- or in @[[Item]]@ format, where @Item = Int@.
--
-- Several bencharking functions allowing to tune parallel strategy used and depth
-- cutoff are also provided.
type ItemSupport = Int

-- type Item = Int
type Weight = Int

-----------------------------------------------------------------
-- LCM functions
-----------------------------------------------------------------

-- | Dense re-numbering of the items kept after support filtering:
-- rank 0 is the most supported item.
type ItemRank = Int

-- | Get the data as a matrix of Items, parses it and
-- and executes LCM to discover closed frequent Itemsets.
runLCM ::
  forall item.
  Show item =>
  HasCallStack =>
  Ord item =>
  Set item ->
  ItemSupport ->
  Int ->
  -- | The transaction database as matrix of Items (List of List)
  [Set item] ->
  -- | Output: list of closed frequent Itemsets
  [(ItemSupport, Set item)]
runLCM items minSupp minSize db =
  let
    -- Items kept after support filtering, sorted by decreasing support.
    itemToSupp :: [(item, ItemSupport)]
    itemToSupp =
      itemToSupport items db
        & Map.toList
        & List.filter ((>= minSupp) . snd)
        & List.sortBy (comparing (Down . snd))

    itemsSize :: Int
    itemsSize = List.length itemToSupp

    -- Rank of each kept item = its position in the sorted `itemToSupp`.
    -- NOTE(review): this is O(n²) (a `findIndices` scan per item) and
    -- relies on the partial `List.head`; `head` is safe here only because
    -- `i` is drawn from `itemToSupp` itself, so at least one index matches.
    itemToRank :: Map.Map item ItemRank
    itemToRank =
      Map.fromList
        [ (i, List.head $ List.findIndices ((== i) . fst) itemToSupp)
        | (i, _) <- itemToSupp
        ]

    -- Rewrites the database to use `ItemRank` instead of `item`
    rankDB :: [Set ItemRank]
    rankDB =
      [ Set.fromList
          [ rank
          | i <- tx & Set.toList
          , -- Items whose support is lower than `minSupp`
            -- have been filtered-out in `itemToSupp`,
            -- hence do not have a rank.
            rank <- Map.lookup i itemToRank & maybeToList
          ]
      | tx <- db
      ]

    -- The ranked database folded into a lexicographic tree,
    -- each transaction inserted with weight 1 and no "coreI" item (-1).
    dbLT = List.foldr (\tx acc -> insertLT (tx & Set.toList) (-1) 1 acc) Nil rankDB

    -- Inverse of `itemToRank`, as an array for O(1) lookup.
    rankToItem :: Array ItemRank item
    rankToItem =
      List.zip [0 ..] (fst <$> itemToSupp)
        & array (0, itemsSize - 1)

    -- Maps the mined rank-itemsets back to the original item type.
    unrank :: [(ItemSupport, Set ItemRank)] -> [(ItemSupport, Set item)]
    unrank = List.map $ second $ Set.map (rankToItem `unsafeAt`)
   in
    -- One top-level LCM branch per candidate rank,
    -- evaluated in parallel by batches of 8 sparks.
    [ lcmLoop minSupp minSize 1 dbLT Set.empty candidateRank (rankToSuppLT items dbLT) items
    | candidateRank <- [0 .. Set.size items - 1]
    ]
      & parBuffer 8 rdeepseq
      & runEval
      & List.concat
      & unrank

-- |
-- For a transaction database, a closed frequent Itemset, and a candidate item
-- for extension of this closed frequent Itemset, recursively computes all
-- the successor closed frequent Itemsets by PPC-extension.
lcmLoop ::
  Show item =>
  ItemSupport ->
  Int ->
  -- | Current depth in the search tree (for parallel optimisation purposes)
  Int ->
  -- | Transaction database.
  LexicoTreeItem ->
  -- | Input closed frequent Itemset.
  Set ItemRank ->
  -- | Candidate to extend the closed frequent Itemset above.
  ItemRank ->
  -- | Array associating each item with its frequency
  UArray ItemRank ItemSupport ->
  -- | Maximal item
  Set item ->
  -- | Result : list of closed frequent Itemsets. Each result is a list of Items, the head of the list being the frequency of the item.
  [(ItemSupport, Set ItemRank)]
lcmLoop minSupp minSize depth previousDB previousRanks candidateRank rankToSupp items =
  let
    -- HLCM: line 1: CDB = project and reduce DB w.r.t. P and limit
    -- Reduce database by eliminating:
    -- - all items greater than `candidateRank`,
    -- - and all items with zero support.
    reducedDB = projectAndReduce candidateRank rankToSupp previousDB
    -- HLCM: line 2: Compute frequencies of items in CDB
    -- Compute items occurrences in reduced database.
    reducedRankToSupp = rankToSuppLT items reducedDB
    -- HLCM: line 3: CP = 100% frequent items in CDB
    -- Check which items actually appear in reduced database.
    candidateSupp = rankToSupp ! candidateRank
    -- HLCM: line 6: Candidates = frequent items of CDB that are not in CP
    -- Compute 100% frequent items, future candidates, and unfrequent items.
    (closedFreqRanks, candidateRanks, unfreqRanks) =
      computeCandidates minSupp candidateSupp items reducedRankToSupp
   in
    --pTraceShow (["lcmLoop"], minSupp, minSize, depth, previousDB, previousRanks, candidateRank, rankToSupp, items) $
    -- HLCM: line 4: if max(CP) = limit then
    if not (List.null closedFreqRanks) -- if there is a result ...
      && last closedFreqRanks <= candidateRank -- ...and if it is OK to extend it
      then
        let
          -- HLCM: line 5: P' = P ∪ CP
          -- Result closed frequent Itemset = input closed frequent Itemset + 100% frequent Items
          closedItemset = previousRanks <> Set.fromList closedFreqRanks
          -- HLCM: line 8: for all e ∈ Candidates, e ≤ limit do
          -- Only candidates with value lower than input candidateRank
          -- can be used for further extension on this branch.
          smallCandidates = List.takeWhile (< candidateRank) candidateRanks
         in
          [ (candidateSupp, closedItemset)
          | minSize <= fromIntegral (Set.size closedItemset)
          ]
            <> if not (List.null smallCandidates) -- ... and if we have at least 1 possible extension
              then
                let
                  -- Update items occurrences table by suppressing:
                  -- - 100% frequent items,
                  -- - and unfrequent items.
                  newRankToSupp = suppressItems reducedRankToSupp closedFreqRanks unfreqRanks
                  loop newCandidate = lcmLoop minSupp minSize (depth + 1) reducedDB closedItemset newCandidate newRankToSupp items
                 in
                  -- Recursively extend the candidates
                  if 3 < depth -- create parallel sparks only for low search space depth
                    then List.concat $ runEval $ parBuffer 2 rdeepseq $ List.map loop smallCandidates
                    else List.concatMap loop smallCandidates
              else []
      else []

-- |
-- For a transaction database of type [[item]], compute the frequency
-- of each item and return an array (item, frequency).
itemToSupport ::
  Ord item =>
  Set item ->
  [Set item] ->
  Map.Map item ItemSupport
itemToSupport items db =
  Map.fromListWith
    (+)
    [ (itm, 1)
    | tx <- db
    , itm <- Set.intersection items tx & Set.toList
    ]

{-
accumArray (+) 0 (0, Set.size items)
  [ (idx, 1)
  | is <- db
  , i <- is & Set.toList
  , idx <- Set.lookupIndex i items & maybeToList
  ]
-}

-- XXX PERF: must be bad : the array is converted to list (one copy),
-- then this list is sorted (more copies of small lists), and at
-- last a new array is created...
-- Try to improve this with a mutable array and more "in place" spirit...

-- |
-- For a given itemset being extended by a given candidate, compute :
-- - the closure of this itemset
-- - and the candidates for further extension.
computeCandidates ::
  ItemSupport ->
  ItemSupport ->
  Set item ->
  UArray ItemRank ItemSupport ->
  -- (100% frequent items == closure, candidates for further extension, unfrequent items)
  ([ItemRank], [ItemRank], [ItemRank])
computeCandidates minSupp candidateSupp items rankToSupp =
  let
    -- Among the items present in the reduced database (support > 0),
    -- split frequent from unfrequent.
    (frequentItems, unfreqItems) =
      List.partition
        (\i -> rankToSupp ! i >= minSupp)
        [i | i <- [0 .. Set.size items - 1], rankToSupp ! i > 0]
    -- Items whose support equals the candidate's support are 100% frequent
    -- in the projected database: they belong to the closure.
    (closedFrequentRanks, candidateRanks) =
      List.partition (\i -> rankToSupp ! i == candidateSupp) frequentItems
   in
    (closedFrequentRanks, candidateRanks, unfreqItems)

-- |
-- Modifies an array associating Items with their frequency, in order to
-- give a frequency of 0 to a given list of Items.
--
-- NB : for performance reasons, this is REALLY a modification, made with unsafe operations.
suppressItems ::
  -- | Array associating an item with its frequency
  UArray ItemRank ItemSupport ->
  -- | List of 100% frequent Items
  [ItemRank] ->
  -- | List of unfrequent Items
  [ItemRank] ->
  -- | Initial array, with frequencies of 100% frequent Items
  -- and unfrequent Items set to 0.
  UArray ItemRank ItemSupport
suppressItems rankToSupp closedRanks unfreqRanks =
  runST do
    -- Can be used in multithread because no concurrent write
    -- NOTE(review): `unsafeThaw` mutates `rankToSupp` in place;
    -- safe only if the caller never reuses the original array — verify.
    arr <- unsafeThaw rankToSupp :: ST s (STUArray s ItemRank ItemSupport)
    forM_ closedRanks \i -> writeArray arr i 0
    forM_ unfreqRanks \i -> writeArray arr i 0
    -- Can be used in multithread because no concurrent write
    unsafeFreeze arr

-----------------------------------------------------------------
-- LEXICOGRAPHIC TREE MANIPULATION
-----------------------------------------------------------------

-- |
-- Creates a new, reduced transaction database by eliminating all items
-- greater than @candidateRank@ item, and all infrequent Items.
projectAndReduce ::
  -- | Candidate item, on which the projection is made
  ItemRank ->
  -- | Array associating each item with its frequency in
  -- original transaction database.
  UArray ItemRank ItemSupport ->
  -- | Original transaction database
  LexicoTreeItem ->
  -- | Result : Reduced transaction database
  LexicoTreeItem
projectAndReduce !candidateRank rankToSupp = go
  where
    go Nil = Nil
    go (Node e suiv alt w)
      -- Items are stored in increasing order along `suiv`:
      -- everything above the candidate is dropped.
      | e > candidateRank = Nil
      -- Reached the candidate itself: keep its subtree,
      -- cleaned of infrequent items.
      | e == candidateRank =
          let !(suiv', addWeight) = filterInfrequent suiv rankToSupp
           in Node e suiv' Nil (w + addWeight)
      | otherwise =
          let
            !alt' = go alt
            !suiv' = go suiv
           in
            if rankToSupp ! e > 0
              -- Item is frequent: keep the node, re-attaching whichever
              -- of the reduced branches survived.
              then
                if notNil suiv' && notNil alt'
                  then Node e suiv' alt' 0
                  else
                    if notNil suiv'
                      then Node e suiv' Nil 0
                      else alt'
              -- Item is infrequent: splice it out, merging its branches.
              else
                if notNil suiv' && notNil alt'
                  then mergeAlts suiv' alt'
                  else
                    if notNil suiv'
                      then suiv'
                      else alt'

-- |
-- Suppress all infrequent Items from a transaction database expressed as
-- lexicographic tree, and returns a new lexicographic tree.
filterInfrequent ::
  -- | Original transaction database
  LexicoTreeItem ->
  -- | Array associating each item with its frequency in
  -- original transaction database. In this setting,
  -- an infrequent item as a frequency of 0 (because of preprocessing by
  -- ' suppressItems ').
  UArray ItemRank ItemSupport ->
  -- | Result : (transaction database without infrequent Items, weight to report in parent nodes)
  (LexicoTreeItem, Weight)
filterInfrequent Nil _ = (Nil, 0)
filterInfrequent (Node e suiv alt w) occs
  -- Frequent item: keep the node, absorbing the weight reported by the
  -- pruned `suiv` branch; the `alt` report bubbles up to our parent.
  | occs ! e > 0 = (Node e suiv' alt' (w + ws), wa)
  -- Infrequent item: drop the node and report its accumulated weight.
  | notNil suiv' && notNil alt' = (mergeAlts suiv' alt', w')
  | notNil alt' = (alt', w')
  | notNil suiv' = (suiv', w')
  | otherwise = (Nil, w')
  where
    w' = w + ws + wa
    !(suiv', ws) = filterInfrequent suiv occs
    !(alt', wa) = filterInfrequent alt occs

-- | True iff the tree is not the empty tree `Nil`.
{-# INLINE notNil #-}
notNil :: LexicoTreeItem -> Bool
notNil Nil = False
notNil _ = True

-- |
-- Occurence delivering:
-- Map each item of the given database to its support.
rankToSuppLT ::
  Set item ->
  -- | Transaction database (in lexicographic tree format)
  LexicoTreeItem ->
  -- | Result : array associating each item to its frequency.
  UArray ItemRank ItemSupport
rankToSuppLT items dbLT =
  runST do
    arr <- newArray_ (0, Set.size items - 1)
    -- TODO: this workaround should no longer be necessary
    -- Creates an empty array : each item starts with frequency 0
    -- workaround for http://hackage.haskell.org/trac/ghc/ticket/3586
    forM_ [0 .. Set.size items - 1] $ \i -> unsafeWrite arr i 0
    -- Compute frequencies for each item by efficient tree traversal
    _ <- traverseLT dbLT arr
    unsafeFreeze arr

-- |
-- Efficient traversal of the transaction database as a lexicographic tree.
-- Items frequencies are updated on the fly.
traverseLT ::
  forall s.
  -- | Transaction database
  LexicoTreeItem ->
  -- | Array associating each item with its frequency. UPDATED by this function !
  STUArray s ItemRank ItemSupport ->
  ST s ()
traverseLT tree arr = ST $ \s ->
  case go tree s of
    (# s', _ #) -> (# s', () #)
  where
    -- Returns the total weight of the subtree (item weights summed over
    -- `suiv` chains and `alt` branches) as an unboxed Int#.
    go :: LexicoTreeItem -> State# s -> (# State# s, Int# #)
    go Nil s = (# s, 0# #)
    go (Node item child alt w@(I# w#)) s0 =
      case go child s0 of
        (# s1, childw #) ->
          case go alt s1 of
            (# s2, altw #) ->
              -- Hand-threaded ST: read the current support of `item`,
              -- then add this node's weight plus its `child` subtree weight.
              -- NOTE(review): the state token `_s3` from the read is
              -- discarded and the write reuses `s2`; this duplicates the
              -- state token, which is only defensible because read and
              -- write touch the same cell sequentially — verify intent.
              case unsafeRead arr item of
                ST f -> case f s2 of
                  (# _s3, I# itemw #) ->
                    case unsafeWrite arr item (I# itemw + I# childw + w) of
                      ST f' -> case f' s2 of
                        (# s4, _ #) -> (# s4, childw +# w# +# altw #) -- RankToSupp

-- | Type for a lexicographic tree, implementating a n-ary tree over a binary tree.
data LexicoTreeItem
  = -- | Void node
    Nil
  | -- | A node : item, next node (next in transaction), alternative node (other branch), weight
    Node
      {-# UNPACK #-} !ItemRank
      !LexicoTreeItem -- NB. experimental strictness annotation
      !LexicoTreeItem -- NB. experimental strictness annotation
      {-# UNPACK #-} !Int
  deriving (Eq, Show)

-- |
-- Inserts a transaction in list format into the lexicographic tree.
-- Automatically merges identical transactions.
-- Performs suffix intersection.
insertLT ::
  -- | Transaction to insert into lexicographic tree
  [ItemRank] ->
  -- | "coreI" item, for suffix intersection.
  ItemRank ->
  -- | Weight of the transaction to insert
  ItemSupport ->
  -- | Input lexicographic tree
  LexicoTreeItem ->
  -- | Result : a new lexicographic tree with the transaction inserted
  LexicoTreeItem
-- NOTE(review): an empty transaction is dropped without adding its
-- weight anywhere — presumably intentional (empty itemsets carry no
-- pattern information), but worth confirming against HLCM.
insertLT [] _ _ lt = lt
insertLT lst _ w Nil = createPath lst w
insertLT [x] i w (Node e suiv alt weight)
  | x < e = Node x Nil (Node e suiv alt weight) w
  | x == e = Node e suiv alt (weight + w)
  | x > e = Node e suiv (insertLT [x] i w alt) weight
insertLT (x : xs) i w (Node e suiv alt weight)
  | x < e = Node x (createPath xs w) (Node e suiv alt weight) 0
  | x == e =
      if (e /= i)
        then Node e (insertLT xs i w suiv) alt weight
        -- Past the "coreI" item the remainder of the branch is a simple
        -- path: intersect it with the transaction's suffix.
        else suffixIntersectionLT xs w (Node e suiv alt weight)
  | x > e = Node e suiv (insertLT (x : xs) i w alt) weight
insertLT _ _ _ _ = error "insertLT"

-- |
-- From a transaction and its weight, directly creates a path-shaped lexicographic tree.
createPath ::
  -- | Transaction
  [ItemRank] ->
  -- | Weight of the transaction
  Int ->
  -- | Result : a path-shaped lexicographic tree encoding the transaction
  LexicoTreeItem
createPath [] _ = Nil
-- The whole weight is carried by the last node of the path.
createPath [x] w = Node x Nil Nil w
createPath (x : xs) w = Node x (createPath xs w) Nil 0

-- |
-- Perform the "suffix intersection" operation with the suffix of a transaction
-- and the corresponding part of a lexicographic tree.
--
-- For more details, see "prefixIntersection" in Takeaki Uno's papers about LCM.
suffixIntersectionLT ::
  -- | Suffix of the transaction to insert.
  [ItemRank] ->
  -- | Weight of the transaction to insert
  Int ->
  -- | (Sub-)lexicographic tree where the transaction must be inserted. The @next@ part (see data type comments)
  -- should be a simple path, it will be the target of intersection with the suffix.
  LexicoTreeItem ->
  -- | Result : lexicographic tree where the suffix has been added, with correct intersections performed.
  LexicoTreeItem
suffixIntersectionLT _ w (Node e Nil alt weight) = Node e Nil alt (weight + w)
suffixIntersectionLT lst w (Node e suiv alt weight) =
  let (newSuiv, addWeight) = suffInterSuiv lst w suiv
   in Node e newSuiv alt (weight + addWeight)
suffixIntersectionLT _ _ _ = error "suffixIntersectionLT"

-- |
-- Intersects a list-shaped transaction and a path-shaped lexicographic tree.
-- The result is a path shaped lexicographic tree with weights correctly updated.
suffInterSuiv ::
  -- | Transaction as list
  [ItemRank] ->
  -- | Weight of the above transaction
  Int ->
  -- | Path-shaped lexicographic tree
  LexicoTreeItem ->
  -- | Result : (path-shaped lexicographic tree representing the intersection
  -- of transaction and input path , 0 if intersection not [] / sum of weights else)
  (LexicoTreeItem, Int)
suffInterSuiv lst w suiv =
  let
    (lstSuiv, weightSuiv) = getLstSuiv suiv
    inter = List.intersect lstSuiv lst
   in
    if (inter /= [])
      -- Non-empty intersection: rebuild the path with both weights on it.
      then (createPath inter (weightSuiv + w), 0)
      -- Empty intersection: the path vanishes and all weight is
      -- reported to the caller.
      else (Nil, weightSuiv + w)

-- |
-- Collects all the nodes of lexicographic tree in a list of elements.
getLstSuiv ::
  -- | Path shaped lexicographic tree.
  LexicoTreeItem ->
  -- | Result : (list of elements in the path, sum of weights of nodes in the path)
  ([ItemRank], Int)
getLstSuiv Nil = ([], 0)
getLstSuiv (Node e suiv Nil weight) =
  let (lst, w) = getLstSuiv suiv
   in (e : lst, w + weight)
-- Only valid on simple paths: a node with an `alt` branch is a bug.
getLstSuiv _ = error "getLstSuiv"

-- |
-- Merge two lexicographic trees.
mergeAlts :: LexicoTreeItem -> LexicoTreeItem -> LexicoTreeItem
mergeAlts Nil lt = lt
mergeAlts lt Nil = lt
mergeAlts (Node e1 suiv1 alt1 w1) (Node e2 suiv2 alt2 w2)
  -- Keep `alt` chains sorted by increasing item rank.
  | e1 < e2 = (Node e1 suiv1 (mergeAlts alt1 (Node e2 suiv2 alt2 w2)) w1)
  | e1 > e2 = (Node e2 suiv2 (mergeAlts (Node e1 suiv1 alt1 w1) alt2) w2)
  -- Same item: merge both branches and add the weights.
  | e1 == e2 = (Node e1 (mergeAlts suiv1 suiv2) (mergeAlts alt1 alt2) (w1 + w2))
mergeAlts _ _ = error "mergeAlts"