1 -- SPDX-License-Identifier: BSD-3-Clause
2 -- SPDX-FileCopyrightText: 2010 Alexandre Termier
4 {-# LANGUAGE BangPatterns #-}
5 {-# LANGUAGE MagicHash #-}
6 {-# LANGUAGE UnboxedTuples #-}
7 {-# OPTIONS_GHC -Wno-unused-imports #-}
9 -- | Original LCM algorithm from:
11 -- \"An efficient algorithm for enumerating closed patterns in transaction databases\".
12 -- Takeaki Uno, Tatsuya Asai, Yuzo Uchida, and Hiroki Arimura.
13 -- In Discovery Science, pages 16–31, 2004.
14 -- https://research.nii.ac.jp/~uno/papers/lcm_ds04.pdf
15 -- https://www.annas-archive.org/scidb/10.1007/978-3-540-30214-8_2
17 -- Code adapted from HLCM:
19 -- \"HLCM: a first experiment on parallel data mining with Haskell\".
20 -- Alexandre Termier & Benjamin Négrevergne & Simon Marlow & Satnam Singh.
21 -- https://lig-membres.imag.fr/termier/HLCM/hlcm.pdf
22 -- https://hackage.haskell.org/package/hlcm
24 -- Possible future work:
26 -- \"Discovering closed frequent itemsets on multicore:
27 -- parallelizing computations and optimizing memory accesses\".
28 -- Benjamin Négrevergne & Alexandre Termier & Jean-François Méhaut & Takeaki Uno.
29 -- https://people.irisa.fr/Alexandre.Termier/publis/2010_negrevergne_hppddm.pdf
30 module Clustering.FrequentItemSet.LCM (
33 type FrequentItemSets (),
38 type ClosedFrequentItemSets (),
39 closedFrequentItemSets,
40 allClosedFrequentItemSets,
47 import Clustering.FrequentItemSet.BruteForce (ItemSet, Support (), Transaction (..))
48 import Data.Bifunctor (second)
50 import Data.Eq (Eq (..))
51 import Data.Foldable (fold, foldMap)
52 import Data.Function (id, ($), (&), (.))
53 import Data.Functor ((<$>), (<&>))
55 import Data.Monoid (Monoid (..))
56 import Data.Ord (Down, Ord (..), comparing)
57 import Data.Ratio ((%))
58 import Data.Semigroup (Semigroup (..))
59 import Data.Sequence qualified as Seq
60 import Data.Tuple (fst, snd)
61 import Data.Validity (Validity (..), delve)
62 import Debug.Pretty.Simple (pTraceShow)
64 import GHC.Generics (Generic)
65 import GHC.IsList (toList)
66 import GHC.Stack (HasCallStack)
68 import Logic.Theory.Arithmetic (Zero)
69 import Logic.Theory.Ord qualified as Logic.Ord
70 import Numeric.Probability
71 import Text.Show (Show (..))
72 import Prelude (Enum, Num, error, fromIntegral, (+), (-))
74 import Control.Exception (evaluate)
76 import Control.Monad.ST
77 import Control.Parallel
78 import Control.Parallel.Strategies
79 import Data.Array.Base
81 import Data.Array.Unboxed
82 import Data.ByteString.Char8 qualified as L
84 import Data.List qualified as List
85 import Data.Map.Strict (Map)
86 import Data.Map.Strict qualified as Map
87 import Data.Maybe (catMaybes, fromJust, isNothing, maybeToList)
89 import Data.Set qualified as Set
90 import Data.Vector qualified as V
91 import GHC.Exts hiding (Item)
-- | Phantom-typed witness that a frequent-item-set computation has been
-- performed for the type-level parameters @items@, @db@, @minSupp@, @minSize@.
-- Carries no runtime data; the single constructor is the "axiom".
94 data FrequentItemSets items db minSupp minSize = FrequentItemSetsAxiom
98 -- `frequentItemSets` minSupp minSize items db
100 -- returns the frequent itemsets of
101 -- the transactions @(db)@ restricted to the specified @(items)@,
102 -- and such that the number of transactions containing them is greater or equal to @(minSupp)@,
103 -- and such that the size of those transactions is greater or equal to @(minSize)@.
-- (NOTE(review): no closure operation appears in this function's visible code,
-- so these are plain frequent itemsets, not necessarily closed ones.)
104 -- Each frequent itemset is coupled with the sequence of the transactions containing them.
106 forall a item db minSize minSupp items.
110 minSupp ::: Int / minSupp Logic.Ord.> Zero ->
111 minSize ::: Int / minSize Logic.Ord.> Zero ->
112 items ::: ItemSet item ->
113 db ::: [Transaction item a] ->
114 FrequentItemSets items db minSupp minSize
116 frequentItemSets (Named minSupp) (Named minSize) (Named items) (Named db) =
117 FrequentItemSetsAxiom ...
-- `fromDistinctAscList` requires `loop` to yield keys in strictly ascending
-- order; this holds because `Map.toList` enumerates items ascending and the
-- recursion only ever descends into strictly greater items.
118 Map.fromDistinctAscList (loop Set.empty items db)
120 -- Lexicographic depth-first search for frequent item sets.
121 loop previousFIS nextItems previousTxns
122 | Set.null nextItems = []
124 -- pTraceShow (["LCM", "frequentItemSets", "loop"], (("previousFIS", previousFIS), ("nextItems", nextItems), ("previousTxns", previousTxns))) $
125 -- Map each item of `nextItems` to its transactions in `previousTxns`.
126 let nextItemToTxns = itemToTxns nextItems previousTxns
127 in (`List.concatMap` Map.toList nextItemToTxns) \(nextItem, nextTxns) ->
128 -- Keep only the itemsets which are supported by enough transactions.
129 if minSupp <= fromIntegral (Seq.length nextTxns)
131 let nextFIS = Set.insert nextItem previousFIS
132 in -- Keep only the itemsets which have already gathered enough items.
133 [ (nextFIS, nextTxns)
134 | minSize <= fromIntegral (Set.size nextFIS)
137 -- Recurse with each item of `nextItems` greater than `nextItem`
138 -- (to traverse the frequent item sets as a tree instead of a poset lattice),
139 -- and with the transactions containing `nextItem`.
140 loop nextFIS (Set.split nextItem nextItems & snd) (nextTxns & toList)
144 -- `itemToTxns` items db
146 -- maps each item of `items` to the transactions of `db` containing it.
148 -- This maps from an "horizontal" representation to a "vertical" one,
149 -- itself mapping to an "horizontal" one.
150 -- See p.8 of https://www.lirmm.fr/~lazaar/imagina/NL-DM-IMAGINA1819-part1.pdf
152 -- It's a kind of occurrence deliver.
153 -- p.4 of http://osdm.uantwerpen.be/papers/p77-uno.pdf
157 [Transaction item a] ->
158 Map.Map item (Seq.Seq (Transaction item a))
159 itemToTxns itms txs =
-- NOTE(review): the combinator folding these singleton `Seq`s per item
-- (presumably a `Map.fromListWith (<>)`-style call) is elided in this
-- excerpt — confirm against the full source.
162 [ (itm, Seq.singleton tx)
164 , itm <- Set.intersection itms (transactionItems tx) & Set.toList
-- | Phantom tag standing for "all the items occurring in @db@";
-- used to index 'FrequentItemSets' / 'ClosedFrequentItemSets' when no
-- explicit item restriction is given.
167 data AllItems db = AllItemsAxiom
169 -- | `frequentItemSets` applied to all the items of the given transactions.
-- The item universe is the union of every transaction's items
-- (see the `foldMap transactionItems` below).
170 allFrequentItemSets ::
174 minSupp ::: Int / minSupp Logic.Ord.> Zero ->
175 minSize ::: Int / minSize Logic.Ord.> Zero ->
176 db ::: [Transaction item a] ->
177 FrequentItemSets (AllItems db) db minSupp minSize
179 allFrequentItemSets minSupp minSize db =
183 (AllItemsAxiom ... foldMap transactionItems (unName db))
-- | Phantom-typed witness that the closed-frequent-item-set computation was
-- performed for the given type-level parameters. No runtime content.
186 data ClosedFrequentItemSets items db minSupp minSize = ClosedFrequentItemSetsAxiom
-- | Alias grouping transactions by itemset.
-- NOTE(review): the interior of this alias is elided in this excerpt; the
-- visible part only shows the value side, `Seq` of transactions — confirm
-- the key type against the full source.
188 type Clusters item a =
191 (Seq.Seq (Transaction item a))
-- | Closed frequent itemsets of @db@ restricted to @items@, each paired with
-- its support, computed by delegating to the LCM implementation ('runLCM').
193 closedFrequentItemSets ::
194 forall item db minSize minSupp items.
198 minSupp ::: Int / minSupp Logic.Ord.> Zero ->
199 minSize ::: Int / minSize Logic.Ord.> Zero ->
200 items ::: ItemSet item ->
202 ClosedFrequentItemSets items db minSupp minSize
203 ::: [(ItemSupport, ItemSet item)]
204 closedFrequentItemSets (Named minSupp) (Named minSize) (Named items) (Named db) =
205 ClosedFrequentItemSetsAxiom ...
206 runLCM items minSupp minSize db
-- | 'closedFrequentItemSets' applied to all the items of the given
-- transactions (the union of the transactions, via `fold` below).
208 allClosedFrequentItemSets ::
212 minSupp ::: Int / minSupp Logic.Ord.> Zero ->
213 minSize ::: Int / minSize Logic.Ord.> Zero ->
215 ClosedFrequentItemSets (AllItems db) db minSupp minSize
216 ::: [(ItemSupport, ItemSet item)]
217 allClosedFrequentItemSets minSupp minSize db =
218 closedFrequentItemSets
221 (AllItemsAxiom ... fold (unName db))
226 -- Library for using the LCM algorithm in order to compute closed frequent pattern.
227 -- Input must be a transaction database, either in text format (as a ByteString)
228 -- or in @[[Item]]@ format, where @Item = Int@.
230 -- Several benchmarking functions allowing to tune the parallel strategy used and the depth
231 -- cutoff are also provided.
-- | Support of an itemset: the number of transactions containing it.
232 type ItemSupport = Int
237 -----------------------------------------------------------------
239 -----------------------------------------------------------------
243 -- | Get the data as a matrix of Items, parses it and
244 -- executes LCM to discover closed frequent Itemsets.
253 -- | The transaction database as matrix of Items (List of List)
255 -- | Output: list of closed frequent Itemsets
256 [(ItemSupport, Set item)]
257 runLCM items minSupp minSize db =
-- Per-item support over the whole database, restricted to items frequent
-- enough, most frequent first (descending support).
259 itemToSupp :: [(item, ItemSupport)]
261 itemToSupport items db
263 & List.filter ((>= minSupp) . snd)
264 & List.sortBy (comparing (Down . snd))
267 itemsSize = List.length itemToSupp
-- Dense rank (position in the support-sorted list) for each kept item.
-- PERF(review): `List.findIndices` rescans `itemToSupp` for every item,
-- making this O(n^2); a single `zip [0 ..]` pass would be O(n).
269 itemToRank :: Map.Map item ItemRank
272 [ (i, List.head $ List.findIndices ((== i) . fst) itemToSupp)
273 | (i, _) <- itemToSupp
276 -- Rewrites the database to use `ItemRank` instead of `item`
277 rankDB :: [Set ItemRank]
281 | i <- tx & Set.toList
282 -- Items whose support is lower than `minSupp`
283 -- have been filtered-out in `itemToSupp`,
284 -- hence do not have a rank.
285 , rank <- Map.lookup i itemToRank & maybeToList
-- Lexicographic-tree encoding of the ranked database; (-1) is the "coreI"
-- sentinel and 1 the initial weight of each inserted transaction.
290 dbLT = List.foldr (\tx acc -> insertLT (tx & Set.toList) (-1) 1 acc) Nil rankDB
-- Inverse of `itemToRank`, as an array for O(1) unranking.
292 rankToItem :: Array ItemRank item
294 List.zip [0 ..] (fst <$> itemToSupp)
295 & array (0, itemsSize - 1)
297 unrank :: [(ItemSupport, Set ItemRank)] -> [(ItemSupport, Set item)]
298 unrank = List.map $ second $ Set.map (rankToItem `unsafeAt`)
-- One independent LCM exploration per top-level candidate rank,
-- evaluated in parallel (sparks of 8 buffered, fully forced).
300 [ lcmLoop minSupp minSize 1 dbLT Set.empty candidateRank (rankToSuppLT items dbLT) items
301 | candidateRank <- [0 .. Set.size items -1]
303 & parBuffer 8 rdeepseq
309 -- For a transaction database, a closed frequent Itemset, and a candidate item
310 -- for extension of this closed frequent Itemset, recursively computes all
311 -- the successor closed frequent Itemsets by PPC-extension.
316 -- | Current depth in the search tree (for parallel optimisation purposes)
318 -- | Transaction database.
320 -- | Input closed frequent Itemset.
322 -- | Candidate to extend the closed frequent Itemset above.
324 -- | Array associating each item with its frequency
325 UArray ItemRank ItemSupport ->
328 -- | Result : list of closed frequent Itemsets, each paired with its support (see the result type below).
329 [(ItemSupport, Set ItemRank)]
330 lcmLoop minSupp minSize depth previousDB previousRanks candidateRank rankToSupp items =
332 -- HLCM: line 1: CDB = project and reduce DB w.r.t. P and limit
333 -- Reduce database by eliminating:
334 -- - all items greater than `candidateRank`,
335 -- - and all items with zero support.
336 reducedDB = projectAndReduce candidateRank rankToSupp previousDB
338 -- HLCM: line 2: Compute frequencies of items in CDB
339 -- Compute items occurrences in reduced database.
340 reducedRankToSupp = rankToSuppLT items reducedDB
342 -- HLCM: line 3: CP = 100% frequent items in CDB
343 -- Check which items actually appear in reduced database.
344 candidateSupp = rankToSupp ! candidateRank
346 -- HLCM: line 6: Candidates = frequent items of CDB that are not in CP
347 -- Compute 100% frequent items, future candidates, and unfrequent items.
348 (closedFreqRanks, candidateRanks, unfreqRanks) =
349 computeCandidates minSupp candidateSupp items reducedRankToSupp
351 --pTraceShow (["lcmLoop"], minSupp, minSize, depth, previousDB, previousRanks, candidateRank, rankToSupp, items) $
352 -- HLCM: line 4: if max(CP) = limit then
-- `last` is partial but safe here: it is only reached when the
-- `not (List.null closedFreqRanks)` conjunct has succeeded.
353 if not (List.null closedFreqRanks) -- if there is a result ...
354 && last closedFreqRanks <= candidateRank -- ...and if it is OK to extend it
357 -- HLCM: line 5: P' = P ∪ CP
358 -- Result closed frequent Itemset = input closed frequent Itemset + 100% frequent Items
359 closedItemset = previousRanks <> Set.fromList closedFreqRanks
361 -- HLCM: line 8: for all e ∈ Candidates, e ≤ limit do
362 -- Only candidates with value lower than input candidateRank
363 -- can be used for further extension on this branch.
364 smallCandidates = List.takeWhile (< candidateRank) candidateRanks
366 [ (candidateSupp, closedItemset)
367 | minSize <= fromIntegral (Set.size closedItemset)
369 <> if not (List.null smallCandidates) -- ... and if we have at least 1 possible extension
372 -- Update items occurrences table by suppressing:
373 -- - 100% frequent items,
374 -- - and unfrequent items.
375 newRankToSupp = suppressItems reducedRankToSupp closedFreqRanks unfreqRanks
377 loop newCandidate = lcmLoop minSupp minSize (depth + 1) reducedDB closedItemset newCandidate newRankToSupp items
379 -- Recursively extend the candidates
-- NOTE(review): the guard `3 < depth` sparks parallelism for DEEPER nodes,
-- which contradicts the comment "only for low search space depth" — confirm
-- the intended direction of the cutoff against HLCM.
380 if 3 < depth -- create parallel sparks only for low search space depth
381 then List.concat $ runEval $ parBuffer 2 rdeepseq $ List.map loop smallCandidates
382 else List.concatMap loop smallCandidates
387 -- For a transaction database of type [[item]], compute the frequency
388 -- of each item and return a map from item to frequency
-- (the result type below is a Map, not an array).
393 Map.Map item ItemSupport
394 itemToSupport items db =
-- Only items belonging to the `items` universe are counted.
399 , itm <- Set.intersection items tx & Set.toList
409 , i <- is & Set.toList
410 , idx <- Set.lookupIndex i items & maybeToList
414 -- XXX PERF: must be bad : the array is converted to list (one copy),
415 -- then this list is sorted (more copies of small lists), and at
416 -- last a new array is created...
417 -- Try to improve this with a mutable array and more "in place" spirit...
420 -- For a given itemset being extended by a given candidate, compute :
421 -- - the closure of this itemset
422 -- - and the candidates for further extension.
427 UArray ItemRank ItemSupport ->
428 -- (100% frequent items == closure, candidates for further extension, unfrequent items)
429 ([ItemRank], [ItemRank], [ItemRank])
430 computeCandidates minSupp candidateSupp items rankToSupp =
-- Split the items actually present (support > 0) into frequent and
-- infrequent ones w.r.t. `minSupp`.
432 (frequentItems, unfreqItems) =
434 (\i -> rankToSupp ! i >= minSupp)
435 [i | i <- [0 .. Set.size items - 1], rankToSupp ! i > 0]
-- Among the frequent items, those whose support equals the candidate's
-- support are 100% frequent (part of the closure); the rest are candidates.
436 (closedFrequentRanks, candidateRanks) =
437 List.partition (\i -> rankToSupp ! i == candidateSupp) frequentItems
439 (closedFrequentRanks, candidateRanks, unfreqItems)
442 -- Modifies an array associating Items with their frequency, in order to
443 -- give a frequency of 0 to a given list of Items.
445 -- NB : for performance reasons, this is REALLY a modification, made with unsafe operations.
-- NOTE(review): `unsafeThaw` below mutates the INPUT array in place; the
-- caller must not force the original `rankToSupp` after this call — confirm
-- evaluation order in `lcmLoop` guarantees this under laziness.
447 -- | Array associating an item with its frequency
448 UArray ItemRank ItemSupport ->
449 -- | List of 100% frequent Items
451 -- | List of unfrequent Items
453 -- | Initial array, with frequencies of 100% frequent Items
454 -- and unfrequent Items set to 0.
455 UArray ItemRank ItemSupport
456 suppressItems rankToSupp closedRanks unfreqRanks =
458 -- Can be used in multithread because no concurrent write
459 arr <- unsafeThaw rankToSupp :: ST s (STUArray s ItemRank ItemSupport)
-- Zero out both the closure items and the infrequent items.
460 forM_ closedRanks \i -> writeArray arr i 0
461 forM_ unfreqRanks \i -> writeArray arr i 0
462 -- Can be used in multithread because no concurrent write
465 -----------------------------------------------------------------
466 -- LEXICOGRAPHIC TREE MANIPULATION
467 -----------------------------------------------------------------
470 -- Creates a new, reduced transaction database by eliminating all items
471 -- greater than @candidateRank@ item, and all infrequent Items.
473 -- | Candidate item, on which the projection is made
475 -- | Array associating each item with its frequency in
476 -- original transaction database.
477 UArray ItemRank ItemSupport ->
478 -- | Original transaction database
480 -- | Result : Reduced transaction database
482 projectAndReduce !candidateRank rankToSupp = go
-- Tree traversal: prune everything beyond `candidateRank`; at the candidate
-- node keep only the (filtered) suffix; below it, drop zero-support items
-- and re-merge orphaned branches with `mergeAlts`.
485 go (Node e suiv alt w)
486 | e > candidateRank = Nil
487 | e == candidateRank =
488 let !(suiv', addWeight) = filterInfrequent suiv rankToSupp
489 in Node e suiv' Nil (w + addWeight)
495 if rankToSupp ! e > 0
497 if notNil suiv' && notNil alt'
498 then Node e suiv' alt' 0
499 else if notNil suiv' then Node e suiv' Nil 0 else alt'
-- Item `e` has zero support: splice it out, merging its two subtrees.
501 if notNil suiv' && notNil alt'
502 then mergeAlts suiv' alt'
503 else if notNil suiv' then suiv' else alt'
506 -- Suppress all infrequent Items from a transaction database expressed as
507 -- lexicographic tree, and returns a new lexicographic tree.
509 -- | Original transaction database
511 -- | Array associating each item with its frequency in
512 -- original transaction database. In this setting,
513 -- an infrequent item has a frequency of 0 (because of preprocessing by
514 -- ' suppressItems ').
515 UArray ItemRank ItemSupport ->
516 -- | Result : (transaction database without infrequent Items, weight to report in parent nodes)
517 (LexicoTreeItem, Weight)
518 filterInfrequent Nil _ = (Nil, 0)
519 filterInfrequent (Node e suiv alt w) occs
520 | occs ! e > 0 = (Node e suiv' alt' (w + ws), wa)
521 | notNil suiv' && notNil alt' = (mergeAlts suiv' alt', w')
522 | notNil alt' = (alt', w')
523 | notNil suiv' = (suiv', w')
524 | otherwise = (Nil, w')
-- Recursively filter both subtrees, keeping the weights to push upward.
-- NOTE(review): the binding of `w'` (used in the drop-node guards above) is
-- elided in this excerpt — confirm it against the full source.
527 !(suiv', ws) = filterInfrequent suiv occs
528 !(alt', wa) = filterInfrequent alt occs
-- | Whether a lexicographic tree is non-empty (i.e. not 'Nil').
530 {-# INLINE notNil #-}
531 notNil :: LexicoTreeItem -> Bool
536 -- Occurrence delivering:
537 -- Map each item of the given database to its support.
540 -- | Transaction database (in lexicographic tree format)
542 -- | Result : array associating each item to its frequency.
543 UArray ItemRank ItemSupport
544 rankToSuppLT items dbLT =
546 arr <- newArray_ (0, Set.size items - 1)
547 -- TODO: this workaround should no longer be necessary
548 -- Creates an empty array : each item starts with frequency 0
549 -- workaround for http://hackage.haskell.org/trac/ghc/ticket/3586
550 forM_ [0 .. Set.size items - 1] $ \i -> unsafeWrite arr i 0
551 -- Compute frequencies for each item by efficient tree traversal
552 _ <- traverseLT dbLT arr
556 -- Efficient traversal of the transaction database as a lexicographic tree.
557 -- Items frequencies are updated on the fly.
-- Implementation note: written directly against the ST state-token
-- representation with unboxed tuples (`(# s, x #)`) and unboxed Int#
-- arithmetic (`+#`) to avoid allocation in the hot traversal loop.
560 -- | Transaction database
562 -- | Array associating each item with its frequency. UPDATED by this function !
563 STUArray s ItemRank ItemSupport ->
565 traverseLT tree arr = ST $ \s ->
567 (# s', _ #) -> (# s', () #)
-- `go` returns the accumulated weight of the visited subtree as an Int#.
573 go Nil s = (# s, 0# #)
574 go (Node item child alt w@(I# w#)) s0 =
-- Add this node's own weight plus its children's weight to the item's
-- current count in the array.
579 case unsafeRead arr item of
582 (# _s3, I# itemw #) ->
583 case unsafeWrite arr item (I# itemw + I# childw + w) of
586 (# s4, _ #) -> (# s4, childw +# w# +# altw #)
590 -- | Type for a lexicographic tree, implementing an n-ary tree over a binary tree.
594 | -- | A node : item, next node (next in transaction), alternative node (other branch), weight
596 {-# UNPACK #-} !ItemRank
597 !LexicoTreeItem -- NB. experimental strictness annotation
598 !LexicoTreeItem -- NB. experimental strictness annotation
603 -- Inserts a transaction in list format into the lexicographic tree.
604 -- Automatically merges identical transactions.
605 -- Performs suffix intersection.
607 -- | Transaction to insert into lexicographic tree
609 -- | "coreI" item, for suffix intersection.
611 -- | Weight of the transaction to insert
613 -- | Input lexicographic tree
615 -- | Result : a new lexicographic tree with the transaction inserted
-- Items smaller than the current node start a new branch; equal items merge
-- weights / recurse; greater items descend into the `alt` chain.
617 insertLT [] _ _ lt = lt
618 insertLT lst _ w Nil = createPath lst w
619 insertLT [x] i w (Node e suiv alt weight)
620 | x < e = Node x Nil (Node e suiv alt weight) w
621 | x == e = Node e suiv alt (weight + w)
622 | x > e = Node e suiv (insertLT [x] i w alt) weight
623 insertLT (x : xs) i w (Node e suiv alt weight)
624 | x < e = Node x (createPath xs w) (Node e suiv alt weight) 0
-- NOTE(review): the `x == e` guard (and its coreI comparison) is elided in
-- this excerpt; the then/else below belong to it — confirm against source.
627 then Node e (insertLT xs i w suiv) alt weight
628 else suffixIntersectionLT xs w (Node e suiv alt weight)
629 | x > e = Node e suiv (insertLT (x : xs) i w alt) weight
630 insertLT _ _ _ _ = error "insertLT"
633 -- From a transaction and its weight, directly creates a path-shaped lexicographic tree.
637 -- | Weight of the transaction
639 -- | Result : a path-shaped lexicographic tree encoding the transaction
-- The full weight is carried only by the terminal node; intermediate nodes
-- get weight 0 (they represent strict prefixes of the transaction).
641 createPath [] _ = Nil
642 createPath [x] w = Node x Nil Nil w
643 createPath (x : xs) w = Node x (createPath xs w) Nil 0
646 -- Perform the "suffix intersection" operation with the suffix of a transaction
647 -- and the corresponding part of a lexicographic tree.
649 -- For more details, see "prefixIntersection" in Takeaki Uno's papers about LCM.
650 suffixIntersectionLT ::
651 -- | Suffix of the transaction to insert.
653 -- | Weight of the transaction to insert
655 -- | (Sub-)lexicographic tree where the transaction must be inserted. The @next@ part (see data type comments)
656 -- should be a simple path, it will be the target of intersection with the suffix.
658 -- | Result : lexicographic tree where the suffix has been added, with correct intersections performed.
-- Empty `next` path: nothing to intersect, just accumulate the weight.
660 suffixIntersectionLT _ w (Node e Nil alt weight) = Node e Nil alt (weight + w)
661 suffixIntersectionLT lst w (Node e suiv alt weight) =
662 let (newSuiv, addWeight) = suffInterSuiv lst w suiv
663 in Node e newSuiv alt (weight + addWeight)
664 suffixIntersectionLT _ _ _ = error "suffixIntersectionLT"
667 -- Intersects a list-shaped transaction and a path-shaped lexicographic tree.
668 -- The result is a path shaped lexicographic tree with weights correctly updated.
670 -- | Transaction as list
672 -- | Weight of the above transaction
674 -- | Path-shaped lexicographic tree
676 -- | Result : (path-shaped lexicographic tree representing the intersection
677 -- of transaction and input path , 0 if intersection not [] / sum of weights else)
678 (LexicoTreeItem, Int)
679 suffInterSuiv lst w suiv =
-- Flatten the path into a list (plus total weight), intersect it with the
-- transaction suffix, then rebuild a path carrying the combined weight.
681 (lstSuiv, weightSuiv) = getLstSuiv suiv
682 inter = List.intersect lstSuiv lst
685 then (createPath inter (weightSuiv + w), 0)
686 else (Nil, weightSuiv + w)
689 -- Collects all the nodes of lexicographic tree in a list of elements.
691 -- | Path shaped lexicographic tree.
693 -- | Result : (list of elements in the path, sum of weights of nodes in the path)
-- Precondition: the tree must be a simple path (every `alt` is Nil);
-- any branching node falls through to the `error` case below.
695 getLstSuiv Nil = ([], 0)
696 getLstSuiv (Node e suiv Nil weight) =
697 let (lst, w) = getLstSuiv suiv
698 in (e : lst, w + weight)
699 getLstSuiv _ = error "getLstSuiv"
702 -- Merge two lexicographic trees.
-- Nodes are merged in ascending item order along the `alt` chain; when both
-- roots carry the same item, their subtrees are merged pairwise and their
-- weights summed.
703 mergeAlts :: LexicoTreeItem -> LexicoTreeItem -> LexicoTreeItem
704 mergeAlts Nil lt = lt
705 mergeAlts lt Nil = lt
706 mergeAlts (Node e1 suiv1 alt1 w1) (Node e2 suiv2 alt2 w2)
707 | e1 < e2 = (Node e1 suiv1 (mergeAlts alt1 (Node e2 suiv2 alt2 w2)) w1)
708 | e1 > e2 = (Node e2 suiv2 (mergeAlts (Node e1 suiv1 alt1 w1) alt2) w2)
709 | e1 == e2 = (Node e1 (mergeAlts suiv1 suiv2) (mergeAlts alt1 alt2) (w1 + w2))
710 mergeAlts _ _ = error "mergeAlts"