-- SPDX-License-Identifier: BSD-3-Clause
-- SPDX-FileCopyrightText: 2010 Alexandre Termier et al.

{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE MagicHash #-}
{-# LANGUAGE UnboxedTuples #-}
{-# OPTIONS_GHC -Wno-unused-imports #-}

-- | Original LCM algorithm from:
--
-- \"An efficient algorithm for enumerating closed patterns in transaction databases\".
-- Takeaki Uno, Tatsuya Asai, Yuzo Uchida, and Hiroki Arimura.
-- In Discovery Science, pages 16–31, 2004.
-- https://research.nii.ac.jp/~uno/papers/lcm_ds04.pdf
-- https://www.annas-archive.org/scidb/10.1007/978-3-540-30214-8_2
--
-- Code adapted from HLCM:
--
-- \"HLCM: a first experiment on parallel data mining with Haskell\".
-- Alexandre Termier & Benjamin Négrevergne & Simon Marlow & Satnam Singh.
-- https://lig-membres.imag.fr/termier/HLCM/hlcm.pdf
-- https://hackage.haskell.org/package/hlcm
--
-- Possible future work:
--
-- \"Discovering closed frequent itemsets on multicore:
-- parallelizing computations and optimizing memory accesses\".
-- Benjamin Négrevergne & Alexandre Termier & Jean-François Méhaut & Takeaki Uno.
-- https://people.irisa.fr/Alexandre.Termier/publis/2010_negrevergne_hppddm.pdf
module Clustering.FrequentItemSet.LCM (
  type FrequentItemSets (),
  type ClosedFrequentItemSets (),
  closedFrequentItemSets,
  allClosedFrequentItemSets,
) where

import Clustering.FrequentItemSet.BruteForce (ItemSet, Support (), Transaction (..))
import Data.Bifunctor (second)
import Data.Bool (Bool, not, otherwise, (&&))
import Data.Eq (Eq (..))
import Data.Foldable (fold, foldMap)
import Data.Function (id, ($), (&), (.))
import Data.Functor ((<$>), (<&>))
import Data.Int (Int)
import Data.Monoid (Monoid (..))
import Data.Ord (Down (..), Ord (..), comparing)
import Data.Ratio ((%))
import Data.Semigroup (Semigroup (..))
import Data.Sequence qualified as Seq
import Data.Tuple (fst, snd)
import Data.Validity (Validity (..), delve)
import Debug.Pretty.Simple (pTraceShow)
import GHC.Generics (Generic)
import GHC.IsList (toList)
import GHC.Stack (HasCallStack)
import Logic.Theory.Arithmetic (Zero)
import Logic.Theory.Ord qualified as Logic.Ord
import Numeric.Probability
import Text.Show (Show (..))
import Prelude (Enum, Num, error, fromIntegral, pure, (+), (-))

import Control.Exception (evaluate)
import Control.Monad (forM_)
import Control.Monad.ST
import Control.Parallel
import Control.Parallel.Strategies
import Data.Array.Base
import Data.Array.ST (STUArray, runSTUArray)
import Data.Array.Unboxed
import Data.ByteString.Char8 qualified as L
import Data.List qualified as List
import Data.Map.Strict (Map)
import Data.Map.Strict qualified as Map
import Data.Maybe (catMaybes, fromJust, isNothing, maybeToList)
import Data.Set (Set)
import Data.Set qualified as Set
import Data.Vector qualified as V
import GHC.Exts hiding (Item)
import GHC.ST (ST (..))

data FrequentItemSets items db minSupp minSize = FrequentItemSetsAxiom

-- | @
-- `frequentItemSets` minSupp minSize items db
-- @
-- returns the frequent itemsets of
-- the transactions @(db)@ restricted to the specified @(items)@:
-- those contained in a number of transactions greater than or equal to @(minSupp)@,
-- and whose size is greater than or equal to @(minSize)@.
-- Each frequent itemset is coupled with the sequence of the transactions containing it.
frequentItemSets ::
  forall a item db minSize minSupp items.
  Ord item =>
  minSupp ::: Int / minSupp Logic.Ord.> Zero ->
  minSize ::: Int / minSize Logic.Ord.> Zero ->
  items ::: ItemSet item ->
  db ::: [Transaction item a] ->
  FrequentItemSets items db minSupp minSize
    ::: Clusters item a
frequentItemSets (Named minSupp) (Named minSize) (Named items) (Named db) =
  FrequentItemSetsAxiom ...
    Map.fromDistinctAscList (loop Set.empty items db)
  where
    -- Lexicographic depth-first search for frequent item sets.
    loop previousFIS nextItems previousTxns
      | Set.null nextItems = []
      | otherwise =
          -- pTraceShow (["LCM", "frequentItemSets", "loop"], (("previousFIS", previousFIS), ("nextItems", nextItems), ("previousTxns", previousTxns))) $
          -- Map each item of `nextItems` to its transactions in `previousTxns`.
          let nextItemToTxns = itemToTxns nextItems previousTxns
           in (`List.concatMap` Map.toList nextItemToTxns) \(nextItem, nextTxns) ->
                -- Keep only the itemsets which are supported by enough transactions.
                if minSupp <= fromIntegral (Seq.length nextTxns)
                  then
                    let nextFIS = Set.insert nextItem previousFIS
                     in -- Keep only the itemsets which have already gathered enough items.
                        [ (nextFIS, nextTxns)
                        | minSize <= fromIntegral (Set.size nextFIS)
                        ]
                          <>
                          -- Recurse with each item of `nextItems` greater than `nextItem`
                          -- (to traverse the frequent item sets as a tree instead of a poset lattice),
                          -- and with the transactions containing `nextItem`.
                          loop nextFIS (Set.split nextItem nextItems & snd) (nextTxns & toList)
                  else []
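
-- A worked example of the search above (illustrative only, not from the original code):
-- for transactions T1={a,b}, T2={a,b,c}, T3={b,c}, with minSupp = 2 and minSize = 1,
-- the lexicographic search over a < b < c keeps
--   {a} (T1,T2), {a,b} (T1,T2), {b} (T1,T2,T3), {b,c} (T2,T3), {c} (T2,T3)
-- and prunes {a,c} and {a,b,c}, which are supported by T2 only.
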
-- | @
-- `itemToTxns` items db
-- @
-- maps each item of `items` to the transactions of `db` containing it.
--
-- This maps from a "horizontal" representation to a "vertical" one,
-- itself mapping to a "horizontal" one.
-- See p.8 of https://www.lirmm.fr/~lazaar/imagina/NL-DM-IMAGINA1819-part1.pdf
--
-- It's a kind of occurrence deliver;
-- see p.4 of http://osdm.uantwerpen.be/papers/p77-uno.pdf
itemToTxns ::
  Ord item =>
  ItemSet item ->
  [Transaction item a] ->
  Map item (Seq.Seq (Transaction item a))
itemToTxns itms txs =
  Map.fromListWith (flip (<>))
    [ (itm, Seq.singleton tx)
    | tx <- txs
    , itm <- Set.intersection itms (transactionItems tx) & Set.toList
    ]
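
-- For instance (illustrative only): with transactions T1={a,b} and T2={b,c},
-- `itemToTxns {a,b,c} [T1, T2]` associates
--   a |-> [T1],  b |-> [T1, T2],  c |-> [T2]
-- i.e. the "vertical" occurrence list of each item.
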
data AllItems db = AllItemsAxiom

-- | `frequentItemSets` applied to all the items of the given transactions.
allFrequentItemSets ::
  Ord item =>
  minSupp ::: Int / minSupp Logic.Ord.> Zero ->
  minSize ::: Int / minSize Logic.Ord.> Zero ->
  db ::: [Transaction item a] ->
  FrequentItemSets (AllItems db) db minSupp minSize
    ::: Clusters item a
allFrequentItemSets minSupp minSize db =
  frequentItemSets
    minSupp
    minSize
    (AllItemsAxiom ... foldMap transactionItems (unName db))
    db

data ClosedFrequentItemSets items db minSupp minSize = ClosedFrequentItemSetsAxiom

type Clusters item a =
  Map
    (ItemSet item)
    (Seq.Seq (Transaction item a))

closedFrequentItemSets ::
  forall item db minSize minSupp items.
  Ord item =>
  minSupp ::: Int / minSupp Logic.Ord.> Zero ->
  minSize ::: Int / minSize Logic.Ord.> Zero ->
  items ::: ItemSet item ->
  db ::: [ItemSet item] ->
  ClosedFrequentItemSets items db minSupp minSize
    ::: [(ItemSupport, ItemSet item)]
closedFrequentItemSets (Named minSupp) (Named minSize) (Named items) (Named db) =
  ClosedFrequentItemSetsAxiom ...
    runLCM items minSupp minSize db

allClosedFrequentItemSets ::
  Ord item =>
  minSupp ::: Int / minSupp Logic.Ord.> Zero ->
  minSize ::: Int / minSize Logic.Ord.> Zero ->
  db ::: [ItemSet item] ->
  ClosedFrequentItemSets (AllItems db) db minSupp minSize
    ::: [(ItemSupport, ItemSet item)]
allClosedFrequentItemSets minSupp minSize db =
  closedFrequentItemSets
    minSupp
    minSize
    (AllItemsAxiom ... fold (unName db))
    db
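
-- A worked example (illustrative only): for the database [{a,b}, {a,b,c}, {b,c}]
-- with minSupp = 2 and minSize = 1, the closed frequent itemsets are
--   (3, {b}), (2, {a,b}), (2, {b,c})
-- {a} and {c} are frequent but not closed ({a} always occurs with b, {c} always with b),
-- and {a,b,c} occurs in a single transaction, hence is not frequent.
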
type ItemSupport = Int
type ItemRank = Int
type Weight = Int

runLCM ::
  forall item.
  Ord item =>
  ItemSet item ->
  ItemSupport ->
  Int ->
  [ItemSet item] ->
  [(ItemSupport, Set item)]
runLCM items minSupp minSize db =
  let
    itemToSupp :: [(item, ItemSupport)]
    itemToSupp =
      itemToSupport items db
        & Map.toList
        & List.filter ((>= minSupp) . snd)
        & List.sortBy (comparing (Down . snd))

    itemsSize = List.length itemToSupp

    itemToRank :: Map item ItemRank
    itemToRank =
      Map.fromList
        [ (i, List.head $ List.findIndices ((== i) . fst) itemToSupp)
        | (i, _) <- itemToSupp
        ]

    -- Rewrites the database to use `ItemRank` instead of `item`
    rankDB :: [Set ItemRank]
    rankDB =
      [ Set.fromList
          [ rank
          | i <- tx & Set.toList
          -- Items whose support is lower than `minSupp`
          -- have been filtered-out in `itemToSupp`,
          -- hence do not have a rank.
          , rank <- Map.lookup i itemToRank & maybeToList
          ]
      | tx <- db
      ]

    -- Rewrite the database as a `LexicoTreeItem`
    dbLT = List.foldr (\tx acc -> insertLT (tx & Set.toList) (-1) 1 acc) Nil rankDB

    rankToItem :: Array ItemRank item
    rankToItem =
      List.zip [0 ..] (fst <$> itemToSupp)
        & array (0, itemsSize - 1)

    unrank :: [(ItemSupport, Set ItemRank)] -> [(ItemSupport, Set item)]
    unrank = List.map $ second $ Set.map (rankToItem `unsafeAt`)
   in
    [ lcmLoop minSupp minSize 1 dbLT Set.empty candidateRank (rankToSuppLT items dbLT) items
    | candidateRank <- [0 .. Set.size items - 1]
    ]
      & parBuffer 8 rdeepseq
      & runEval
      & List.concat
      & unrank
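
-- Note on the rank rewriting above (illustrative only): items are sorted by decreasing
-- support and renamed by their position in that order, e.g. supports {a: 3, b: 2, c: 2}
-- yield ranks {a: 0, b: 1, c: 2}.  The LCM loop then works on dense `Int` ranks
-- (array indices) instead of the original `item` values, and `unrank` maps the results
-- back at the very end.
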
-- |
-- For a transaction database, a closed frequent Itemset, and a candidate item
-- for extension of this closed frequent Itemset, recursively computes all
-- the successor closed frequent Itemsets by PPC-extension.
lcmLoop ::
  -- | Minimum support.
  ItemSupport ->
  -- | Minimum itemset size.
  Int ->
  -- | Current depth in the search tree (for parallel optimisation purposes)
  Int ->
  -- | Transaction database.
  LexicoTreeItem ->
  -- | Input closed frequent Itemset.
  Set ItemRank ->
  -- | Candidate to extend the closed frequent Itemset above.
  ItemRank ->
  -- | Array associating each item with its frequency
  UArray ItemRank ItemSupport ->
  -- | Set of all items of the database.
  ItemSet item ->
  -- | Result : list of closed frequent Itemsets. Each result pairs the support of a
  -- closed frequent Itemset with its set of item ranks.
  [(ItemSupport, Set ItemRank)]
lcmLoop minSupp minSize depth previousDB previousRanks candidateRank rankToSupp items =
  let
    -- HLCM: line 1: CDB = project and reduce DB w.r.t. P and limit
    -- Reduce database by eliminating:
    -- - all items greater than `candidateRank`,
    -- - and all items with zero support.
    reducedDB = projectAndReduce candidateRank rankToSupp previousDB

    -- HLCM: line 2: Compute frequencies of items in CDB
    -- Compute items occurrences in reduced database.
    reducedRankToSupp = rankToSuppLT items reducedDB

    -- HLCM: line 3: CP = 100% frequent items in CDB
    -- Check which items actually appear in reduced database.
    candidateSupp = rankToSupp ! candidateRank

    -- HLCM: line 6: Candidates = frequent items of CDB that are not in CP
    -- Compute 100% frequent items, future candidates, and unfrequent items.
    (closedFreqRanks, candidateRanks, unfreqRanks) =
      computeCandidates minSupp candidateSupp items reducedRankToSupp
   in
    -- pTraceShow (["lcmLoop"], minSupp, minSize, depth, previousDB, previousRanks, candidateRank, rankToSupp, items) $
    -- HLCM: line 4: if max(CP) = limit then
    if not (List.null closedFreqRanks) -- if there is a result ...
      && List.last closedFreqRanks <= candidateRank -- ...and if it is OK to extend it
      then
        let
          -- HLCM: line 5: P' = P ∪ CP
          -- Result closed frequent Itemset = input closed frequent Itemset + 100% frequent Items
          closedItemset = previousRanks <> Set.fromList closedFreqRanks

          -- HLCM: line 8: for all e ∈ Candidates, e ≤ limit do
          -- Only candidates with value lower than input candidateRank
          -- can be used for further extension on this branch.
          smallCandidates = List.takeWhile (< candidateRank) candidateRanks
         in
          [ (candidateSupp, closedItemset)
          | minSize <= fromIntegral (Set.size closedItemset)
          ]
            <> if not (List.null smallCandidates) -- ... and if we have at least 1 possible extension
              then
                let
                  -- Update items occurrences table by suppressing:
                  -- - 100% frequent items,
                  -- - and unfrequent items.
                  newRankToSupp = suppressItems reducedRankToSupp closedFreqRanks unfreqRanks

                  loop newCandidate = lcmLoop minSupp minSize (depth + 1) reducedDB closedItemset newCandidate newRankToSupp items
                 in
                  -- Recursively extend the candidates
                  if 3 < depth -- create parallel sparks only for low search space depth
                    then List.concat $ runEval $ parBuffer 2 rdeepseq $ List.map loop smallCandidates
                    else List.concatMap loop smallCandidates
              else []
      else []
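
-- The two branches above differ only in evaluation strategy: the work is
-- `List.map loop smallCandidates` either way, evaluated with `parBuffer 2 rdeepseq`
-- (sparked in parallel) or purely sequentially.  A minimal standalone sketch of that
-- pattern (an assumed simplification, not part of this module):
--
-- > import Control.DeepSeq (NFData)
-- > import Control.Parallel.Strategies (parBuffer, rdeepseq, runEval)
-- >
-- > parConcatMap :: NFData b => (a -> [b]) -> [a] -> [b]
-- > parConcatMap f = concat . runEval . parBuffer 2 rdeepseq . map f
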
-- |
-- For a transaction database, computes the frequency of each item
-- and returns a map associating each item with its frequency.
itemToSupport ::
  Ord item =>
  ItemSet item ->
  [ItemSet item] ->
  Map item ItemSupport
itemToSupport items db =
  Map.fromListWith (+)
    [ (itm, 1)
    | tx <- db
    , itm <- Set.intersection items tx & Set.toList
    ]
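
-- For instance (illustrative only): `itemToSupport {a,b,c} [{a,b}, {b,c}]`
-- counts a: 1, b: 2, c: 1.
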
-- |
-- For a given itemset being extended by a given candidate, compute:
-- - the closure of this itemset
-- - and the candidates for further extension.
computeCandidates ::
  ItemSupport ->
  ItemSupport ->
  ItemSet item ->
  UArray ItemRank ItemSupport ->
  -- | (100% frequent items == closure, candidates for further extension, unfrequent items)
  ([ItemRank], [ItemRank], [ItemRank])
computeCandidates minSupp candidateSupp items rankToSupp =
  let
    (frequentItems, unfreqItems) =
      List.partition
        (\i -> rankToSupp ! i >= minSupp)
        [i | i <- [0 .. Set.size items - 1], rankToSupp ! i > 0]
    (closedFrequentRanks, candidateRanks) =
      List.partition (\i -> rankToSupp ! i == candidateSupp) frequentItems
   in
    (closedFrequentRanks, candidateRanks, unfreqItems)
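
-- A small numeric example (illustrative only): with minSupp = 2, candidateSupp = 2
-- and reduced supports [rank 0 -> 3, rank 1 -> 2, rank 2 -> 1, rank 3 -> 0],
-- the items present are [0, 1, 2]; [0, 1] are frequent and [2] is not;
-- rank 1 has exactly the candidate's support (100% frequent, part of the closure),
-- so the result is ([1], [0], [2]).
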
-- |
-- Modifies an array associating Items with their frequency, in order to
-- give a frequency of 0 to a given list of Items.
--
-- NB: for performance reasons, this is REALLY a modification, made with unsafe operations.
suppressItems ::
  -- | Array associating an item with its frequency
  UArray ItemRank ItemSupport ->
  -- | List of 100% frequent Items
  [ItemRank] ->
  -- | List of unfrequent Items
  [ItemRank] ->
  -- | Initial array, with frequencies of 100% frequent Items
  -- and unfrequent Items set to 0.
  UArray ItemRank ItemSupport
suppressItems rankToSupp closedRanks unfreqRanks =
  runST $ do
    -- Can be used in multithread because no concurrent write
    arr <- unsafeThaw rankToSupp :: ST s (STUArray s ItemRank ItemSupport)
    forM_ closedRanks \i -> writeArray arr i 0
    forM_ unfreqRanks \i -> writeArray arr i 0
    -- Can be used in multithread because no concurrent write
    unsafeFreeze arr
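
-- For instance (illustrative only): starting from supports [rank 0 -> 3, rank 1 -> 2, rank 2 -> 1],
-- `suppressItems` with closed ranks [0] and unfrequent ranks [2] yields [0, 2, 0]:
-- both kinds of items are taken out of the running by zeroing their support.
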
-----------------------------------------------------------------
-- LEXICOGRAPHIC TREE MANIPULATION
-----------------------------------------------------------------

-- |
-- Creates a new, reduced transaction database by eliminating all items
-- greater than the @candidateRank@ item, and all infrequent items.
projectAndReduce ::
  -- | Candidate item, on which the projection is made
  ItemRank ->
  -- | Array associating each item with its frequency in
  -- original transaction database.
  UArray ItemRank ItemSupport ->
  -- | Original transaction database
  LexicoTreeItem ->
  -- | Result : Reduced transaction database
  LexicoTreeItem
projectAndReduce !candidateRank rankToSupp = go
  where
    go Nil = Nil
    go (Node e suiv alt w)
      | e > candidateRank = Nil
      | e == candidateRank =
          let !(suiv', addWeight) = filterInfrequent suiv rankToSupp
           in Node e suiv' Nil (w + addWeight)
      | otherwise =
          let
            !suiv' = go suiv
            !alt' = go alt
           in
            if rankToSupp ! e > 0
              then
                if notNil suiv' && notNil alt'
                  then Node e suiv' alt' 0
                  else if notNil suiv' then Node e suiv' Nil 0 else alt'
              else
                if notNil suiv' && notNil alt'
                  then mergeAlts suiv' alt'
                  else if notNil suiv' then suiv' else alt'

-- |
-- Suppress all infrequent Items from a transaction database expressed as
-- lexicographic tree, and returns a new lexicographic tree.
filterInfrequent ::
  -- | Original transaction database
  LexicoTreeItem ->
  -- | Array associating each item with its frequency in
  -- original transaction database. In this setting,
  -- an infrequent item has a frequency of 0 (because of preprocessing by
  -- 'suppressItems').
  UArray ItemRank ItemSupport ->
  -- | Result : (transaction database without infrequent Items, weight to report in parent nodes)
  (LexicoTreeItem, Weight)
filterInfrequent Nil _ = (Nil, 0)
filterInfrequent (Node e suiv alt w) occs
  | occs ! e > 0 = (Node e suiv' alt' (w + ws), wa)
  | notNil suiv' && notNil alt' = (mergeAlts suiv' alt', w')
  | notNil alt' = (alt', w')
  | notNil suiv' = (suiv', w')
  | otherwise = (Nil, w')
  where
    w' = w + ws + wa
    !(suiv', ws) = filterInfrequent suiv occs
    !(alt', wa) = filterInfrequent alt occs

{-# INLINE notNil #-}
notNil :: LexicoTreeItem -> Bool
notNil Nil = False
notNil _ = True

-- |
-- Occurrence delivering:
-- Map each item of the given database to its support.
rankToSuppLT ::
  ItemSet item ->
  -- | Transaction database (in lexicographic tree format)
  LexicoTreeItem ->
  -- | Result : array associating each item to its frequency.
  UArray ItemRank ItemSupport
rankToSuppLT items dbLT =
  runSTUArray $ do
    arr <- newArray_ (0, Set.size items - 1)
    -- TODO: this workaround should no longer be necessary
    -- Creates an empty array : each item starts with frequency 0
    -- workaround for http://hackage.haskell.org/trac/ghc/ticket/3586
    forM_ [0 .. Set.size items - 1] $ \i -> unsafeWrite arr i 0
    -- Compute frequencies for each item by efficient tree traversal
    _ <- traverseLT dbLT arr
    pure arr

-- |
-- Efficient traversal of the transaction database as a lexicographic tree.
-- Items frequencies are updated on the fly.
traverseLT ::
  -- | Transaction database
  LexicoTreeItem ->
  -- | Array associating each item with its frequency. UPDATED by this function!
  STUArray s ItemRank ItemSupport ->
  ST s ()
traverseLT tree arr = ST $ \s ->
  case go tree s of
    (# s', _ #) -> (# s', () #)
  where
    go Nil s = (# s, 0# #)
    go (Node item child alt w@(I# w#)) s0 =
      case go child s0 of
        (# s1, childw #) ->
          case go alt s1 of
            (# s2, altw #) ->
              case unsafeRead arr item of
                ST readItem ->
                  case readItem s2 of
                    (# _s3, I# itemw #) ->
                      case unsafeWrite arr item (I# itemw + I# childw + w) of
                        ST writeItem ->
                          case writeItem _s3 of
                            (# s4, _ #) -> (# s4, childw +# w# +# altw #)

-- | Type for a lexicographic tree, implementing an n-ary tree over a binary tree.
data LexicoTreeItem
  = -- | Empty tree
    Nil
  | -- | A node : item, next node (next in transaction), alternative node (other branch), weight
    Node
      {-# UNPACK #-} !ItemRank
      !LexicoTreeItem -- NB. experimental strictness annotation
      !LexicoTreeItem -- NB. experimental strictness annotation
      !Weight
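
-- Reading the encoding above (illustrative only): the "next" child chains the items of a
-- single transaction in increasing rank order, while the "alternative" child branches to
-- transactions whose next item differs.  For example, the two transactions {0, 2} and {1}
-- (as ranks, each with weight 1) can be stored as
--   Node 0 (Node 2 Nil Nil 1) (Node 1 Nil Nil 1) 0
-- i.e. 0 -> 2 along the "next" spine, with 1 as an alternative branch of 0.
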
-- |
-- Inserts a transaction in list format into the lexicographic tree.
-- Automatically merges identical transactions.
-- Performs suffix intersection.
insertLT ::
  -- | Transaction to insert into lexicographic tree
  [ItemRank] ->
  -- | "coreI" item, for suffix intersection.
  ItemRank ->
  -- | Weight of the transaction to insert
  Weight ->
  -- | Input lexicographic tree
  LexicoTreeItem ->
  -- | Result : a new lexicographic tree with the transaction inserted
  LexicoTreeItem
insertLT [] _ _ lt = lt
insertLT lst _ w Nil = createPath lst w
insertLT [x] i w (Node e suiv alt weight)
  | x < e = Node x Nil (Node e suiv alt weight) w
  | x == e = Node e suiv alt (weight + w)
  | x > e = Node e suiv (insertLT [x] i w alt) weight
insertLT (x : xs) i w (Node e suiv alt weight)
  | x < e = Node x (createPath xs w) (Node e suiv alt weight) 0
  | x == e =
      if i /= e
        then Node e (insertLT xs i w suiv) alt weight
        else suffixIntersectionLT xs w (Node e suiv alt weight)
  | x > e = Node e suiv (insertLT (x : xs) i w alt) weight
insertLT _ _ _ _ = error "insertLT"
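
-- Two small cases (illustrative only), following the single-item equations above:
--   insertLT [2] (-1) 1 (Node 2 Nil Nil 1) = Node 2 Nil Nil 2                  -- same transaction: weights merge
--   insertLT [1] (-1) 1 (Node 2 Nil Nil 1) = Node 1 Nil (Node 2 Nil Nil 1) 1   -- smaller item: new node in front
-- The coreI argument (-1) never equals an item rank, so no suffix intersection is triggered here.
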
-- |
-- From a transaction and its weight, directly creates a path-shaped lexicographic tree.
createPath ::
  -- | Transaction as a list of items
  [ItemRank] ->
  -- | Weight of the transaction
  Weight ->
  -- | Result : a path-shaped lexicographic tree encoding the transaction
  LexicoTreeItem
createPath [] _ = Nil
createPath [x] w = Node x Nil Nil w
createPath (x : xs) w = Node x (createPath xs w) Nil 0
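
-- For instance: createPath [0, 2, 5] 3 = Node 0 (Node 2 (Node 5 Nil Nil 3) Nil 0) Nil 0
-- (the whole weight is carried by the last node of the path).
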
-- |
-- Perform the "suffix intersection" operation with the suffix of a transaction
-- and the corresponding part of a lexicographic tree.
--
-- For more details, see "prefixIntersection" in Takeaki Uno's papers about LCM.
suffixIntersectionLT ::
  -- | Suffix of the transaction to insert.
  [ItemRank] ->
  -- | Weight of the transaction to insert
  Weight ->
  -- | (Sub-)lexicographic tree where the transaction must be inserted. The @next@ part (see data type comments)
  -- should be a simple path; it will be the target of intersection with the suffix.
  LexicoTreeItem ->
  -- | Result : lexicographic tree where the suffix has been added, with correct intersections performed.
  LexicoTreeItem
suffixIntersectionLT _ w (Node e Nil alt weight) = Node e Nil alt (weight + w)
suffixIntersectionLT lst w (Node e suiv alt weight) =
  let (newSuiv, addWeight) = suffInterSuiv lst w suiv
   in Node e newSuiv alt (weight + addWeight)
suffixIntersectionLT _ _ _ = error "suffixIntersectionLT"

-- |
-- Intersects a list-shaped transaction and a path-shaped lexicographic tree.
-- The result is a path-shaped lexicographic tree with weights correctly updated.
suffInterSuiv ::
  -- | Transaction as list
  [ItemRank] ->
  -- | Weight of the above transaction
  Weight ->
  -- | Path-shaped lexicographic tree
  LexicoTreeItem ->
  -- | Result : (path-shaped lexicographic tree representing the intersection
  -- of the transaction and the input path, 0 if the intersection is non-empty,
  -- otherwise the sum of weights)
  (LexicoTreeItem, Int)
suffInterSuiv lst w suiv =
  let
    (lstSuiv, weightSuiv) = getLstSuiv suiv
    inter = List.intersect lstSuiv lst
   in
    if not (List.null inter)
      then (createPath inter (weightSuiv + w), 0)
      else (Nil, weightSuiv + w)
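
-- For instance (illustrative only): intersecting the transaction suffix [4, 8] (weight 1)
-- with the path createPath [2, 4, 6] 1 keeps only the common item 4 and gathers both weights:
--   suffInterSuiv [4, 8] 1 (createPath [2, 4, 6] 1) = (Node 4 Nil Nil 2, 0)
-- With no common item at all, the path disappears and its weight is reported instead:
--   suffInterSuiv [8] 1 (createPath [2, 6] 1) = (Nil, 2)
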
-- |
-- Collects all the nodes of a path-shaped lexicographic tree into a list of elements.
getLstSuiv ::
  -- | Path-shaped lexicographic tree.
  LexicoTreeItem ->
  -- | Result : (list of elements in the path, sum of weights of nodes in the path)
  ([ItemRank], Int)
getLstSuiv Nil = ([], 0)
getLstSuiv (Node e suiv Nil weight) =
  let (lst, w) = getLstSuiv suiv
   in (e : lst, w + weight)
getLstSuiv _ = error "getLstSuiv"

-- | Merge two lexicographic trees.
mergeAlts :: LexicoTreeItem -> LexicoTreeItem -> LexicoTreeItem
mergeAlts Nil lt = lt
mergeAlts lt Nil = lt
mergeAlts (Node e1 suiv1 alt1 w1) (Node e2 suiv2 alt2 w2)
  | e1 < e2 = Node e1 suiv1 (mergeAlts alt1 (Node e2 suiv2 alt2 w2)) w1
  | e1 > e2 = Node e2 suiv2 (mergeAlts (Node e1 suiv1 alt1 w1) alt2) w2
  | e1 == e2 = Node e1 (mergeAlts suiv1 suiv2) (mergeAlts alt1 alt2) (w1 + w2)
mergeAlts _ _ = error "mergeAlts"
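
-- For instance (illustrative only): merging the one-transaction trees for {0, 2} and {1}
--   mergeAlts (createPath [0, 2] 1) (createPath [1] 1)
--     = Node 0 (Node 2 Nil Nil 1) (Node 1 Nil Nil 1) 0
-- the smaller root item keeps its "next" spine and the other tree becomes its alternative branch.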