本文目录导读:

我来详细介绍Python实现数据关联规则(如Apriori算法和FP-Growth算法)的几种方法。
使用mlxtend库实现Apriori算法
安装所需库
pip install mlxtend pandas
完整案例代码
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules
# 示例数据:购物篮数据
dataset = [
['牛奶', '面包', '黄油'],
['啤酒', '面包'],
['牛奶', '尿布', '啤酒', '可乐'],
['面包', '牛奶', '尿布', '啤酒'],
['面包', '黄油', '牛奶'],
['啤酒', '尿布'],
['牛奶', '尿布', '啤酒', '面包'],
['尿布', '啤酒', '可乐']
]
# 1. 数据预处理:将交易数据转换为One-Hot编码格式
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
print("转换后的数据格式:")
print(df.head())
# 2. 挖掘频繁项集
# 设置最小支持度阈值
min_support = 0.3
frequent_itemsets = apriori(df, min_support=min_support, use_colnames=True)
print(f"\n频繁项集(支持度 >= {min_support}):")
print(frequent_itemsets)
# 3. 生成关联规则
# 设置最小置信度和提升度
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
print("\n关联规则:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
# 4. 添加更多评估指标
rules_with_metrics = association_rules(
frequent_itemsets,
metric="lift",
min_threshold=1.0
)
print("\n带有完整评估指标的规则:")
print(rules_with_metrics[['antecedents', 'consequents', 'support', 'confidence', 'lift', 'leverage', 'conviction']])
自定义Apriori算法实现
from itertools import combinations
from collections import Counter
import pandas as pd
class AprioriCustom:
def __init__(self, min_support=0.3, min_confidence=0.6):
self.min_support = min_support
self.min_confidence = min_confidence
self.frequent_itemsets = []
self.rules = []
def fit(self, transactions):
# 数据预处理
self.transactions = transactions
self.num_transactions = len(transactions)
# 步骤1:生成频繁1项集
item_counts = Counter()
for transaction in transactions:
for item in transaction:
item_counts[item] += 1
# 过滤支持度小于阈值的项
self.frequent_itemsets_1 = {
frozenset([item]): count / self.num_transactions
for item, count in item_counts.items()
if count / self.num_transactions >= self.min_support
}
# 存储所有频繁项集
self.frequent_itemsets = [self.frequent_itemsets_1]
# 步骤2:生成更长的频繁项集
k = 2
current_itemsets = list(self.frequent_itemsets_1.keys())
while current_itemsets:
# 生成候选k项集
candidates = self._generate_candidates(current_itemsets, k)
# 计算支持度
frequent_k = {}
for candidate in candidates:
support = self._calculate_support(candidate)
if support >= self.min_support:
frequent_k[candidate] = support
if frequent_k:
self.frequent_itemsets.append(frequent_k)
current_itemsets = list(frequent_k.keys())
k += 1
else:
break
# 步骤3:生成关联规则
self._generate_rules()
return self
def _generate_candidates(self, itemsets, k):
"""生成候选k项集"""
candidates = set()
for i in range(len(itemsets)):
for j in range(i + 1, len(itemsets)):
# 合并两个k-1项集
candidate = itemsets[i] | itemsets[j]
if len(candidate) == k:
# 检查所有k-1子集是否都是频繁的
all_subsets_frequent = True
for subset in combinations(candidate, k-1):
if frozenset(subset) not in itemsets:
all_subsets_frequent = False
break
if all_subsets_frequent:
candidates.add(candidate)
return list(candidates)
def _calculate_support(self, itemset):
"""计算项集的支持度"""
count = sum(1 for transaction in self.transactions
if itemset.issubset(set(transaction)))
return count / self.num_transactions
def _generate_rules(self):
"""生成关联规则"""
self.rules = []
# 遍历所有频繁项集(长度>=2)
for itemset_dict in self.frequent_itemsets[1:]: # 跳过频繁1项集
for itemset, support in itemset_dict.items():
# 生成所有可能的规则
for i in range(1, len(itemset)):
for antecedent in combinations(list(itemset), i):
antecedent = set(antecedent)
consequent = itemset - antecedent
if antecedent and consequent:
# 计算置信度
antecedent_support = self._calculate_support(antecedent)
confidence = support / antecedent_support
if confidence >= self.min_confidence:
# 计算提升度
consequent_support = self._calculate_support(consequent)
lift = confidence / consequent_support
self.rules.append({
'antecedents': antecedent,
'consequents': consequent,
'support': support,
'confidence': confidence,
'lift': lift
})
# 使用自定义Apriori算法
if __name__ == "__main__":
# 使用前面的数据集
dataset = [
['牛奶', '面包', '黄油'],
['啤酒', '面包'],
['牛奶', '尿布', '啤酒', '可乐'],
['面包', '牛奶', '尿布', '啤酒'],
['面包', '黄油', '牛奶'],
['啤酒', '尿布'],
['牛奶', '尿布', '啤酒', '面包'],
['尿布', '啤酒', '可乐']
]
# 训练模型
apriori = AprioriCustom(min_support=0.3, min_confidence=0.6)
apriori.fit(dataset)
# 显示频繁项集
print("频繁项集:")
for i, itemset_dict in enumerate(apriori.frequent_itemsets):
print(f"\n{i+1}-项集:")
for itemset, support in itemset_dict.items():
print(f" {set(itemset)}: 支持度={support:.2f}")
# 显示关联规则
print(f"\n关联规则(置信度 >= {apriori.min_confidence}):")
for rule in apriori.rules:
print(f"{rule['antecedents']} -> {rule['consequents']}: "
f"支持度={rule['support']:.2f}, "
f"置信度={rule['confidence']:.2f}, "
f"提升度={rule['lift']:.2f}")
实际应用案例:电商购物篮分析
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder
def create_sample_ecommerce_data():
"""创建模拟电商数据"""
data = {
'Transaction_ID': [1, 1, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 7, 7, 8, 8],
'Product': ['手机', '手机壳', '耳机', '手机', '充电器',
'笔记本电脑', '鼠标', '键盘', '电脑包',
'手机', '耳机', '充电器', '笔记本电脑', '鼠标',
'手机', '手机壳', '笔记本电脑', '鼠标', '键盘',
'耳机', '充电器']
}
return pd.DataFrame(data)
def ecommerce_basket_analysis():
"""电商购物篮分析"""
# 加载数据
df = create_sample_ecommerce_data()
# 将数据转换为购物篮格式
basket = df.groupby('Transaction_ID')['Product'].apply(list).values.tolist()
# 数据预处理
te = TransactionEncoder()
te_ary = te.fit(basket).transform(basket)
df_basket = pd.DataFrame(te_ary, columns=te.columns_)
# 挖掘频繁项集
frequent_itemsets = apriori(df_basket, min_support=0.2, use_colnames=True)
# 生成关联规则
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
# 筛选高质量规则
high_quality_rules = rules[(rules['lift'] > 1.5) &
(rules['confidence'] > 0.5)]
print("=== 电商购物篮分析结果 ===")
print(f"\n总交易数: {len(df_basket)}")
print(f"发现频繁项集: {len(frequent_itemsets)}")
print(f"发现关联规则: {len(rules)}")
print(f"高质量规则: {len(high_quality_rules)}")
print("\n=== 高质量关联规则 ===")
for i, rule in high_quality_rules.iterrows():
antecedents = set(rule['antecedents'])
consequents = set(rule['consequents'])
print(f"\n规则 {i+1}: {antecedents} -> {consequents}")
print(f" 支持度: {rule['support']:.2%}")
print(f" 置信度: {rule['confidence']:.2%}")
print(f" 提升度: {rule['lift']:.2f}")
# 可视化规则
try:
import matplotlib.pyplot as plt
# 绘制提升度分布
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
# 支持度 vs 置信度散点图
ax1.scatter(rules['support'], rules['confidence'], alpha=0.5)
ax1.set_xlabel('支持度')
ax1.set_ylabel('置信度')
ax1.set_title('关联规则:支持度 vs 置信度')
# 提升度分布
ax2.hist(rules['lift'], bins=20, alpha=0.7)
ax2.set_xlabel('提升度')
ax2.set_ylabel('频率')
ax2.set_title('提升度分布')
plt.tight_layout()
plt.show()
except ImportError:
print("\n请安装matplotlib以查看可视化结果")
# 运行分析
if __name__ == "__main__":
ecommerce_basket_analysis()
使用FP-Growth算法(大数据集推荐)
from mlxtend.frequent_patterns import fpgrowth
import pandas as pd
def fpgrowth_example():
"""使用FP-Growth算法(比Apriori更高效)"""
from mlxtend.preprocessing import TransactionEncoder
# 示例数据
dataset = [
['牛奶', '面包', '黄油'],
['啤酒', '面包'],
['牛奶', '尿布', '啤酒', '可乐'],
['面包', '牛奶', '尿布', '啤酒'],
['面包', '黄油', '牛奶']
]
# 数据预处理
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)
# FP-Growth挖掘频繁项集
frequent_itemsets = fpgrowth(df, min_support=0.3, use_colnames=True)
print("FP-Growth挖掘到的频繁项集:")
print(frequent_itemsets)
# 生成关联规则
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)
print("\n生成的关联规则:")
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
# 运行FP-Growth示例
if __name__ == "__main__":
fpgrowth_example()
实用技巧和建议
参数调优建议
def parameter_tuning_tips():
"""关联规则参数调优建议"""
tips = {
"min_support": {
"范围": "0.01-0.5",
"建议": "数据量大用较小值(0.01-0.1),数据量小用较大值(0.1-0.3)",
"影响": "值越小,发现的规则越多,但可能包含噪声"
},
"min_confidence": {
"范围": "0.5-0.9",
"建议": "一般从0.6开始尝试",
"影响": "值越大,规则越可靠,但可能遗漏重要规则"
},
"min_lift": {
"范围": "1.0-3.0",
"建议": "大于1.3表示规则有实际意义",
"影响": "提升度>1表示正相关,<1表示负相关"
}
}
print("=== 参数调优指南 ===")
for param, info in tips.items():
print(f"\n{param}:")
for key, value in info.items():
print(f" {key}: {value}")
# 使用
parameter_tuning_tips()
性能优化建议
def performance_tips():
"""性能优化建议"""
print("=== 性能优化建议 ===")
print("""
1. 对于大数据集,优先使用FP-Growth而非Apriori
2. 适当提高min_support可以减少计算量
3. 使用数据采样进行初步探索
4. 考虑使用数据分区策略
5. 对稀疏数据使用压缩表示方法
6. 使用并行计算框架(如Spark MLlib)处理超大数据集
""")
# 使用
performance_tips()
Python实现关联规则挖掘的主要步骤:
- 数据预处理:将交易数据转换为One-Hot编码
- 频繁项集挖掘:使用Apriori或FP-Growth算法
- 规则生成:根据频繁项集生成关联规则
- 规则评估:使用支持度、置信度、提升度等指标筛选规则
- 结果解释:解释和可视化发现的规则
选择合适的算法:
- 小数据集:Apriori(简单易用)
- 大数据集:FP-Growth(更高效)
- 商业应用:mlxtend库(可靠、功能完善)