本文目录导读:

下面介绍几种用 Python 实现数据脱敏的常用方法:
基础脱敏方法
手机号脱敏
import re
def mask_phone(phone):
    """Mask the middle four digits of a mobile number with ``****``.

    The value is converted with ``str()`` first. Each non-overlapping match
    of eleven consecutive digits in the text is rewritten so that only its
    first three and last four digits stay visible. Text without such a match
    is returned unchanged (as a string).
    """
    eleven_digits = re.compile(r'(\d{3})\d{4}(\d{4})')
    text = str(phone)
    return eleven_digits.sub(r'\1****\2', text)
# Example
phone = "13812345678"
print(mask_phone(phone)) # Output: 138****5678
身份证号脱敏
def mask_id_card(id_card):
    """Mask an 18-character ID card number, keeping the first 6 and last 4.

    The value is converted with ``str()`` before its length is checked, so an
    ID number passed as an ``int`` is masked instead of raising ``TypeError``
    (``mask_phone`` and ``mask_bank_card`` coerce their input the same way).

    Args:
        id_card: The ID card number, normally an 18-character string.

    Returns:
        The masked string (6 visible characters, 8 asterisks, 4 visible
        characters) when the text is exactly 18 characters long; otherwise
        the original ``id_card`` object, unchanged.
    """
    text = str(id_card)
    if len(text) != 18:
        # Unrecognised length: hand the caller's value back untouched.
        return id_card
    return text[:6] + "********" + text[-4:]
# Example
id_card = "110101199001011234"
print(mask_id_card(id_card)) # Output: 110101********1234
邮箱脱敏
def mask_email(email):
    """Mask the local part of an e-mail address, keeping its first character.

    The address is split on ``@``. Only a value with exactly one ``@`` is
    masked: every character of the local part after the first becomes ``*``
    and the domain is kept as-is. Any other value is returned unchanged.
    """
    pieces = email.split('@')
    if len(pieces) != 2:
        return email
    username, domain = pieces
    # Pad the first character with '*' back up to the original length; an
    # empty or one-character local part comes out unchanged.
    username = username[:1].ljust(len(username), '*')
    return f"{username}@{domain}"
# Example
email = "zhangsan@example.com"
print(mask_email(email)) # Output: z*******@example.com
银行卡号脱敏
def mask_bank_card(card_number):
    """Mask a bank card number, keeping its first 4 and last 4 characters.

    The value is converted with ``str()`` and all spaces are removed. If at
    least 8 characters remain, the result is the first four characters, the
    literal `` **** **** `` and the last four characters; shorter values are
    returned with only the spaces removed.
    """
    compact = "".join(str(card_number).split(' '))
    if len(compact) < 8:
        return compact
    head, tail = compact[:4], compact[-4:]
    return head + " **** **** " + tail
# Example
card = "6222021234567890"
print(mask_bank_card(card)) # Output: 6222 **** **** 7890
通用脱敏函数
def data_masking(data, mask_type='phone'):
    """Mask a single sensitive value according to ``mask_type``.

    Args:
        data: The value to mask. Non-string values are converted with
            ``str()`` before masking.
        mask_type: One of ``phone``, ``id_card``, ``email``, ``name`` or
            ``bank_card`` (case-insensitive).

    Returns:
        The masked string. ``data`` itself is returned unchanged when it is
        falsy (``None``, ``""``, ``0``), when it is a float NaN, or when
        ``mask_type`` is not one of the supported types.
    """
    # A float NaN (e.g. a missing cell in a DataFrame column) is truthy, so
    # without this guard it would be stringified and "masked" ("n*n" for a
    # name). NaN is the only value that compares unequal to itself.
    if isinstance(data, float) and data != data:
        return data
    if not data:
        return data

    kind = mask_type.lower()
    text = str(data)

    if kind == 'phone':
        # Keep the first 3 and last 4 of each 11-digit match.
        return re.sub(r'(\d{3})\d{4}(\d{4})', r'\1****\2', text)

    if kind == 'id_card':
        # Only 18-character values are masked: first 6 and last 4 stay.
        if len(text) == 18:
            return text[:6] + "********" + text[-4:]
        return text

    if kind == 'email':
        # Only values with exactly one '@' are masked.
        parts = text.split('@')
        if len(parts) == 2:
            username, domain = parts
            if len(username) > 1:
                username = username[0] + '*' * (len(username) - 1)
            return f"{username}@{domain}"
        return text

    if kind == 'name':
        # Two characters: keep the first. Three or more: keep the first and
        # the last and star everything between. One character: unchanged.
        if len(text) == 2:
            return text[0] + '*'
        if len(text) >= 3:
            return text[0] + '*' * (len(text) - 2) + text[-1]
        return text

    if kind == 'bank_card':
        # NOTE(review): unlike mask_bank_card, no spaces are placed around
        # the asterisks here; confirm which format is intended.
        card_str = text.replace(' ', '')
        if len(card_str) >= 8:
            return card_str[:4] + "****" + card_str[-4:]
        return card_str

    # Unsupported mask_type: the value is passed through unmasked.
    return data
# Usage examples
print(data_masking("张三", 'name')) # Output: 张*
print(data_masking("13812345678", 'phone')) # Output: 138****5678
使用第三方库实现
使用 faker 库
# 安装: pip install faker
from faker import Faker
def generate_fake_data(real_data, type='phone'):
    """Return a Faker-generated substitute for a real value.

    A ``Faker('zh_CN')`` instance is created on every call. For ``type``
    equal to ``phone``, ``name``, ``email`` or ``id_card`` the matching Faker
    method (``phone_number``, ``name``, ``email``, ``ssn``) is called and its
    result returned. For any other ``type`` the original ``real_data`` is
    returned unchanged.
    """
    fake = Faker('zh_CN')
    # Lambdas defer the Faker call until the matching type is found.
    producers = (
        ('phone', lambda: fake.phone_number()),
        ('name', lambda: fake.name()),
        ('email', lambda: fake.email()),
        ('id_card', lambda: fake.ssn()),
    )
    for label, produce in producers:
        if type == label:
            return produce()
    return real_data
# Example
real_phone = "13812345678"
fake_phone = generate_fake_data(real_phone, 'phone')
print(f"真实: {real_phone} -> 虚假: {fake_phone}")
使用 desensitization 库
# Install: pip install desensitization
# NOTE(review): the package name and the Desensitization API used below
# cannot be verified from this file; confirm both before relying on them.
from desensitization import Desensitization
des = Desensitization()
# Mask a phone number
phone = des.phone('13812345678') # Returns: 138****5678
# Mask an ID card number
id_card = des.id_card('110101199001011234') # Returns: 110101**********34
# Mask a bank card number
bank_card = des.bank_card('6222021234567890') # Returns: 6222 **** **** 7890
批量数据处理
import pandas as pd
def batch_data_masking(df, mask_config):
    """Return a copy of ``df`` with the configured columns masked.

    Args:
        df: A pandas DataFrame. It is copied, not modified.
        mask_config: Dict of ``{column name: mask type}``. Entries whose
            column is not present in ``df`` are skipped.

    Returns:
        The copied DataFrame, with ``data_masking`` applied to every value
        of each configured column.
    """
    result = df.copy()
    for col_name, rule in mask_config.items():
        if col_name not in result.columns:
            continue
        original_values = result[col_name]
        result[col_name] = original_values.apply(
            lambda cell, rule=rule: data_masking(cell, rule)
        )
    return result
# Example usage: build a small DataFrame of sample records
data = {
'姓名': ['张三', '李四', '王五'],
'手机号': ['13812345678', '13987654321', '13711112222'],
'身份证': ['110101199001011234', '110101199002022345', '110101199003033456'],
'邮箱': ['zhangsan@example.com', 'lisi@test.com', 'wangwu@demo.com']
}
df = pd.DataFrame(data)
print("原始数据:")
print(df)
# Masking rules: column name -> mask type
mask_config = {
'姓名': 'name',
'手机号': 'phone',
'身份证': 'id_card',
'邮箱': 'email'
}
df_masked = batch_data_masking(df, mask_config)
print("\n脱敏后数据:")
print(df_masked)
完整的脱敏工具类
class DataMaskingTool:
    """Mask sensitive values one at a time or across DataFrame columns.

    Supported mask types are the keys of ``self.mask_functions``: ``phone``,
    ``id_card``, ``email``, ``name``, ``bank_card`` and ``address``.
    """

    def __init__(self):
        # Dispatch table: mask type -> bound method implementing it.
        self.mask_functions = {
            'phone': self._mask_phone,
            'id_card': self._mask_id_card,
            'email': self._mask_email,
            'name': self._mask_name,
            'bank_card': self._mask_bank_card,
            'address': self._mask_address
        }

    def _mask_phone(self, phone):
        """Keep the first 3 and last 4 digits of each 11-digit match."""
        return re.sub(r'(\d{3})\d{4}(\d{4})', r'\1****\2', str(phone))

    def _mask_id_card(self, id_card):
        """Keep the first 6 and last 4 characters of an 18-character ID.

        Values of any other length are returned as a string, unmasked.
        """
        text = str(id_card)
        if len(text) == 18:
            return text[:6] + "********" + text[-4:]
        return text

    def _mask_email(self, email):
        """Star all but the first character of the local part.

        Only values with exactly one ``@`` are masked; anything else is
        returned as a string, unmasked.
        """
        text = str(email)
        parts = text.split('@')
        if len(parts) == 2:
            username, domain = parts
            if len(username) > 1:
                username = username[0] + '*' * (len(username) - 1)
            return f"{username}@{domain}"
        return text

    def _mask_name(self, name):
        """Mask a personal name.

        Two characters: keep the first. Three or more: keep the first and
        last and star everything between. Shorter names are unchanged.
        """
        name = str(name)
        if len(name) == 2:
            return name[0] + '*'
        if len(name) >= 3:
            return name[0] + '*' * (len(name) - 2) + name[-1]
        return name

    def _mask_bank_card(self, card_number):
        """Keep the first 4 and last 4 characters, after removing spaces.

        Values shorter than 8 characters are returned with only the spaces
        removed.
        """
        card_str = str(card_number).replace(' ', '')
        if len(card_str) >= 8:
            return card_str[:4] + " **** **** " + card_str[-4:]
        return card_str

    def _mask_address(self, address):
        """Keep the first six characters of an address and append ``****``.

        Six characters is a fixed cut-off, not a parsed
        province/city/district boundary. Addresses of six characters or
        fewer are returned unchanged.
        """
        address = str(address)
        if len(address) > 6:
            return address[:6] + '****'
        return address

    def mask(self, data, mask_type='phone'):
        """Mask ``data`` with the function registered for ``mask_type``.

        Args:
            data: The value to mask.
            mask_type: A key of ``self.mask_functions`` (case-insensitive).

        Returns:
            The masked string. ``data`` itself is returned unchanged when it
            is falsy, when it is a float NaN, or when ``mask_type`` is not
            registered.
        """
        # A float NaN (e.g. a missing DataFrame cell) is truthy, so without
        # this guard it would be stringified and "masked" ("n*n" for a
        # name). NaN is the only value that compares unequal to itself.
        if isinstance(data, float) and data != data:
            return data
        if not data:
            return data
        mask_func = self.mask_functions.get(mask_type.lower())
        if mask_func:
            return mask_func(data)
        return data

    def mask_dataframe(self, df, mask_config):
        """Return a copy of ``df`` with the configured columns masked.

        Args:
            df: A pandas DataFrame. It is copied, not modified.
            mask_config: Dict of ``{column name: mask type}``. Entries whose
                column is not present in ``df`` are skipped.
        """
        df_masked = df.copy()
        for column, mask_type in mask_config.items():
            if column in df_masked.columns:
                df_masked[column] = df_masked[column].apply(
                    lambda x, mask_type=mask_type: self.mask(x, mask_type)
                )
        return df_masked
# Usage examples
tool = DataMaskingTool()
# Mask single values
print(tool.mask("13812345678", "phone"))
print(tool.mask("张三", "name"))
print(tool.mask("北京市海淀区中关村大街1号", "address"))
# Mask a whole DataFrame
# df_masked = tool.mask_dataframe(df, mask_config)
使用建议
- 根据场景选择:生产环境建议使用成熟的第三方库
- 可逆性考虑:如果需要还原,考虑使用加密而非脱敏
- 安全性:脱敏后的数据要确保无法通过关联分析还原
- 性能优化:大批量处理时考虑并行处理
- 日志记录:记录脱敏操作日志,便于审计(注意日志中不要写入脱敏前的原始数据)
这些方法可以根据实际需求组合使用,满足不同的数据脱敏场景。