本文目录导读:

是的,实用脚本完全可以实现批量匹配功能,根据你的具体需求,有多种实现方式,下面我整理了几种常见场景的批量匹配脚本方案:
文本批量匹配(正则表达式)
Python示例:批量匹配文件中的邮箱
import re
import os
def batch_match_emails(directory, output_file):
    """Collect the e-mail addresses found in a directory's ``.txt`` files.

    Every regular file directly inside *directory* whose name ends with
    ``.txt`` is read as UTF-8 and scanned for e-mail addresses. The unique
    addresses are written to *output_file*, sorted, one per line.

    Args:
        directory: Directory to scan (not recursive).
        output_file: Path of the file to write; overwritten if it exists.

    Raises:
        OSError: If the directory or one of the files cannot be read, or the
            output file cannot be written.
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    # Compile once instead of looking the pattern up for every file.
    email_pattern = re.compile(
        r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    )
    all_emails = set()
    for filename in os.listdir(directory):
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(directory, filename)
        # A sub-directory may also be named "*.txt"; opening it would fail.
        if not os.path.isfile(filepath):
            continue
        with open(filepath, 'r', encoding='utf-8') as f:
            all_emails.update(email_pattern.findall(f.read()))

    # Write with the same encoding the inputs are read with, instead of the
    # platform default.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(f'{email}\n' for email in sorted(all_emails))
# Usage: scan ./data and write the addresses found to ./matched_emails.txt
batch_match_emails(directory='./data', output_file='./matched_emails.txt')
数据表批量匹配(CSV/Excel)
用pandas进行模糊匹配
import pandas as pd
from fuzzywuzzy import fuzz, process
def batch_fuzzy_match(source_df, target_df, column_name, threshold=80):
    """Fuzzy-match each source value against the target column.

    For every value in ``source_df[column_name]`` the best-scoring candidate
    from ``target_df[column_name]`` is looked up with
    ``process.extractOne`` using ``fuzz.token_sort_ratio``.

    Args:
        source_df: DataFrame holding the values to look up.
        target_df: DataFrame holding the candidate values.
        column_name: Column present in both frames that is compared.
        threshold: Minimum score for a candidate to count as a match.

    Returns:
        A DataFrame with one row per source value and the columns
        ``source``, ``match`` and ``score``. Rows without a candidate at or
        above *threshold* have ``match`` set to None and ``score`` set to 0.
    """
    # The candidate list does not change between iterations; build it once.
    choices = target_df[column_name].tolist()

    results = []
    for source_value in source_df[column_name]:
        # Find the most similar candidate in the target data.
        best_match = process.extractOne(
            source_value,
            choices,
            scorer=fuzz.token_sort_ratio,
        )
        if best_match and best_match[1] >= threshold:
            results.append({
                'source': source_value,
                'match': best_match[0],
                'score': best_match[1],
            })
        else:
            results.append({
                'source': source_value,
                'match': None,
                'score': 0,
            })
    # Name the columns explicitly so an empty source still yields a frame
    # with the expected schema.
    return pd.DataFrame(results, columns=['source', 'match', 'score'])
# Usage: match the company names in source.csv against target.csv and
# save the result table to match_results.csv.
source = pd.read_csv('source.csv')
target = pd.read_csv('target.csv')
result = batch_fuzzy_match(
    source_df=source,
    target_df=target,
    column_name='company_name',
)
result.to_csv('match_results.csv')
文件批量匹配(按内容/名称)
Shell脚本:批量查找包含特定内容的文件
#!/bin/bash
# List the files under MATCH_DIR whose contents contain PATTERN.

MATCH_DIR="/var/log/app"
PATTERN="ERROR"
OUTPUT_FILE="error_files.txt"

# Search recursively; -l prints only the names of the matching files.
# "--" stops a pattern that starts with "-" from being read as an option.
grep -rl -- "$PATTERN" "$MATCH_DIR" > "$OUTPUT_FILE"

# Report how many files matched.
echo "共找到 $(wc -l < "$OUTPUT_FILE") 个匹配文件"
图像/文件去重匹配
Python:批量匹配相似图片
import os
from PIL import Image
import imagehash
def batch_image_match(directory, threshold=10):
    """Find pairs of similar images in a directory.

    A perceptual hash (``imagehash.phash``) is computed for every file
    directly inside *directory* whose name ends with ``.png``, ``.jpg`` or
    ``.jpeg`` (case-insensitive). Every pair of hashes is then compared.

    Args:
        directory: Directory to scan (not recursive).
        threshold: A pair is reported when the difference between its two
            hashes is strictly below this value; a smaller difference means
            the images are more similar.

    Returns:
        A list of ``(filename_a, filename_b, difference)`` tuples, in the
        order the files were listed.
    """
    from itertools import combinations

    # Hash every image once.
    hash_dict = {}
    for filename in os.listdir(directory):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):
            filepath = os.path.join(directory, filename)
            with Image.open(filepath) as img:
                hash_dict[filename] = imagehash.phash(img)

    # Compare each unordered pair exactly once. The number of comparisons
    # grows quadratically with the number of images.
    duplicates = []
    for (name_a, hash_a), (name_b, hash_b) in combinations(hash_dict.items(), 2):
        difference = hash_a - hash_b
        if difference < threshold:
            duplicates.append((name_a, name_b, difference))
    return duplicates
# Usage: report every pair of images in ./images whose hash difference
# is below 15.
result = batch_image_match('./images', threshold=15)
for img1, img2, diff in result:
    print(f"相似图片:{img1} 和 {img2},差异值:{diff}")
批量匹配任意模式(通用框架)
可配置的批量匹配脚本
import re
from typing import List, Dict, Any
class BatchMatcher:
    """Apply a fixed list of regular expressions to text and to files.

    Args:
        patterns: Regular-expression source strings; each is compiled once
            when the matcher is created.
        case_sensitive: When False (the default) every pattern is compiled
            with ``re.IGNORECASE``.
    """

    def __init__(self, patterns: List[str], case_sensitive=False):
        flags = 0 if case_sensitive else re.IGNORECASE
        self.patterns = [re.compile(p, flags) for p in patterns]

    def match_text(self, text: str) -> Dict[str, List[str]]:
        """Run every pattern against *text*.

        Returns:
            A dict keyed ``'pattern_<i>'``, where ``<i>`` is the pattern's
            index in the list given to the constructor, mapped to the result
            of ``findall``. Patterns without any match are omitted.
        """
        results = {}
        for i, pattern in enumerate(self.patterns):
            matches = pattern.findall(text)
            if matches:
                results[f'pattern_{i}'] = matches
        return results

    def batch_match_files(self, file_list: List[str]) -> Dict[str, Dict]:
        """Run every pattern against each file in *file_list*.

        Files are read as UTF-8. A file that cannot be opened or decoded is
        reported on stdout and skipped, so one bad file does not stop the
        batch.

        Returns:
            A dict mapping each file path that produced at least one match
            to its ``match_text`` result.
        """
        all_results = {}
        for filepath in file_list:
            # Guard only the file I/O. ValueError covers UnicodeDecodeError
            # and malformed paths.
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    content = f.read()
            except (OSError, ValueError) as e:
                print(f"Error processing {filepath}: {e}")
                continue
            matches = self.match_text(content)
            if matches:
                all_results[filepath] = matches
        return all_results
# Usage example
matcher = BatchMatcher([
    r'\d{3}-\d{2}-\d{4}',  # SSN format
    # Personal name: two capitalised words. The scoped (?-i:...) group keeps
    # this pattern case-sensitive even though the matcher defaults to
    # IGNORECASE; without it any two adjacent words would match.
    r'\b(?-i:[A-Z][a-z]+ [A-Z][a-z]+)\b',
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),])+',  # URL
])
files = ['doc1.txt', 'doc2.txt', 'doc3.txt']
results = matcher.batch_match_files(files)
for file, matches in results.items():
    print(f"\n在 {file} 中找到:")
    for pattern, matched_items in matches.items():
        print(f" {pattern}: {matched_items}")
实用建议
- 性能优化:处理大量数据时,使用生成器而不是一次性加载所有数据
- 错误处理:添加异常捕获和日志记录
- 并行处理:对大量文件可使用多线程/多进程
- 交互界面:简单场景可用 argparse 添加命令行参数
你可以告诉我具体的匹配场景(如匹配什么类型的数据、数据量多大、输出要求等),我可以帮你定制更精准的脚本。