实用脚本能批量匹配吗?

wen 实用脚本 9

本文目录导读:

实用脚本能批量匹配吗?

  1. 文本内容批量匹配(正则表达式)
  2. 数据表批量匹配(CSV/Excel)
  3. 文件批量匹配(按内容/名称)
  4. 图像/文件去重匹配
  5. 批量匹配任意模式(通用框架)
  6. 实用建议

是的,实用脚本完全可以实现批量匹配功能。根据具体需求,有多种实现方式;下面整理了几种常见场景的批量匹配脚本方案:

文本内容批量匹配(正则表达式)

Python示例:批量匹配文件中的邮箱

import re
import os
def batch_match_emails(directory, output_file):
    """Collect the unique e-mail addresses found in a directory's .txt files.

    Only the entries directly inside ``directory`` are considered (no
    recursion). Every regular file whose name ends in ``.txt`` is read as
    UTF-8, and the de-duplicated addresses are written to ``output_file``
    in sorted order, one per line.

    Args:
        directory: Path of the directory to scan.
        output_file: Path of the file that is (over)written with the result.

    Raises:
        OSError: If the directory or one of the files cannot be accessed.
        UnicodeDecodeError: If a ``.txt`` file is not valid UTF-8.
    """
    email_pattern = re.compile(
        r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    )
    all_emails = set()
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        # A sub-directory can also be named "*.txt"; opening it would raise
        # and abort the whole batch, so only regular files are read.
        if not filename.endswith('.txt') or not os.path.isfile(filepath):
            continue
        with open(filepath, 'r', encoding='utf-8') as f:
            all_emails.update(email_pattern.findall(f.read()))
    # Use the same explicit encoding for writing as for reading, instead of
    # the platform-dependent default.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(email + '\n' for email in sorted(all_emails))
# Usage: scan ./data and write the addresses found to ./matched_emails.txt
batch_match_emails(directory='./data', output_file='./matched_emails.txt')

数据表批量匹配(CSV/Excel)

用 pandas 和 fuzzywuzzy 进行模糊匹配

import pandas as pd
from fuzzywuzzy import fuzz, process
def batch_fuzzy_match(source_df, target_df, column_name, threshold=80):
    """Fuzzy-match every value of a source column against a target column.

    For each value in ``source_df[column_name]`` the closest candidate from
    ``target_df[column_name]`` is looked up with ``process.extractOne``
    using the ``fuzz.token_sort_ratio`` scorer.

    Args:
        source_df: DataFrame holding the values to look up.
        target_df: DataFrame holding the candidate values.
        column_name: Column present in both frames that is compared.
        threshold: Minimum score a candidate needs to count as a match.

    Returns:
        A DataFrame with one row per source value and the columns
        ``source``, ``match`` and ``score``. Rows without an acceptable
        candidate have ``match`` set to ``None`` and ``score`` set to 0.
        The columns are present even when the source is empty.
    """
    columns = ['source', 'match', 'score']
    # Build the candidate list once instead of on every iteration, and drop
    # missing cells: they are not text and cannot be compared as strings.
    choices = target_df[column_name].dropna().tolist()
    results = []
    for source_value in source_df[column_name]:
        best_match = None
        # A missing source value or an empty candidate list can never match,
        # so the matcher is not called for them.
        if choices and not pd.isna(source_value):
            best_match = process.extractOne(
                source_value,
                choices,
                scorer=fuzz.token_sort_ratio,
            )
        if best_match and best_match[1] >= threshold:
            results.append({
                'source': source_value,
                'match': best_match[0],
                'score': best_match[1],
            })
        else:
            results.append({
                'source': source_value,
                'match': None,
                'score': 0,
            })
    # Passing the column list keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=columns)
# Usage: load both tables, match them on the company name, save the result.
source = pd.read_csv('source.csv')
target = pd.read_csv('target.csv')
result = batch_fuzzy_match(
    source_df=source,
    target_df=target,
    column_name='company_name',
)
result.to_csv('match_results.csv')

文件批量匹配(按内容/名称)

Shell脚本:批量查找包含特定内容的文件

#!/bin/bash
# List every file under MATCH_DIR whose content matches PATTERN (here: log
# files containing "ERROR") and report how many were found.
MATCH_DIR="/var/log/app"
PATTERN="ERROR"
OUTPUT_FILE="error_files.txt"
# Search recursively (-r) and record only the names of matching files (-l).
# "--" ends option parsing, so a PATTERN that starts with "-" is still
# treated as the search expression rather than as a grep option.
grep -rl -- "$PATTERN" "$MATCH_DIR" > "$OUTPUT_FILE"
# Report the number of matching files (OUTPUT_FILE holds one path per line).
echo "共找到 $(wc -l < "$OUTPUT_FILE") 个匹配文件"

图像/文件去重匹配

Python:批量匹配相似图片

import os
from PIL import Image
import imagehash
def batch_image_match(directory, threshold=10):
    """Find pairs of similar images among the files of a directory.

    ``imagehash.phash`` is computed for every entry directly inside
    ``directory`` whose name ends in ``.png``, ``.jpg`` or ``.jpeg``
    (compared case-insensitively). Every pair of hashes is then compared.

    Args:
        directory: Path of the directory holding the images.
        threshold: A pair is reported when its hash difference is strictly
            below this value; a smaller difference means more similar.

    Returns:
        A list of ``(first_name, second_name, difference)`` tuples.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg')

    # Step 1: hash every image file.
    fingerprints = {}
    for entry in os.listdir(directory):
        if not entry.lower().endswith(image_suffixes):
            continue
        with Image.open(os.path.join(directory, entry)) as picture:
            fingerprints[entry] = imagehash.phash(picture)

    # Step 2: compare each image with every image that comes after it.
    names = list(fingerprints)
    similar = []
    for position, first in enumerate(names):
        for second in names[position + 1:]:
            distance = fingerprints[first] - fingerprints[second]
            if distance < threshold:
                similar.append((first, second, distance))
    return similar
# Usage: report every pair in ./images whose hash difference is below 15.
result = batch_image_match(directory='./images', threshold=15)
for img1, img2, diff in result:
    print(f"相似图片:{img1} 和 {img2},差异值:{diff}")

批量匹配任意模式(通用框架)

可配置的批量匹配脚本

import re
from typing import List, Dict, Any
class BatchMatcher:
    """Apply a fixed list of regular expressions to text and to files.

    The patterns are compiled once in the constructor. Results are keyed by
    ``pattern_<index>``, where the index is the position of the pattern in
    the list given to the constructor.
    """

    def __init__(self, patterns: List[str], case_sensitive=False):
        """Compile ``patterns``; case is ignored unless ``case_sensitive``."""
        flags = 0 if case_sensitive else re.IGNORECASE
        self.patterns = [
            re.compile(expression, flags) for expression in patterns
        ]

    def match_text(self, text: str) -> Dict[str, List[str]]:
        """Run every pattern over ``text``.

        Returns:
            A dict mapping ``pattern_<index>`` to the ``findall`` result of
            that pattern. Patterns without any hit are left out.
        """
        labelled = (
            (f'pattern_{index}', regex.findall(text))
            for index, regex in enumerate(self.patterns)
        )
        return {label: hits for label, hits in labelled if hits}

    def batch_match_files(self, file_list: List[str]) -> Dict[str, Dict]:
        """Run every pattern over the content of each file in ``file_list``.

        Each file is read as UTF-8. A file that cannot be processed is
        reported with ``print`` and skipped; the remaining files are still
        handled.

        Returns:
            A dict mapping each file path that produced at least one hit to
            the ``match_text`` result for that file.
        """
        per_file = {}
        for filepath in file_list:
            try:
                with open(filepath, 'r', encoding='utf-8') as handle:
                    text = handle.read()
                hits = self.match_text(text)
                if hits:
                    per_file[filepath] = hits
            except Exception as e:
                # Best effort: one unreadable file must not stop the batch.
                print(f"Error processing {filepath}: {e}")
        return per_file
# Usage example
# Matching is case-sensitive here: the person-name pattern relies on
# capitalised words, and the matcher's default case-insensitive mode would
# make it match any two adjacent words.
matcher = BatchMatcher([
    r'\d{3}-\d{2}-\d{4}',   # SSN format
    r'\b[A-Z][a-z]+ [A-Z][a-z]+\b',  # Person name (two capitalised words)
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),])+'  # URL
], case_sensitive=True)
files = ['doc1.txt', 'doc2.txt', 'doc3.txt']
results = matcher.batch_match_files(files)
# Print the hits grouped by file, then by pattern label.
for file, matches in results.items():
    print(f"\n在 {file} 中找到:")
    for pattern, matched_items in matches.items():
        print(f"  {pattern}: {matched_items}")

实用建议

  1. 性能优化:处理大量数据时,使用生成器而不是一次性加载所有数据
  2. 错误处理:添加异常捕获和日志记录
  3. 并行处理:对大量文件可使用多线程/多进程
  4. 交互界面:简单场景可用 argparse 添加命令行参数

你可以告诉我具体的匹配场景(如匹配什么类型的数据、数据量多大、输出要求等),我可以帮你定制更精准的脚本。

抱歉,评论功能暂时关闭!