Python案例怎么拆分PDF文档?

wen python案例 8

本文目录导读:

Python案例怎么拆分PDF文档?

  1. 方法一:使用PyPDF2(最常用)
  2. 方法二:使用pdfplumber(更好的页面处理)
  3. 方法三:高级拆分(按指定范围)
  4. 方法四:批量处理并添加水印
  5. 安装依赖
  6. 完整示例:带GUI界面的拆分工具
  7. 实用技巧

我来介绍几种Python拆分PDF文档的方法,从简单到复杂:

方法一:使用PyPDF2(最常用)

import PyPDF2
def split_pdf_pypdf2(pdf_path, output_dir):
    """Split a PDF into one single-page file per page.

    Args:
        pdf_path: Path of the source PDF.
        output_dir: Directory that receives the ``page_<n>.pdf`` files. It is
            created when it does not exist yet.
    """
    import os  # local import: this snippet only imports PyPDF2 at the top

    # Without this, opening the first output file fails for a new directory.
    os.makedirs(output_dir, exist_ok=True)
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Output files are numbered from 1, matching what PDF viewers display.
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            pdf_writer = PyPDF2.PdfWriter()
            pdf_writer.add_page(page)
            output_file = f"{output_dir}/page_{page_number}.pdf"
            with open(output_file, 'wb') as output:
                pdf_writer.write(output)
            print(f"已创建: {output_file}")
# Usage example: split input.pdf into single-page files under output_folder
split_pdf_pypdf2('input.pdf', 'output_folder')

方法二:使用pdfplumber(更好的页面处理)

import pdfplumber
from reportlab.pdfgen import canvas
import io
def split_pdf_pdfplumber(pdf_path, output_dir):
    """Write each page of a PDF to its own file as a text-only copy.

    pdfplumber can only read PDFs, so each output page is redrawn with
    reportlab: a canvas of the source page's size onto which the words
    reported by pdfplumber are placed at their original positions. Images,
    vector graphics and the original fonts are not carried over.

    Args:
        pdf_path: Path of the source PDF.
        output_dir: Directory that receives the ``page_<n>.pdf`` files. It is
            created when it does not exist yet.
    """
    import os  # local import: this snippet does not import os at the top

    os.makedirs(output_dir, exist_ok=True)
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            page_width = float(page.width)
            page_height = float(page.height)
            output_file = f"{output_dir}/page_{page_number}.pdf"
            packet = io.BytesIO()
            c = canvas.Canvas(packet, pagesize=(page_width, page_height))
            # Without drawing anything here the saved page would be blank.
            # NOTE(review): the canvas default font presumably lacks CJK
            # glyphs; register a suitable font if such text must be readable.
            for word in page.extract_words():
                # pdfplumber measures "bottom" downwards from the top edge,
                # while reportlab's origin is the bottom-left corner.
                c.drawString(
                    float(word["x0"]),
                    page_height - float(word["bottom"]),
                    word["text"],
                )
            c.showPage()
            c.save()
            with open(output_file, 'wb') as f:
                f.write(packet.getvalue())
            print(f"已提取第 {page_number} 页")
# Usage example: one output file per page of document.pdf, under output_pages
split_pdf_pdfplumber('document.pdf', 'output_pages')

方法三:高级拆分(按指定范围)

import PyPDF2
from pathlib import Path
class PDFSplitter:
    """Split one PDF into several files by page ranges or fixed-size chunks.

    Output files are written to a ``split_output`` directory located next to
    the source PDF.
    """

    def __init__(self, pdf_path):
        """Open *pdf_path* with PyPDF2 and keep the reader for later splits."""
        self.pdf_path = pdf_path
        self.pdf = PyPDF2.PdfReader(pdf_path)

    def split_by_ranges(self, ranges, output_prefix="split"):
        """Write one PDF per page range.

        Args:
            ranges: Iterable of ``(start, end)`` tuples using 1-based,
                inclusive page numbers, e.g. ``[(1, 3), (4, 6), (7, 10)]``
                produces three files.
            output_prefix: File-name prefix; files are named
                ``<output_prefix>_<i>.pdf`` with ``i`` counting from 1.

        Raises:
            IndexError: If a non-empty range reaches outside the document.
                All ranges are checked before anything is written, so a bad
                range never leaves a partial set of output files behind.
        """
        ranges = list(ranges)  # iterated twice: validation, then writing
        total_pages = len(self.pdf.pages)
        for start, end in ranges:
            # A start below 1 would become a negative index and silently pick
            # pages from the end of the document. Ranges with start > end
            # select no pages and are left alone.
            if start <= end and (start < 1 or end > total_pages):
                raise IndexError(
                    f"page range ({start}, {end}) is outside 1..{total_pages}"
                )

        output_dir = Path(self.pdf_path).parent / "split_output"
        output_dir.mkdir(exist_ok=True)
        for i, (start, end) in enumerate(ranges, 1):
            pdf_writer = PyPDF2.PdfWriter()
            # Page numbers are 1-based for callers, 0-based in the PyPDF2 API.
            for page_num in range(start - 1, end):
                pdf_writer.add_page(self.pdf.pages[page_num])
            output_file = output_dir / f"{output_prefix}_{i}.pdf"
            with open(output_file, 'wb') as f:
                pdf_writer.write(f)
            print(f"已创建: {output_file}")

    def split_every_n_pages(self, n):
        """Write one PDF for every *n* consecutive pages.

        The last file holds the remaining pages when the page count is not a
        multiple of *n*. Files are named ``every_<n>_pages_<i>.pdf``.

        Raises:
            ValueError: If *n* is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be a positive integer, got {n!r}")
        total_pages = len(self.pdf.pages)
        ranges = [
            (first + 1, min(first + n, total_pages))
            for first in range(0, total_pages, n)
        ]
        self.split_by_ranges(ranges, f"every_{n}_pages")
# Usage example
splitter = PDFSplitter("report.pdf")
# Split by explicit page ranges (1-based, inclusive)
splitter.split_by_ranges([(1,3), (4,6), (7,10)])
# Write one file for every 5 pages
splitter.split_every_n_pages(5)

方法四:批量处理并添加水印

import PyPDF2
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import io
def split_with_watermark(pdf_path, watermark_text="Confidential"):
    """Split a PDF into single-page files, stamping each page with a watermark.

    Every page is written to ``watermarked_page_<n>.pdf`` (numbered from 1)
    in the current working directory.

    Args:
        pdf_path: Path of the source PDF.
        watermark_text: Text drawn diagonally across each page in
            semi-transparent grey, 30 pt Helvetica.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Size the overlay to this page rather than to a fixed letter
            # sheet, so the mark is not misplaced on other page sizes.
            page_width = float(page.mediabox.width)
            page_height = float(page.mediabox.height)

            # Build the watermark as a one-page in-memory PDF.
            packet = io.BytesIO()
            c = canvas.Canvas(packet, pagesize=(page_width, page_height))
            c.setFont("Helvetica", 30)
            c.setFillColorRGB(0.5, 0.5, 0.5, 0.3)  # semi-transparent grey
            c.saveState()
            # Anchor at (300, 400) on a 612x792 letter page and at the same
            # relative position on any other page size.
            c.translate(page_width * 300 / 612, page_height * 400 / 792)
            c.rotate(45)
            c.drawString(0, 0, watermark_text)
            c.restoreState()
            c.save()
            packet.seek(0)
            watermark = PyPDF2.PdfReader(packet)

            # Overlay the watermark on the page and write it out on its own.
            page.merge_page(watermark.pages[0])
            pdf_writer = PyPDF2.PdfWriter()
            pdf_writer.add_page(page)
            output_file = f"watermarked_page_{page_number}.pdf"
            with open(output_file, 'wb') as output:
                pdf_writer.write(output)
# Usage example
# NOTE(review): the watermark is drawn with the "Helvetica" font; presumably
# that font has no CJK glyphs, so confirm this text renders as intended.
split_with_watermark("document.pdf", "内部资料")

安装依赖

# Install the required libraries
pip install PyPDF2 pdfplumber reportlab
# Extra needed for encrypted PDFs (quoted so that shells such as zsh do not
# treat the square brackets as a glob pattern)
pip install "PyPDF2[crypto]"

完整示例:带GUI界面的拆分工具

import tkinter as tk
from tkinter import filedialog, messagebox
import PyPDF2
import os
class PDFSplitApp:
    """Small Tkinter front end for splitting a PDF with PyPDF2.

    The user picks a PDF, optionally an output directory, and then either
    one file per page or one file per fixed number of pages.
    """

    def __init__(self, root):
        """Configure the main window and build its widgets.

        Args:
            root: The Tk root window that hosts the application.
        """
        self.root = root
        self.root.title("PDF拆分工具")
        # Slightly taller than before to leave room for the page-count entry.
        self.root.geometry("400x360")
        self.pdf_path = None
        # None means "not chosen": files are then written next to the PDF.
        self.output_dir = None
        self.create_widgets()

    def create_widgets(self):
        """Create and lay out all widgets, top to bottom."""
        # File selection button
        tk.Button(self.root, text="选择PDF文件",
                  command=self.select_file).pack(pady=10)
        # Name of the currently selected file
        self.file_label = tk.Label(self.root, text="未选择文件")
        self.file_label.pack()
        # Split mode: "all" = one file per page, "custom" = N pages per file
        tk.Label(self.root, text="拆分方式:").pack()
        self.split_method = tk.StringVar(value="all")
        tk.Radiobutton(self.root, text="每页单独拆分",
                       variable=self.split_method,
                       value="all").pack()
        tk.Radiobutton(self.root, text="每X页拆分",
                       variable=self.split_method,
                       value="custom").pack()
        # Pages per file for the "custom" mode. The entry has to be packed,
        # otherwise it is never shown and its value cannot be edited.
        self.page_entry = tk.Entry(self.root, width=10)
        self.page_entry.insert(0, "5")
        self.page_entry.pack()
        # Start button
        tk.Button(self.root, text="开始拆分",
                  command=self.split_pdf).pack(pady=20)
        # Output directory display and chooser
        self.output_label = tk.Label(self.root, text="输出目录: 当前目录")
        self.output_label.pack()
        tk.Button(self.root, text="选择输出目录",
                  command=self.select_output_dir).pack()

    def select_file(self):
        """Ask for the PDF to split and remember the choice."""
        chosen = filedialog.askopenfilename(
            filetypes=[("PDF文件", "*.pdf")]
        )
        # A cancelled dialog yields an empty value; keep the previous
        # selection so the label and the stored path never disagree.
        if not chosen:
            return
        self.pdf_path = chosen
        self.file_label.config(text=os.path.basename(chosen))
        if not self.output_dir:
            # Show where files will really go when no directory was chosen.
            self.output_label.config(
                text=f"输出目录: {os.path.dirname(chosen)}"
            )

    def select_output_dir(self):
        """Ask for the directory that receives the output files."""
        dir_path = filedialog.askdirectory()
        if dir_path:
            self.output_dir = dir_path
            self.output_label.config(text=f"输出目录: {dir_path}")

    @staticmethod
    def _write_pages(pdf_reader, first, last, output_file):
        """Write pages ``first`` (inclusive) to ``last`` (exclusive), 0-based."""
        pdf_writer = PyPDF2.PdfWriter()
        for page_num in range(first, last):
            pdf_writer.add_page(pdf_reader.pages[page_num])
        with open(output_file, 'wb') as output:
            pdf_writer.write(output)

    def split_pdf(self):
        """Split the selected PDF according to the chosen mode.

        Shows an error dialog when no file is selected, when the pages-per-file
        value is not a positive integer, or when reading/writing fails; shows
        an info dialog with the number of files written on success.
        """
        if not self.pdf_path:
            messagebox.showerror("错误", "请选择PDF文件")
            return

        per_page = self.split_method.get() == "all"
        pages_per_file = 1
        if not per_page:
            try:
                pages_per_file = int(self.page_entry.get())
            except ValueError:
                pages_per_file = 0
            if pages_per_file < 1:
                messagebox.showerror("错误", "每个文件的页数必须是正整数")
                return

        output_dir = self.output_dir or os.path.dirname(self.pdf_path)
        # Top-level GUI boundary: any failure is reported in a dialog rather
        # than being allowed to escape into the Tk event loop.
        try:
            files_created = 0
            with open(self.pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                total_pages = len(pdf_reader.pages)
                for first in range(0, total_pages, pages_per_file):
                    last = min(first + pages_per_file, total_pages)
                    if per_page:
                        file_name = f"page_{first + 1}.pdf"
                    else:
                        part = first // pages_per_file + 1
                        file_name = f"part_{part}_pages_{first + 1}-{last}.pdf"
                    self._write_pages(
                        pdf_reader, first, last,
                        os.path.join(output_dir, file_name),
                    )
                    files_created += 1
            # Report the files actually written, not the page count.
            messagebox.showinfo("成功", f"PDF拆分完成!共创建了{files_created}个文件")
        except Exception as e:
            messagebox.showerror("错误", f"拆分失败: {str(e)}")
# Launch the GUI only when this file is run as a script, not when imported
if __name__ == "__main__":
    root = tk.Tk()
    app = PDFSplitApp(root)
    root.mainloop()

实用技巧

  1. 处理加密PDF

    # Unlock the document only when it is actually encrypted, and fail
    # loudly on a wrong password (decrypt() returns a falsy value then)
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    if pdf_reader.is_encrypted and not pdf_reader.decrypt('password'):
        raise ValueError("could not decrypt the PDF: wrong password")
  2. 提取特定页面

    def extract_pages(pdf_path, pages_to_extract, output_path='extracted_pages.pdf'):
        """Copy the listed pages of a PDF into a single new PDF.

        Args:
            pdf_path: Path of the source PDF.
            pages_to_extract: Iterable of 1-based page numbers, copied in the
                order given.
            output_path: Where the result is written. Defaults to
                ``extracted_pages.pdf`` in the current working directory.

        Raises:
            IndexError: If a page number is outside ``1..page count``. The
                check runs before the output file is opened.
        """
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            total_pages = len(pdf_reader.pages)
            pdf_writer = PyPDF2.PdfWriter()
            for page_num in pages_to_extract:
                # Page 0 or a negative number would become a negative index
                # and silently select a page from the end of the document.
                if not 1 <= page_num <= total_pages:
                    raise IndexError(
                        f"page {page_num} is outside 1..{total_pages}"
                    )
                pdf_writer.add_page(pdf_reader.pages[page_num - 1])
            with open(output_path, 'wb') as output:
                pdf_writer.write(output)
  3. 合并PDF后再拆分

    def merge_then_split(pdf_list, output_dir):
        """Merge several PDFs, then split the result into single pages.

        The merged document is written to ``merged.pdf`` in the current
        working directory and then passed to ``split_pdf_pypdf2``.

        Args:
            pdf_list: Paths of the PDFs to merge, in order.
            output_dir: Directory handed to ``split_pdf_pypdf2`` for the
                per-page files.
        """
        merger = PyPDF2.PdfMerger()
        try:
            for pdf in pdf_list:
                merger.append(pdf)
            merger.write("merged.pdf")
        finally:
            # Close the merger even when appending or writing fails.
            merger.close()
        # Split the merged document page by page.
        split_pdf_pypdf2("merged.pdf", output_dir)

选择哪种方法取决于你的具体需求:

  • 简单拆分:使用PyPDF2
  • 需要处理加密PDF:使用PyPDF2的decrypt功能
  • 批量处理:使用面向对象的方式封装
  • 可视化操作:使用GUI版本

抱歉,评论功能暂时关闭!