本文目录导读:

我来介绍几种Python拆分PDF文档的方法,从简单到复杂:
使用PyPDF2(最常用)
import PyPDF2
def split_pdf_pypdf2(pdf_path, output_dir):
    """Split a PDF into one single-page file per page.

    Every page is written to ``<output_dir>/page_<N>.pdf`` (``N`` starts at 1)
    and the path of each file is printed as soon as it has been written.

    Args:
        pdf_path: Path of the PDF to split.
        output_dir: Directory that receives the single-page files. It is
            created, including missing parents, when it does not exist.

    Returns:
        list[str]: Paths of the files that were written, in page order.
    """
    from pathlib import Path

    target_dir = Path(output_dir)
    # open(..., 'wb') does not create directories, so make sure it exists.
    target_dir.mkdir(parents=True, exist_ok=True)

    created = []
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # A fresh writer per page keeps every output file single-page.
            pdf_writer = PyPDF2.PdfWriter()
            pdf_writer.add_page(page)
            output_file = target_dir / f"page_{page_number}.pdf"
            with open(output_file, 'wb') as output:
                pdf_writer.write(output)
            print(f"已创建: {output_file}")
            created.append(str(output_file))
    return created
# Usage example: one output file per page of input.pdf
split_pdf_pypdf2(pdf_path='input.pdf', output_dir='output_folder')
使用pdfplumber(更好的页面处理)
import pdfplumber
from reportlab.pdfgen import canvas
import io
def split_pdf_pdfplumber(pdf_path, output_dir, resolution=150):
    """Split a PDF into single-page files rebuilt from page renderings.

    Each page is rendered to an image with pdfplumber and that image is drawn
    onto a new reportlab canvas that has the size of the source page. The
    result is written to ``<output_dir>/page_<N>.pdf`` (``N`` starts at 1).

    The output pages are raster images: text in them is no longer selectable
    or searchable. Use a PyPDF2-based split when the original page objects
    must be preserved.

    Args:
        pdf_path: Path of the PDF to split.
        output_dir: Directory that receives the single-page files. It is
            created, including missing parents, when it does not exist.
        resolution: Rendering resolution passed to ``page.to_image``.
    """
    from pathlib import Path

    from reportlab.lib.utils import ImageReader

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            output_file = target_dir / f"page_{page_number}.pdf"
            page_width = float(page.width)
            page_height = float(page.height)

            # Render the page in memory; quantize=False asks pdfplumber not
            # to reduce the colour palette of the rendering.
            rendered = io.BytesIO()
            page.to_image(resolution=resolution).save(
                rendered, format="PNG", quantize=False
            )
            rendered.seek(0)

            packet = io.BytesIO()
            c = canvas.Canvas(packet, pagesize=(page_width, page_height))
            # Without a drawing call the canvas would be saved as a blank
            # page, so paint the rendering across the whole page.
            c.drawImage(
                ImageReader(rendered), 0, 0,
                width=page_width, height=page_height,
            )
            c.showPage()
            c.save()

            with open(output_file, 'wb') as f:
                f.write(packet.getvalue())
            print(f"已提取第 {page_number} 页")
# 使用示例
split_pdf_pdfplumber('document.pdf', 'output_pages')
高级拆分(按指定范围)
import PyPDF2
from pathlib import Path
class PDFSplitter:
    """Split one PDF into several files by page ranges or fixed-size chunks.

    Output files are written to a ``split_output`` directory that is created
    next to the source PDF.
    """

    def __init__(self, pdf_path):
        """Open ``pdf_path`` with PyPDF2 and keep the reader for later splits."""
        self.pdf_path = pdf_path
        self.pdf = PyPDF2.PdfReader(pdf_path)

    @staticmethod
    def _validate_ranges(ranges, total_pages):
        """Return ``ranges`` as a list after checking every ``(start, end)`` pair.

        Raises:
            ValueError: ``start`` is below 1 or ``end`` is below ``start``.
            IndexError: ``end`` is beyond ``total_pages``.
        """
        checked = []
        for start, end in ranges:
            if start < 1 or end < start:
                raise ValueError(
                    f"invalid page range ({start}, {end}): pages are 1-based "
                    "and start must not exceed end"
                )
            if end > total_pages:
                raise IndexError(
                    f"page range ({start}, {end}) exceeds the document "
                    f"length of {total_pages} pages"
                )
            checked.append((start, end))
        return checked

    @staticmethod
    def _chunk_ranges(total_pages, n):
        """Return 1-based inclusive ``(start, end)`` ranges of at most ``n`` pages.

        Raises:
            ValueError: ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n}")
        return [
            (first, min(first + n - 1, total_pages))
            for first in range(1, total_pages + 1, n)
        ]

    def split_by_ranges(self, ranges, output_prefix="split"):
        """Write one PDF per page range.

        Args:
            ranges: Iterable of 1-based inclusive ``(start, end)`` pairs, e.g.
                ``[(1, 3), (4, 6), (7, 10)]`` produces three files.
            output_prefix: File name prefix; files are named
                ``<output_prefix>_<i>.pdf`` with ``i`` starting at 1.

        Raises:
            ValueError: A range starts below page 1 or ends before it starts.
            IndexError: A range ends beyond the last page of the document.
        """
        # Validate everything first so a bad range cannot leave a partial
        # set of output files behind.
        checked = self._validate_ranges(ranges, len(self.pdf.pages))

        output_dir = Path(self.pdf_path).parent / "split_output"
        output_dir.mkdir(exist_ok=True)

        for i, (start, end) in enumerate(checked, 1):
            pdf_writer = PyPDF2.PdfWriter()
            # Page numbers are 1-based while the page list is 0-based.
            for page_index in range(start - 1, end):
                pdf_writer.add_page(self.pdf.pages[page_index])
            output_file = output_dir / f"{output_prefix}_{i}.pdf"
            with open(output_file, 'wb') as f:
                pdf_writer.write(f)
            print(f"已创建: {output_file}")

    def split_every_n_pages(self, n):
        """Write one PDF for every ``n`` consecutive pages.

        The last file holds the remaining pages when the page count is not a
        multiple of ``n``. Files are named ``every_<n>_pages_<i>.pdf``.

        Raises:
            ValueError: ``n`` is smaller than 1.
        """
        ranges = self._chunk_ranges(len(self.pdf.pages), n)
        self.split_by_ranges(ranges, f"every_{n}_pages")
# Usage example
splitter = PDFSplitter(pdf_path="report.pdf")

# Split by explicit 1-based page ranges
splitter.split_by_ranges(ranges=[(1, 3), (4, 6), (7, 10)])

# One output file for every five pages
splitter.split_every_n_pages(n=5)
批量处理并添加水印
import PyPDF2
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
import io
def _watermark_font_for(text):
    """Return the name of a reportlab font that can draw ``text``."""
    try:
        text.encode("latin-1")
    except UnicodeEncodeError:
        # Helvetica has no glyphs for CJK text. STSong-Light is a reportlab
        # CID font for Simplified Chinese and must be registered before use.
        from reportlab.pdfbase import pdfmetrics
        from reportlab.pdfbase.cidfonts import UnicodeCIDFont

        pdfmetrics.registerFont(UnicodeCIDFont("STSong-Light"))
        return "STSong-Light"
    return "Helvetica"


def _render_watermark(text, font_name, width, height):
    """Return an in-memory one-page PDF with ``text`` drawn diagonally."""
    packet = io.BytesIO()
    c = canvas.Canvas(packet, pagesize=(width, height))
    c.setFont(font_name, 30)
    c.setFillColorRGB(0.5, 0.5, 0.5, 0.3)  # semi-transparent grey
    c.saveState()
    # Rotate around the page centre so the text stays centred on any size.
    c.translate(width / 2, height / 2)
    c.rotate(45)
    c.drawCentredString(0, 0, text)
    c.restoreState()
    c.save()
    packet.seek(0)
    return packet


def split_with_watermark(pdf_path, watermark_text="Confidential", output_dir="."):
    """Split a PDF into single-page files and stamp each page with a watermark.

    The watermark is drawn at 45 degrees in semi-transparent grey, centred on
    a canvas that has the size of the page's media box, and merged onto the
    page before it is written to ``watermarked_page_<N>.pdf``.

    Args:
        pdf_path: Path of the PDF to split.
        watermark_text: Text of the watermark. Text outside Latin-1 is drawn
            with the ``STSong-Light`` CID font instead of Helvetica.
        output_dir: Directory that receives the files (default: the current
            directory). It is created when it does not exist.
    """
    from pathlib import Path

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    font_name = _watermark_font_for(watermark_text)

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            # Size the watermark to this page rather than assuming letter.
            width = float(page.mediabox.width)
            height = float(page.mediabox.height)
            watermark = PyPDF2.PdfReader(
                _render_watermark(watermark_text, font_name, width, height)
            )

            # Overlay the watermark on the page content.
            page.merge_page(watermark.pages[0])
            pdf_writer = PyPDF2.PdfWriter()
            pdf_writer.add_page(page)

            output_file = target_dir / f"watermarked_page_{page_number}.pdf"
            with open(output_file, 'wb') as output:
                pdf_writer.write(output)
# Usage example: stamp every page of document.pdf
split_with_watermark(pdf_path="document.pdf", watermark_text="内部资料")
安装依赖
# 安装必要的库
pip install PyPDF2 pdfplumber reportlab
# 如果需要处理加密PDF
pip install "PyPDF2[crypto]"
完整示例:带GUI界面的拆分工具
import tkinter as tk
from tkinter import filedialog, messagebox
import PyPDF2
import os
class PDFSplitApp:
    """Tkinter window that splits a PDF per page or every N pages.

    The user picks a PDF, optionally an output directory, and a split mode.
    Output files go to the chosen directory, or next to the source PDF when
    no directory was chosen.
    """

    def __init__(self, root):
        """Configure the window ``root`` and build its widgets."""
        self.root = root
        self.root.title("PDF拆分工具")
        # Slightly taller than 300 px to leave room for the page-count entry.
        self.root.geometry("400x360")
        self.pdf_path = None
        # None means "write next to the source PDF".
        self.output_dir = None
        self.create_widgets()

    def create_widgets(self):
        """Create and lay out all widgets, top to bottom."""
        # File picker
        tk.Button(self.root, text="选择PDF文件",
                  command=self.select_file).pack(pady=10)
        self.file_label = tk.Label(self.root, text="未选择文件")
        self.file_label.pack()

        # Split mode: "all" = one file per page, "custom" = every N pages
        tk.Label(self.root, text="拆分方式:").pack()
        self.split_method = tk.StringVar(value="all")
        tk.Radiobutton(self.root, text="每页单独拆分",
                       variable=self.split_method,
                       value="all").pack()
        tk.Radiobutton(self.root, text="每X页拆分",
                       variable=self.split_method,
                       value="custom").pack()

        # Pages per file for the "custom" mode. The entry has to be packed,
        # otherwise it is never shown and the value cannot be edited.
        self.page_entry = tk.Entry(self.root, width=10)
        self.page_entry.insert(0, "5")
        self.page_entry.pack()

        # Start button
        tk.Button(self.root, text="开始拆分",
                  command=self.split_pdf).pack(pady=20)

        # Output directory picker
        self.output_label = tk.Label(self.root, text="输出目录: 当前目录")
        self.output_label.pack()
        tk.Button(self.root, text="选择输出目录",
                  command=self.select_output_dir).pack()

    def select_file(self):
        """Ask for a PDF file; keep the previous choice if the dialog is cancelled."""
        chosen = filedialog.askopenfilename(
            filetypes=[("PDF文件", "*.pdf")]
        )
        if chosen:
            self.pdf_path = chosen
            self.file_label.config(text=os.path.basename(chosen))

    def select_output_dir(self):
        """Ask for the output directory and show it in the window."""
        dir_path = filedialog.askdirectory()
        if dir_path:
            self.output_dir = dir_path
            self.output_label.config(text=f"输出目录: {dir_path}")

    @staticmethod
    def _parse_pages_per_file(text):
        """Convert the entry text to a positive int or raise ``ValueError``."""
        try:
            pages = int(text.strip())
        except ValueError:
            raise ValueError("每个文件的页数必须是整数") from None
        if pages < 1:
            raise ValueError("每个文件的页数必须大于0")
        return pages

    @staticmethod
    def _page_chunks(total_pages, n):
        """Return 0-based half-open ``(start, end)`` spans of at most ``n`` pages."""
        return [
            (start, min(start + n, total_pages))
            for start in range(0, total_pages, n)
        ]

    def split_pdf(self):
        """Split the selected PDF according to the chosen mode.

        Shows an error dialog when no file is selected or when splitting
        fails, and a success dialog with the number of files written.
        """
        if not self.pdf_path:
            messagebox.showerror("错误", "请选择PDF文件")
            return

        try:
            per_page = self.split_method.get() == "all"
            n = 1 if per_page else self._parse_pages_per_file(self.page_entry.get())
            output_dir = self.output_dir or os.path.dirname(self.pdf_path)

            with open(self.pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                chunks = self._page_chunks(len(pdf_reader.pages), n)

                for index, (start, end) in enumerate(chunks, start=1):
                    pdf_writer = PyPDF2.PdfWriter()
                    for page_num in range(start, end):
                        pdf_writer.add_page(pdf_reader.pages[page_num])

                    if per_page:
                        file_name = f"page_{start + 1}.pdf"
                    else:
                        file_name = f"part_{index}_pages_{start + 1}-{end}.pdf"
                    output_file = os.path.join(output_dir, file_name)
                    with open(output_file, 'wb') as output:
                        pdf_writer.write(output)

            # Report the number of files written, not the number of pages.
            messagebox.showinfo("成功", f"PDF拆分完成!共创建了{len(chunks)}个文件")
        except Exception as e:
            # Top-level GUI boundary: surface any failure to the user.
            messagebox.showerror("错误", f"拆分失败: {str(e)}")
# Launch the GUI only when this file is executed as a script
if __name__ == "__main__":
    main_window = tk.Tk()
    application = PDFSplitApp(main_window)
    main_window.mainloop()
实用技巧
-
处理加密PDF:
# Opening a password-protected PDF
pdf_reader = PyPDF2.PdfReader(pdf_file)
if pdf_reader.is_encrypted:
    # decrypt() signals a wrong password through a falsy return value rather
    # than an exception, so the result has to be checked explicitly.
    if not pdf_reader.decrypt('password'):
        raise ValueError("could not decrypt the PDF: wrong password")
提取特定页面:
def extract_pages(pdf_path, pages_to_extract, output_path='extracted_pages.pdf'):
    """Copy selected pages of a PDF into a new PDF file.

    Args:
        pdf_path: Path of the source PDF.
        pages_to_extract: Iterable of 1-based page numbers. Pages are written
            in the given order; a number may appear more than once.
        output_path: Path of the PDF to write.

    Raises:
        ValueError: A page number is smaller than 1.
        IndexError: A page number is beyond the last page of the source.
    """
    page_numbers = list(pages_to_extract)
    # Numbers below 1 would become 0 or negative indices and silently pick
    # pages from the end of the document.
    invalid = [number for number in page_numbers if number < 1]
    if invalid:
        raise ValueError(f"page numbers are 1-based, got {invalid}")

    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        total_pages = len(pdf_reader.pages)
        beyond_end = [number for number in page_numbers if number > total_pages]
        if beyond_end:
            raise IndexError(
                f"pages {beyond_end} do not exist; the document has "
                f"{total_pages} pages"
            )

        pdf_writer = PyPDF2.PdfWriter()
        for number in page_numbers:
            pdf_writer.add_page(pdf_reader.pages[number - 1])
        with open(output_path, 'wb') as output:
            pdf_writer.write(output)
合并PDF后再拆分:
def merge_then_split(pdf_list, output_dir, merged_path="merged.pdf"):
    """Merge several PDFs into one file, then split that file page by page.

    Args:
        pdf_list: Iterable of the PDFs to merge, in order.
        output_dir: Directory that receives the single-page files.
        merged_path: Where the intermediate merged PDF is written. The file
            is kept after the split.
    """
    merger = PyPDF2.PdfMerger()
    try:
        for pdf in pdf_list:
            merger.append(pdf)
        merger.write(merged_path)
    finally:
        # Close the merger even when appending or writing fails.
        merger.close()

    # Split the merged document into one file per page.
    split_pdf_pypdf2(merged_path, output_dir)
选择哪种方法取决于你的具体需求:
- 简单拆分:使用PyPDF2
- 需要处理加密PDF:使用PyPDF2的decrypt功能
- 批量处理:使用面向对象的方式封装
- 可视化操作:使用GUI版本