本文目录导读:

我来详细介绍如何使用Python解析Word文档,主要使用 python-docx 库,它支持 .docx 格式。
安装所需库
pip install python-docx
基础用法:读取Word文档
1 读取段落文本
from docx import Document
# Open the Word document.
doc = Document('example.docx')
# Print every body paragraph, numbered from 1.
for number, paragraph in enumerate(doc.paragraphs, start=1):
    print(f"段落 {number}: {paragraph.text}")
2 读取表格数据
from docx import Document
doc = Document('example.docx')
# Walk every table in the document body, numbering tables and rows from 1.
for table_number, table in enumerate(doc.tables, start=1):
    print(f"\n=== 表格 {table_number} ===")
    # Print each row as a list of its cell texts.
    for row_number, row in enumerate(table.rows, start=1):
        cells = [cell.text for cell in row.cells]
        print(f"行 {row_number}: {cells}")
完整解析案例
1 综合解析Word文档
from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
import json
class WordParser:
    """Collect metadata, paragraphs, tables and images from a .docx file.

    The parsed data is accumulated in ``self.result``, a dict with the keys
    ``'metadata'``, ``'paragraphs'``, ``'tables'`` and ``'images'``.  Every
    ``parse_*`` method rebuilds its own section from scratch, so calling a
    method (or ``parse_all``) more than once does not duplicate entries.
    """

    def __init__(self, file_path):
        """Open the document at ``file_path`` and prepare an empty result."""
        self.doc = Document(file_path)
        self.result = {
            'metadata': {},
            'paragraphs': [],
            'tables': [],
            'images': [],
        }

    def parse_metadata(self):
        """Store the document's core properties under ``result['metadata']``."""
        props = self.doc.core_properties
        self.result['metadata'] = {
            'title': props.title,
            'author': props.author,
            # Timestamps are stringified so the result stays JSON-friendly.
            'created': str(props.created),
            'modified': str(props.modified),
            'last_modified_by': props.last_modified_by,
        }

    def parse_paragraphs(self):
        """Store every non-blank body paragraph under ``result['paragraphs']``.

        Each entry holds the paragraph text, its alignment/style, and one
        record per run describing that run's character formatting.
        """
        parsed = []
        for para in self.doc.paragraphs:
            # Skip paragraphs that contain only whitespace.
            if not para.text.strip():
                continue
            parsed.append({
                'text': para.text,
                'format': {
                    'alignment': str(para.alignment),
                    'style': para.style.name if para.style else None,
                },
                'runs': [self._describe_run(run) for run in para.runs],
            })
        self.result['paragraphs'] = parsed

    @staticmethod
    def _describe_run(run):
        """Return a dict describing the text and formatting of one run."""
        font = run.font
        color = font.color
        rgb = color.rgb if color else None
        return {
            'text': run.text,
            'bold': run.bold,
            'italic': run.italic,
            'font_size': str(font.size) if font.size else None,
            'font_name': font.name,
            'color': str(rgb) if rgb else None,
        }

    def parse_tables(self):
        """Store every body table under ``result['tables']``.

        Each entry records the row/column counts and, per cell, the full
        cell text plus the list of its non-blank paragraph texts.
        """
        parsed = []
        for table in self.doc.tables:
            rows = [
                [
                    {
                        'text': cell.text,
                        'paragraphs': [
                            p.text for p in cell.paragraphs if p.text.strip()
                        ],
                    }
                    for cell in row.cells
                ]
                for row in table.rows
            ]
            parsed.append({
                'rows': len(table.rows),
                'columns': len(table.columns),
                'data': rows,
            })
        self.result['tables'] = parsed

    def parse_images(self):
        """Store embedded images of the main document part under ``result['images']``.

        Every relationship whose type mentions ``image`` is inspected.
        Externally linked images are skipped: they have no target part in
        the package, so there is no blob to read.  ``'blob'`` holds the raw
        image bytes.
        """
        images = []
        for rel in self.doc.part.rels.values():
            if "image" not in rel.reltype:
                continue
            # Asking an external relationship for its target part raises,
            # so linked (non-embedded) pictures are ignored.
            if getattr(rel, 'is_external', False):
                continue
            image = rel.target_part
            images.append({
                'file_name': image.partname,
                'content_type': image.content_type,
                'blob': image.blob,
            })
        self.result['images'] = images

    def parse_all(self):
        """Run every parser and return the populated ``result`` dict."""
        self.parse_metadata()
        self.parse_paragraphs()
        self.parse_tables()
        self.parse_images()
        return self.result

    def export_to_json(self, output_path='output.json'):
        """Write ``self.result`` to ``output_path`` as UTF-8 JSON.

        Image blobs are ``bytes``, which JSON cannot represent, so they are
        written as base64 text.  ``self.result`` itself is left untouched.
        """
        # Serialise first so a failure does not leave a half-written file.
        payload = json.dumps(
            self.result,
            ensure_ascii=False,
            indent=2,
            default=self._json_default,
        )
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(payload)

    @staticmethod
    def _json_default(value):
        """Encode binary values as base64 text; reject anything else."""
        import base64

        if isinstance(value, (bytes, bytearray)):
            return base64.b64encode(value).decode('ascii')
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )
# Usage example: parse everything, print a short summary, then export.
parser = WordParser('document.docx')
result = parser.parse_all()
title = result['metadata'].get('title')
paragraph_count = len(result['paragraphs'])
table_count = len(result['tables'])
image_count = len(result['images'])
print(f"文档标题: {title}")
print(f"段落数量: {paragraph_count}")
print(f"表格数量: {table_count}")
print(f"图片数量: {image_count}")
# Export the parsed result to JSON.
parser.export_to_json('document_parsed.json')
2 按样式提取内容
from docx import Document
def extract_by_style(doc_path, style_name):
    """Return the text of each body paragraph whose style name equals ``style_name``.

    Paragraphs are returned in document order; the comparison is an exact,
    case-sensitive match on the style's name.
    """
    document = Document(doc_path)
    return [
        paragraph.text
        for paragraph in document.paragraphs
        if paragraph.style and paragraph.style.name == style_name
    ]
def extract_headings(doc_path):
    """Return all heading paragraphs of the document at ``doc_path``.

    A paragraph counts as a heading when its style name starts with
    ``'Heading'``.  Each result is a dict with ``'level'`` (the style name
    itself) and ``'text'`` (the paragraph text), in document order.
    """
    document = Document(doc_path)
    return [
        {'level': paragraph.style.name, 'text': paragraph.text}
        for paragraph in document.paragraphs
        if paragraph.style and paragraph.style.name.startswith('Heading')
    ]
# Usage example: list every heading with its style name.
doc_path = 'document.docx'
headings = extract_headings(doc_path)
for heading in headings:
    print(f"{heading['level']}: {heading['text']}")
# Pull out the paragraphs that use one specific style.
normal_texts = extract_by_style(doc_path, 'Normal')
3 处理带格式的文本
from docx import Document
def extract_formatted_text(doc_path):
    """Return the non-blank body paragraphs together with per-run formatting.

    Each entry is a dict with ``'text'`` (the paragraph text) and
    ``'formatted_parts'``, a list with one dict per run that contains
    non-whitespace text.  A run dict carries the run's text, bold, italic
    and underline settings, and its font size and font name exactly as the
    run reports them.
    """
    document = Document(doc_path)
    collected = []
    for paragraph in document.paragraphs:
        # Whitespace-only paragraphs carry nothing worth reporting.
        if paragraph.text.strip():
            parts = [
                {
                    'text': run.text,
                    'bold': run.bold,
                    'italic': run.italic,
                    'underline': run.underline,
                    'font_size': run.font.size,
                    'font_name': run.font.name,
                }
                for run in paragraph.runs
                if run.text.strip()
            ]
            collected.append({
                'text': paragraph.text,
                'formatted_parts': parts,
            })
    return collected
# Usage example: preview the first five paragraphs.
content = extract_formatted_text('document.docx')
for para in content[:5]:
    preview = para['text'][:50]
    print(f"段落文本: {preview}...")
    # Only the first two runs of each paragraph are checked for bold.
    leading_parts = para['formatted_parts'][:2]
    for part in leading_parts:
        if not part['bold']:
            continue
        print(f" 加粗文本: {part['text']}")
高级应用场景
1 合并多个Word文档
from docx import Document
from docx.oxml import parse_xml
import os
def merge_word_documents(file_paths, output_path):
    """Append the text content of several .docx files to the first one.

    The first file is opened as the base document.  For every further file,
    its body paragraphs are appended as plain text (keeping the paragraph
    style reference), followed by its tables, copied cell text by cell text.
    Run-level formatting and the original interleaving of paragraphs and
    tables are not preserved.

    Args:
        file_paths: Sequence of .docx paths; the first one is the base.
        output_path: Where the merged document is saved.

    Returns:
        None.  Nothing is written when ``file_paths`` is empty.
    """
    if not file_paths:
        return
    # The first document is the base that everything else is appended to.
    merged_doc = Document(file_paths[0])
    for file_path in file_paths[1:]:
        doc = Document(file_path)
        # Append paragraphs.
        # NOTE(review): para.style belongs to the source document; confirm
        # that a style with the same id exists in the base document.
        for para in doc.paragraphs:
            merged_doc.add_paragraph(para.text, style=para.style)
        # Append tables.
        for table in doc.tables:
            new_table = merged_doc.add_table(
                rows=len(table.rows),
                cols=len(table.columns),
            )
            for source_row, target_row in zip(table.rows, new_table.rows):
                # Resolve the target row's cells once per row instead of
                # once per cell; the cells sequence is rebuilt on each access.
                target_cells = target_row.cells
                for col_idx, cell in enumerate(source_row.cells):
                    target_cells[col_idx].text = cell.text
    merged_doc.save(output_path)
    print(f"合并完成,保存到: {output_path}")
# Usage example: merge three documents into one file.
files = ['doc1.docx', 'doc2.docx', 'doc3.docx']
merge_word_documents(file_paths=files, output_path='merged.docx')
2 查找替换功能
from docx import Document
def _replace_in_container(container, search_text, replace_text, seen_cells):
    """Replace text in the runs of ``container`` and of its tables' cells.

    ``container`` is anything exposing ``paragraphs`` and ``tables`` (the
    document body or a table cell).  ``seen_cells`` collects the underlying
    cell elements already processed: a merged cell is reported once per grid
    position it covers, and must only be rewritten once.
    """
    for para in container.paragraphs:
        for run in para.runs:
            if search_text in run.text:
                run.text = run.text.replace(search_text, replace_text)
    for table in container.tables:
        for row in table.rows:
            for cell in row.cells:
                if cell._tc in seen_cells:
                    continue
                seen_cells.add(cell._tc)
                # Recurse so paragraphs of nested tables are covered too.
                _replace_in_container(
                    cell, search_text, replace_text, seen_cells
                )


def find_and_replace(doc_path, search_text, replace_text, output_path=None):
    """Replace ``search_text`` with ``replace_text`` throughout a .docx file.

    Body paragraphs and table cells (including nested tables) are processed.
    Matching is done run by run, so text that Word has split across several
    runs is not matched.

    Args:
        doc_path: Path of the document to read.
        search_text: Non-empty text to look for.
        replace_text: Text to put in its place.
        output_path: Where to save; defaults to overwriting ``doc_path``.

    Raises:
        ValueError: If ``search_text`` is empty.  An empty needle would
            insert ``replace_text`` between every character, and the
            default ``output_path`` would then overwrite the source file.
    """
    if not search_text:
        raise ValueError("search_text must be a non-empty string")
    doc = Document(doc_path)
    _replace_in_container(doc, search_text, replace_text, set())
    # Save, overwriting the source file when no output path was given.
    output_path = output_path or doc_path
    doc.save(output_path)
    print(f"替换完成,保存到: {output_path}")
# Usage example: write the updated copy to a new file.
find_and_replace(
    'document.docx',
    '旧文本',
    '新文本',
    output_path='document_updated.docx',
)
注意事项
- 格式兼容性:python-docx 主要支持 .docx 格式,不支持老旧的 .doc 格式
- 内存管理:处理大文件时注意内存使用
- 编码问题:读写文本文件(如导出 JSON)时需要正确处理 UTF-8 编码
- 复杂格式:某些复杂的Word格式(如宏、嵌入对象)可能无法完全解析
替代方案
如果需要处理 .doc 格式或更复杂的Word文档,可以考虑:
- win32com:Windows平台,通过 COM 调用本机安装的 Word,支持完整Word功能
- pywin32:提供 win32com 模块的安装包(Windows平台COM接口)
- python-docx2txt(安装名为 docx2txt):简单的文本提取,仅支持 .docx
- Apache POI (通过subprocess):跨平台复杂处理
这些方法可以满足大部分Word文档解析需求,根据具体场景选择合适的方法。