本文目录导读:

我来详细介绍Python解析XML文件的几种常用方法,并提供具体案例。
使用xml.etree.ElementTree(推荐)
这是Python标准库中最常用的XML解析方法。
示例XML文件(books.xml):
<?xml version="1.0" encoding="UTF-8"?>
<library>
<book id="1" category="编程">
<title>Python编程从入门到实践</title>
<author>Eric Matthes</author>
<price>89.00</price>
<year>2020</year>
</book>
<book id="2" category="编程">
<title>流畅的Python</title>
<author>Luciano Ramalho</author>
<price>139.00</price>
<year>2021</year>
</book>
<book id="3" category="数据科学">
<title>利用Python进行数据分析</title>
<author>Wes McKinney</author>
<price>119.00</price>
<year>2019</year>
</book>
</library>
基础解析案例:
import xml.etree.ElementTree as ET
# 1. Parse the XML file and take the document's root element.
tree = ET.parse('books.xml')
root = tree.getroot()
print("根元素标签:", root.tag)
print("根元素属性:", root.attrib)

# 2. Walk every <book> element that is a direct child of the root.
for book in root.findall('book'):
    # Child elements carry the data as text ...
    title = book.find('title').text
    author = book.find('author').text
    price = book.find('price').text
    # ... while id/category are attributes on the <book> tag itself.
    book_id = book.get('id')
    category = book.get('category')
    print(f"ID: {book_id}, 类别: {category}")
    print(f" 书名: {title}")
    print(f" 作者: {author}")
    print(f" 价格: {price}")
    print("-" * 30)
高级解析操作:
import xml.etree.ElementTree as ET
tree = ET.parse('books.xml')
root = tree.getroot()

# 1. Look up specific elements.
# find() returns only the first matching child.
first_book = root.find('book')
print(f"第一本书: {first_book.find('title').text}")

# An attribute predicate selects every book in the programming category.
programming_books = root.findall(".//book[@category='编程']")
print("\n编程类书籍:")
for book in programming_books:
    print(f" - {book.find('title').text}")

# 2. Keep only the books priced above 100.
expensive_books = []
for book in root.findall('book'):
    # Element text is always a string; convert before comparing numerically.
    price = float(book.find('price').text)
    if price > 100:
        expensive_books.append({
            'title': book.find('title').text,
            'price': price
        })
print("\n价格超过100元的书籍:")
for book in expensive_books:
    print(f" - {book['title']}: ¥{book['price']:.2f}")

# 3. Path expressions.
# Collect the text of every <title> anywhere below the root.
all_titles = [elem.text for elem in root.findall('.//title')]
print(f"\n所有书籍标题: {all_titles}")

# Same idea for the authors, printed one per line.
all_authors = root.findall('.//author')
for author in all_authors:
    print(f"作者: {author.text}")
使用lxml库(功能更强大)
需要先安装:pip install lxml
from lxml import etree
# 1. Parse XML held in a string.
xml_string = """<?xml version="1.0" encoding="UTF-8"?>
<root>
<item id="1">
<name>苹果</name>
<price>5.0</price>
</item>
<item id="2">
<name>香蕉</name>
<price>3.5</price>
</item>
</root>"""
# lxml raises ValueError for a str that carries an encoding declaration,
# so the document is handed over as UTF-8 bytes instead.
root = etree.fromstring(xml_string.encode('utf-8'))

# 2. XPath queries: xpath() returns a list, hence the [0] on text() results.
items = root.xpath('//item')
for item in items:
    item_id = item.get('id')
    name = item.xpath('name/text()')[0]
    price = item.xpath('price/text()')[0]
    print(f"ID: {item_id}, 名称: {name}, 价格: {price}")

# 3. Conditional query: the predicate compares <price> numerically,
# selecting the items that cost more than 4.
expensive_items = root.xpath("//item[price > 4]")
for item in expensive_items:
    name = item.xpath('name/text()')[0]
    price = item.xpath('price/text()')[0]
    print(f"昂贵商品: {name}, 价格: {price}")
完整实用案例:XML数据处理
import xml.etree.ElementTree as ET
import json
class XMLBookProcessor:
    """Load an XML file of <book> records and query them.

    The root element is searched for direct <book> children. Each book is
    read through its ``id`` and ``category`` attributes and its <title>,
    <author>, <price> and <year> child elements.
    """

    def __init__(self, file_path):
        """Parse *file_path* and keep the tree and its root element."""
        self.tree = ET.parse(file_path)
        self.root = self.tree.getroot()

    @staticmethod
    def _number(book, tag, convert):
        """Return the text of child *tag* of *book* passed through *convert*.

        Raises:
            ValueError: if the child is missing, empty, or not convertible;
                the message names the offending book id and tag.
        """
        raw = book.findtext(tag)
        try:
            # float(None)/int(None) raise TypeError when the child is absent.
            return convert(raw)
        except (TypeError, ValueError) as exc:
            raise ValueError(
                f"book id={book.get('id')!r}: invalid <{tag}> value {raw!r}"
            ) from exc

    def get_all_books(self):
        """Return one dict per <book> child of the root.

        Each dict has the keys ``id``, ``category``, ``title``, ``author``,
        ``price`` (float) and ``year`` (int). A missing or empty <title> or
        <author> is reported as an empty string.

        Raises:
            ValueError: if a book's <price> or <year> is missing or invalid.
        """
        return [
            {
                'id': book.get('id'),
                'category': book.get('category'),
                'title': book.findtext('title', default=''),
                'author': book.findtext('author', default=''),
                'price': self._number(book, 'price', float),
                'year': self._number(book, 'year', int),
            }
            for book in self.root.findall('book')
        ]

    def filter_by_category(self, category):
        """Return the books whose ``category`` attribute equals *category*."""
        return [book for book in self.get_all_books()
                if book['category'] == category]

    def get_statistics(self):
        """Return summary statistics, or an empty dict when there are no books.

        The result holds ``total_books``, ``average_price``, ``categories``
        (category -> number of books) and ``years`` (set of distinct years).
        """
        books = self.get_all_books()
        if not books:
            return {}

        per_category = {}
        for book in books:
            cat = book['category']
            per_category[cat] = per_category.get(cat, 0) + 1

        return {
            'total_books': len(books),
            'average_price': sum(b['price'] for b in books) / len(books),
            'categories': per_category,
            'years': {b['year'] for b in books},
        }

    def export_to_json(self, output_file):
        """Write all books to *output_file* as UTF-8 JSON and print a notice."""
        books = self.get_all_books()
        with open(output_file, 'w', encoding='utf-8') as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(books, f, ensure_ascii=False, indent=2)
        print(f"已导出到 {output_file}")

    def search_by_title(self, keyword):
        """Return the books whose title contains *keyword*, ignoring case."""
        needle = keyword.lower()  # lower-cased once, not once per book
        return [book for book in self.get_all_books()
                if needle in book['title'].lower()]
# Usage example
processor = XMLBookProcessor('books.xml')

# List every book.
all_books = processor.get_all_books()
print("所有书籍:")
for book in all_books:
    print(f" [{book['id']}] {book['title']} - {book['author']}")

# Filter by category.
programming_books = processor.filter_by_category('编程')
print(f"\n编程类书籍({len(programming_books)}本):")
for book in programming_books:
    print(f" {book['title']}")

# Summary statistics.
stats = processor.get_statistics()
print("\n统计信息:")
# get_statistics() returns an empty dict when the file has no books, so the
# detail lines are printed only when there is something to report.
if stats:
    print(f" 总书籍数: {stats['total_books']}")
    print(f" 平均价格: ¥{stats['average_price']:.2f}")
    print(f" 类别分布: {stats['categories']}")
    print(f" 出版年份: {stats['years']}")

# Search by title keyword.
search_result = processor.search_by_title('Python')
print("\n搜索'Python'结果:")
for book in search_result:
    print(f" {book['title']}")

# Export to JSON.
processor.export_to_json('books.json')
处理大型XML文件
对于大型XML文件,使用迭代解析避免内存占用:
import xml.etree.ElementTree as ET
def process_large_xml(file_path):
    """Stream through an XML file, handling one <book> element at a time.

    The file is parsed incrementally with ``ET.iterparse``. For every
    completed <book> the title and author are printed, and the root is then
    cleared so elements that were already handled do not pile up in memory.
    A final line reports how many books were seen.

    Args:
        file_path: Whatever ``ET.iterparse`` accepts as its source.
    """
    # 'start' events are requested so that the very first event yields the
    # root element, which is needed later for clearing.
    context = iter(ET.iterparse(file_path, events=('start', 'end')))
    _, root = next(context)

    count = 0
    for event, elem in context:
        # A <book> is complete only once its 'end' event arrives.
        if event != 'end' or elem.tag != 'book':
            continue
        # findtext() does a single lookup and falls back to '' when the
        # child element is absent.
        title = elem.findtext('title', default='')
        author = elem.findtext('author', default='')
        print(f"Processing book: {title} by {author}")
        count += 1
        # Drop everything collected under the root so far to release memory.
        root.clear()
    print(f"总共处理了 {count} 本书籍")
# 使用示例(适用于大文件)
# process_large_xml('large_books.xml')
- 标准库vs第三方库:
  - xml.etree.ElementTree:标准库,适合大多数场景
  - lxml:功能更强大,支持完整XPath语法
  - minidom:另一个标准库选项,但API较复杂
- 主要操作方法:
  - find():查找第一个匹配子元素
  - findall():查找所有匹配子元素
  - iter():遍历所有元素
  - get():获取属性值
  - text:获取文本内容
- 性能优化:
  - 使用iterparse()处理大文件
  - 及时清理已处理的元素
  - 考虑使用第三方库提升性能
- 错误处理:
  - 处理XML解析异常
  - 验证元素是否存在
  - 处理编码问题
这些方法基本覆盖了Python解析XML的常见需求,你可以根据具体场景选择合适的方法。