本文目录导读:

在Python中筛选网页节点通常使用 BeautifulSoup 或 lxml 库,以下是几种常见的筛选方法和实用案例:
安装所需库
pip install beautifulsoup4 lxml requests
基础案例:按标签名筛选
from bs4 import BeautifulSoup
html = '''
<html>
<body>
<h1>标题</h1>
<p class="content">第一段</p>
<p class="content">第二段</p>
<div id="main">
<a href="https://example.com">链接1</a>
<a href="https://test.com">链接2</a>
</div>
</body>
</html>
'''
# Parse the sample document with the lxml backend.
soup = BeautifulSoup(html, 'lxml')

# find_all() returns every matching tag; print one paragraph per line.
all_p = soup.find_all('p')
for p in all_p:
    print(p.text)  # prints "第一段", then "第二段"

# find() returns only the first matching tag.
first_h1 = soup.find('h1')
print(first_h1.text)  # prints "标题"
按CSS类名筛选
# Filter by class; the keyword is `class_` because `class` is reserved.
content_ps = soup.find_all('p', class_='content')
# CSS-selector alternative (matches any tag carrying the class, not only <p>).
content_ps = soup.select('.content')
for p in content_ps:
    print(p.text)
按ID属性筛选
# Look up the container by its id attribute.
main_div = soup.find('div', id='main')
# CSS-selector alternative; select_one() returns the first match or None.
main_div = soup.select_one('#main')
# Collect every link nested inside that container.
links = main_div.find_all('a')
for link in links:
    print(f"链接文字: {link.text}, URL: {link['href']}")
按属性值筛选
# Exact match on the href attribute.
example_links = soup.find_all('a', attrs={'href': 'https://example.com'})
# Partial match: the predicate receives the attribute value (or None when
# the attribute is absent), hence the truthiness guard.
test_links = soup.find_all('a', href=lambda value: value and 'test' in value)
# Several attributes at once: every entry of the dict must match.
specific_links = soup.find_all(
    'a',
    attrs={'href': 'https://example.com', 'class': 'external'},
)
复杂选择器示例
# --- Combinators ---
# Child combinator: <p> elements whose parent is a <div>.
direct_children = soup.select('div > p')
# Adjacent-sibling combinator: a <p> that immediately follows an <h1>.
adjacent_p = soup.select('h1 + p')

# --- Attribute selectors ---
# ^= matches attribute values that start with the given prefix.
start_with_links = soup.select('a[href^="https://"]')
# *= matches attribute values that contain the given substring.
contains_links = soup.select('a[href*="example"]')
实际应用案例:爬取新闻标题
import requests
from bs4 import BeautifulSoup
def fetch_news_titles(url):
    """Download a page and return the text of its news headlines.

    Headlines are assumed to be ``<h2 class="news-title">`` elements; adjust
    the selector to match the real markup of the target site.

    Args:
        url: Address of the page to download.

    Returns:
        A list of whitespace-stripped headline strings. An empty list is
        returned when the HTTP request fails; the error is printed instead
        of being raised.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    # Only the network call is guarded, so that parser or programming errors
    # are not silently turned into an empty result.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"获取新闻标题时出错: {e}")
        return []

    soup = BeautifulSoup(response.text, 'lxml')
    # Equivalent lookup: soup.find_all('h2', class_='news-title')
    titles = soup.select('h2.news-title')
    return [title.text.strip() for title in titles]
# Usage example
# titles = fetch_news_titles('https://example-news-site.com')
# for i, title in enumerate(titles[:10], 1):
#     print(f"{i}. {title}")
使用lxml库筛选
from lxml import etree
html = '''
<html>
<body>
<div class="container">
<ul>
<li class="item active">项目1</li>
<li class="item">项目2</li>
<li class="item">项目3</li>
</ul>
</div>
</body>
</html>
'''
# Parse the markup into an lxml element tree.
root = etree.HTML(html)

# @class="item" compares the whole attribute value, so the first <li>
# (class="item active") is NOT matched by this expression.
items = root.xpath('//li[@class="item"]')
for item in items:
    print(item.text)  # prints "项目2", then "项目3"

# contains() is a substring test on the attribute value, so it also matches
# multi-class values such as "item active".
active_item = root.xpath('//li[contains(@class, "active")]')
if active_item:
    print(active_item[0].text)  # prints "项目1"

# findall() takes an ElementPath expression; './/li' finds every <li>
# descendant of the root.
all_li = root.findall('.//li')
for li in all_li:
    print(li.text)
实用筛选技巧
# 1. Handling nested structures
html = '''
<div class="article">
<h2>标题1</h2>
<p class="date">2024-01-01</p>
<div class="content">内容1</div>
</div>
<div class="article">
<h2>标题2</h2>
<p class="date">2024-01-02</p>
<div class="content">内容2</div>
</div>
'''
soup = BeautifulSoup(html, 'lxml')
articles = soup.select('.article')
for article in articles:
    title = article.find('h2').text
    # find() interprets its first argument as a tag name, so class-based
    # lookups inside each article must go through select_one().
    date_node = article.select_one('.date')
    date = date_node.text if date_node is not None else '无日期'
    content = article.select_one('.content').text
    print(f"标题: {title}, 日期: {date}, 内容: {content}")

# 2. Fuzzy matching
# Find elements whose class contains "title"; the guard skips tags that
# have no class attribute at all.
elements = soup.find_all(class_=lambda x: x and 'title' in x)

# 3. Find elements whose text contains specific content
for element in soup.find_all(string=lambda text: text and '特定内容' in text):
    print(f"找到包含特定内容的元素: {element.parent.name}")
完整实践案例:提取商品信息
import requests
from bs4 import BeautifulSoup
import json
def extract_product_info(url):
    """Extract product records from a simulated product-list page.

    Args:
        url: Page address. Currently unused: the function parses an embedded
            sample document instead of downloading ``url``.

    Returns:
        A list with one dict per ``.product`` element, each holding the keys
        ``id``, ``name``, ``price``, ``rating`` and ``image_url``.
    """
    # Simulated response body standing in for a real HTTP fetch.
    html_content = '''
<div class="product-list">
<div class="product" data-id="001">
<img src="img1.jpg" alt="商品1">
<h3 class="product-name">苹果 iPhone 15</h3>
<p class="price">¥5999.00</p>
<div class="rating">4.5</div>
</div>
<div class="product" data-id="002">
<img src="img2.jpg" alt="商品2">
<h3 class="product-name">华为 Mate 60</h3>
<p class="price">¥6999.00</p>
<div class="rating">4.8</div>
</div>
</div>
'''
    soup = BeautifulSoup(html_content, 'lxml')
    # Every lookup is scoped to the current product node, so fields from
    # different products cannot be mixed up.
    return [
        {
            'id': product.get('data-id'),
            'name': product.select_one('.product-name').text.strip(),
            'price': product.select_one('.price').text.strip(),
            'rating': product.select_one('.rating').text.strip(),
            'image_url': product.select_one('img')['src'],
        }
        for product in soup.select('.product')
    ]
# Usage example: ensure_ascii=False keeps the Chinese text readable in the
# printed JSON instead of emitting \uXXXX escapes.
products = extract_product_info('example-url')
serialized = json.dumps(products, indent=2, ensure_ascii=False)
print(serialized)
选择合适的方法
- 按标签名: find_all('tagname') - 最简单直接
- 按CSS类: find_all(class_='classname') 或 select('.classname')
- 按ID: find(id='idname') 或 select_one('#idname')
- 复杂条件: select() 支持完整的CSS选择器
- XPath:使用 lxml 库,适合复杂的树形结构筛选
- 正则表达式: find_all(string=re.compile(r'pattern'))(旧版本中的 text= 参数已被 string= 取代)
选择哪种方法取决于你的具体需求和网页结构复杂度。