本文目录导读:
- 基础案例:爬取网页标题
- 中级案例:抓取新闻列表
- 高级案例:带代理和反反爬的爬虫
- 实战案例:爬取电商商品信息
- 异步爬虫案例
- 安装所需库
- 注意事项

我来为您介绍几个Python爬虫的实用案例,从简单到复杂。
基础案例:爬取网页标题
import requests
from bs4 import BeautifulSoup
# Fetch a single page and print the text of its <title> element.
url = "https://www.example.com"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
try:
    response = requests.get(url, headers=headers, timeout=10)
    # Treat 4xx/5xx answers as failures instead of parsing an error page.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    # A document may have no <title>; avoid an AttributeError on None.
    title = soup.title.string if soup.title else None
    print(f"网页标题: {title}")
except requests.RequestException as e:
    print(f"爬取失败: {e}")
中级案例:抓取新闻列表
import requests
from bs4 import BeautifulSoup
import csv
import time
def crawl_news():
    """Scrape headline titles and links from the news front page.

    The rows are written to ``news.csv`` (encoding ``utf-8-sig``) with a
    header row, and are also returned to the caller.

    Returns:
        A list of ``[title, link]`` pairs. An empty list is returned when
        the HTTP request or the file write fails (the error is printed).
    """
    url = "https://news.163.com/"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    try:
        # Without a timeout requests can block indefinitely on a stalled
        # connection.
        response = requests.get(url, headers=headers, timeout=10)
        # Do not parse (and save) an error page as if it were the news list.
        response.raise_for_status()
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        news_list = []
        # Adjust the selector to the actual structure of the page.
        for item in soup.select('.data_row .news_title a'):
            title = item.get_text().strip()
            link = item.get('href')
            if title and link:
                news_list.append([title, link])
        # Save the result to CSV.
        with open('news.csv', 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(['标题', '链接'])
            writer.writerows(news_list)
        print(f"成功爬取 {len(news_list)} 条新闻")
        return news_list
    except (requests.RequestException, OSError) as e:
        print(f"爬取失败: {e}")
        return []
# Run the crawler and keep the scraped rows.
news_data = crawl_news()
高级案例:带代理和反反爬的爬虫
import requests
from bs4 import BeautifulSoup
import random
import time
from fake_useragent import UserAgent
class AdvancedSpider:
    """Crawler with randomized request headers, optional proxies and retries."""

    # Class-level default: direct connection unless an instance opts in.
    use_proxy = False

    def __init__(self, *, use_proxy=False):
        """Create the spider.

        Args:
            use_proxy: When True, every request goes through a proxy picked
                at random from ``self.proxies``. The default (False) sends
                requests directly, as before.
        """
        self.ua = UserAgent()
        self.use_proxy = use_proxy
        # Placeholder proxy pool. Both schemes are listed because requests
        # picks the proxy entry by the scheme of the requested URL.
        self.proxies = [
            {'http': 'http://proxy1.com:8080', 'https': 'http://proxy1.com:8080'},
            {'http': 'http://proxy2.com:8080', 'https': 'http://proxy2.com:8080'},
        ]

    def get_random_headers(self):
        """Return request headers whose User-Agent comes from ``self.ua.random``."""
        return {
            'User-Agent': self.ua.random,
            'Accept': 'text/html,application/xhtml+xml',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Connection': 'keep-alive',
        }

    def _pick_proxy(self):
        """Return a random proxy mapping, or None if proxies are off or the pool is empty."""
        if self.use_proxy and self.proxies:
            return random.choice(self.proxies)
        return None

    def retry_request(self, url, max_retries=3):
        """GET *url*, retrying on failure.

        Args:
            url: Address to request.
            max_retries: Maximum number of attempts.

        Returns:
            The response of the first attempt that answers with status 200,
            or None when every attempt fails.
        """
        for attempt in range(max_retries):
            headers = self.get_random_headers()
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    proxies=self._pick_proxy(),
                    timeout=10,
                )
            except requests.RequestException as e:
                print(f"第{attempt+1}次尝试失败: {e}")
            else:
                if response.status_code == 200:
                    return response
                print(f"请求失败,状态码: {response.status_code}")
            # Pause only between attempts; sleeping after the last one
            # would just delay the failure result.
            if attempt < max_retries - 1:
                time.sleep(random.uniform(1, 3))
        return None

    def parse_content(self, url):
        """Fetch *url* and return it parsed as a BeautifulSoup tree.

        Returns:
            The parsed document, or None when the page could not be fetched.
        """
        response = self.retry_request(url)
        if not response:
            return None
        # Add page-specific extraction logic here.
        return BeautifulSoup(response.text, 'html.parser')
# Usage example: build the spider and fetch/parse one page.
spider = AdvancedSpider()
content = spider.parse_content("https://example.com")
实战案例:爬取电商商品信息
import requests
from bs4 import BeautifulSoup
import json
import time
import random
def crawl_product_info(keyword, pages=5):
    """Scrape product listings for a search keyword (JD search as the example).

    Every result page is requested in turn, the products found are collected,
    and the full list is written to ``<keyword>_products.json``.

    Args:
        keyword: Search term sent as the ``keyword`` query parameter.
        pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of dicts with the keys ``title``, ``price``, ``shop`` and
        ``link``. Pages that fail to download are reported and skipped.
    """
    base_url = "https://search.jd.com/Search"
    products = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Referer': 'https://www.jd.com/',
    }
    for page in range(1, pages + 1):
        params = {
            'keyword': keyword,
            'page': page,
            'enc': 'utf-8',
        }
        try:
            response = requests.get(
                base_url, params=params, headers=headers, timeout=10
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"爬取第{page}页失败: {e}")
        else:
            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')
            # Parse the product list; incomplete entries are dropped.
            items = soup.select('.gl-item')
            parsed = (_parse_product(item, base_url) for item in items)
            products.extend(p for p in parsed if p is not None)
            print(f"已爬取第{page}页,共{len(items)}个商品")
        # Random delay between pages (also after a failed page); nothing
        # follows the last page, so no delay is needed there.
        if page < pages:
            time.sleep(random.uniform(1, 3))
    # Save the result.
    with open(f'{keyword}_products.json', 'w', encoding='utf-8') as f:
        json.dump(products, f, ensure_ascii=False, indent=2)
    print(f"共爬取 {len(products)} 个商品")
    return products


def _parse_product(item, base_url):
    """Build one product dict from a result node, or None if a field is missing."""
    from urllib.parse import urljoin

    title = item.select_one('.p-name em')
    price = item.select_one('.p-price i')
    shop = item.select_one('.p-shop a')
    anchor = item.select_one('.p-name a')
    if any(node is None for node in (title, price, shop, anchor)):
        return None
    # urljoin resolves protocol-relative ("//host/path") and relative hrefs
    # and leaves absolute URLs untouched, unlike a plain 'https:' prefix.
    return {
        'title': title.text.strip(),
        'price': price.text.strip(),
        'shop': shop.text.strip(),
        'link': urljoin(base_url, anchor.get('href', '')),
    }
# Run: scrape three result pages for the sample keyword.
products = crawl_product_info("Python书籍", pages=3)
异步爬虫案例
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import time
async def fetch_url(session, url):
    """Download *url* through the given aiohttp session.

    Args:
        session: The client session used to issue the GET request.
        url: Address to fetch.

    Returns:
        The response body as text, or None when the request fails, times
        out, or the body cannot be decoded (the error is printed).
    """
    # aiohttp's documented timeout type is ClientTimeout; a bare number is
    # only accepted for backward compatibility.
    timeout = aiohttp.ClientTimeout(total=10)
    try:
        async with session.get(url, timeout=timeout) as response:
            return await response.text()
    except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError) as e:
        print(f"请求失败: {url}, 错误: {e}")
        return None
async def parse_page(session, url):
    """Fetch one page and extract its title.

    Returns:
        ``{'url': ..., 'title': ...}`` for a page that was downloaded, with
        ``"No title"`` when the document has no <title> element; None when
        the download produced no content.
    """
    html = await fetch_url(session, url)
    if not html:
        return None
    document = BeautifulSoup(html, 'html.parser')
    title_tag = document.title
    if title_tag:
        page_title = title_tag.string
    else:
        page_title = "No title"
    return {'url': url, 'title': page_title}
async def crawl_multiple_pages(urls):
    """Crawl all *urls* concurrently over one shared session.

    Returns:
        The per-page results in the order of *urls*, with failed pages
        (falsy results) left out.
    """
    async with aiohttp.ClientSession() as session:
        pending = (parse_page(session, target) for target in urls)
        outcomes = await asyncio.gather(*pending)
        return list(filter(None, outcomes))
# 使用示例
async def main():
    """Crawl a few sample sites concurrently, then print the timing and titles."""
    targets = [
        'https://www.python.org',
        'https://www.github.com',
        'https://stackoverflow.com',
    ]
    began = time.time()
    pages = await crawl_multiple_pages(targets)
    elapsed = time.time() - began
    print(f"爬取完成,用时: {elapsed:.2f}秒")
    for page in pages:
        page_url, page_title = page['url'], page['title']
        print(f"URL: {page_url}, 标题: {page_title}")
# Run the async crawler.
asyncio.run(main())
安装所需库
pip install requests beautifulsoup4 lxml fake-useragent aiohttp
注意事项
- 遵守Robots协议:查看网站的robots.txt文件
- 控制请求频率:添加适当的延时,避免对服务器造成压力
- 动态IP处理:对于反爬严格的网站,可能需要使用代理池
- 数据存储:考虑使用数据库存储大量数据
- 异常处理:完善的错误处理机制
这些案例涵盖了从基础到高级的爬虫技术,您可以根据实际需求选择适合的方案。爬取数据时,请务必遵守相关法律法规和网站的使用条款。