本文目录导读:

下面介绍如何用 Java 编写一个简单的搜索引擎爬虫:从基础版本开始,逐步完善功能。
基础爬虫框架
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A minimal single-threaded, breadth-first web crawler.
 *
 * <p>Starting from a seed URL, pages are fetched one at a time, handed to
 * {@link #processPage(String, String)}, and scanned for {@code href} links, which are
 * resolved against the page's own URL and appended to the frontier. Crawling stops when
 * the frontier is empty or {@code maxPages} pages have been fetched successfully.
 *
 * <p>{@link #fetchPage(String)} and {@link #processPage(String, String)} are the
 * extension points for subclasses. This class is not thread-safe.
 */
public class SimpleCrawler {
    /** Captures the value of a quoted {@code href} attribute. */
    private static final Pattern HREF_PATTERN =
            Pattern.compile("href=[\"']([^\"']+)[\"']", Pattern.CASE_INSENSITIVE);

    /** Captures the text between {@code <title>} and {@code </title>}. */
    private static final Pattern TITLE_PATTERN =
            Pattern.compile("<title>(.*?)</title>", Pattern.DOTALL | Pattern.CASE_INSENSITIVE);

    private static final int TIMEOUT_MILLIS = 5000;

    /** URLs that have been taken off the frontier and attempted. */
    private final Set<String> visitedUrls;
    /** Every URL ever placed on the frontier; gives O(1) duplicate detection. */
    private final Set<String> queuedUrls;
    /** FIFO frontier of URLs still to be fetched. */
    private final Queue<String> urlsToVisit;
    private final int maxPages;
    /** Number of pages fetched and processed successfully so far. */
    private int currentPageCount;

    /**
     * @param maxPages maximum number of pages to fetch successfully before stopping
     */
    public SimpleCrawler(int maxPages) {
        this.visitedUrls = new HashSet<>();
        this.queuedUrls = new HashSet<>();
        this.urlsToVisit = new ArrayDeque<>();
        this.maxPages = maxPages;
        this.currentPageCount = 0;
    }

    /**
     * Crawls breadth-first from {@code seedUrl} until the frontier is empty or the page
     * limit is reached, then prints a summary line.
     *
     * @param seedUrl absolute URL to start from; a {@code null} seed is ignored
     */
    public void start(String seedUrl) {
        enqueue(seedUrl);
        while (!urlsToVisit.isEmpty() && currentPageCount < maxPages) {
            String url = urlsToVisit.poll();
            // add() returns false when the URL was already attempted.
            if (visitedUrls.add(url)) {
                crawl(url);
            }
        }
        System.out.println("抓取完成,共抓取了 " + currentPageCount + " 个页面");
    }

    /** Adds {@code url} to the frontier unless it is null, already visited, or already queued. */
    private void enqueue(String url) {
        if (url != null && !visitedUrls.contains(url) && queuedUrls.add(url)) {
            urlsToVisit.add(url);
        }
    }

    /**
     * Fetches one page, processes it and queues the links it contains. Any failure is
     * reported on stderr and the page is skipped; it does not count towards the limit.
     */
    private void crawl(String url) {
        try {
            System.out.println("正在抓取: " + url);
            String content = fetchPage(url);
            if (content == null) {
                return;
            }
            processPage(url, content);
            for (String link : extractLinks(url, content)) {
                enqueue(link);
            }
            currentPageCount++;
            System.out.println("当前进度: " + currentPageCount + "/" + maxPages);
        } catch (Exception e) {
            // Boundary catch: processPage is overridable, and one bad page must not abort the crawl.
            System.err.println("抓取 " + url + " 时出错: " + e.getMessage());
        }
    }

    /**
     * Downloads the page at {@code urlStr} and decodes it as UTF-8.
     *
     * @param urlStr absolute URL of the page
     * @return the page body with lines joined by {@code '\n'}, or {@code null} if the URL
     *         is malformed or the download fails
     */
    protected String fetchPage(String urlStr) {
        try {
            URLConnection connection = new URL(urlStr).openConnection();
            connection.setRequestProperty("User-Agent", "Mozilla/5.0");
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            StringBuilder content = new StringBuilder();
            // try-with-resources closes the stream even when a read fails half-way.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    content.append(line).append("\n");
                }
            }
            return content.toString();
        } catch (IOException e) {
            System.err.println("获取页面内容失败: " + e.getMessage());
            return null;
        }
    }

    /**
     * Hook invoked once for every successfully fetched page. The default implementation
     * prints the page title and size; subclasses may override it to index or store pages.
     *
     * @param url     URL the page was fetched from
     * @param content page body as returned by {@link #fetchPage(String)}
     */
    protected void processPage(String url, String content) {
        System.out.println("页面标题: " + extractTitle(content));
        System.out.println("页面大小: " + content.length() + " 字符");
    }

    /**
     * Collects the {@code href} targets found in {@code content} as absolute http(s) URLs.
     *
     * @param baseUrl URL of the page, used to resolve relative links
     * @param content page body
     * @return resolved links in document order (may contain duplicates)
     */
    private List<String> extractLinks(String baseUrl, String content) {
        URL base;
        try {
            base = new URL(baseUrl);
        } catch (MalformedURLException e) {
            base = null; // without a usable base only absolute links can be resolved
        }
        List<String> links = new ArrayList<>();
        Matcher matcher = HREF_PATTERN.matcher(content);
        while (matcher.find()) {
            String resolved = resolveLink(base, matcher.group(1).trim());
            if (resolved != null) {
                links.add(resolved);
            }
        }
        return links;
    }

    /**
     * Resolves {@code href} against {@code base} and strips any fragment.
     *
     * @return the absolute URL, or {@code null} if it is malformed or not http/https
     */
    private static String resolveLink(URL base, String href) {
        try {
            URL target = new URL(base, href);
            String protocol = target.getProtocol();
            if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
                return null;
            }
            String absolute = target.toExternalForm();
            // "#section" links point at the same document; drop the fragment so it is not re-crawled.
            int fragmentStart = absolute.indexOf('#');
            return fragmentStart >= 0 ? absolute.substring(0, fragmentStart) : absolute;
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /** Returns the trimmed text of the first {@code <title>} element, or a placeholder if none is found. */
    private String extractTitle(String content) {
        Matcher matcher = TITLE_PATTERN.matcher(content);
        if (matcher.find()) {
            return matcher.group(1).trim();
        }
        return "未知标题";
    }
}
增强版本 - 带索引功能
import java.io.*;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
/**
 * Crawler that also builds a small in-memory search index while it crawls.
 *
 * <p>Two maps are maintained: a page index (URL to the first characters of the page) and
 * an inverted word index (lower-cased word to the URLs of pages containing it). The
 * index can be queried with {@link #search(String)} and persisted with
 * {@link #saveIndex(String)} / {@link #loadIndex(String)}.
 */
public class SearchEngineCrawler extends SimpleCrawler {
    private static final int SUMMARY_LENGTH = 200;
    /** Words shorter than this are not indexed. */
    private static final int MIN_WORD_LENGTH = 3;
    private static final String HTML_TAG_REGEX = "<[^>]+>";
    /** Any run of characters that are not letters, digits or underscore (Unicode-aware). */
    private static final String WORD_SEPARATOR_REGEX = "[^\\p{L}\\p{N}_]+";

    private Map<String, String> pageIndex;        // URL -> page summary
    private Map<String, List<String>> wordIndex;  // word -> URLs of pages containing it

    /**
     * @param maxPages maximum number of pages to crawl, passed to the superclass
     */
    public SearchEngineCrawler(int maxPages) {
        super(maxPages);
        this.pageIndex = new ConcurrentHashMap<>();
        this.wordIndex = new ConcurrentHashMap<>();
    }

    /**
     * Runs the superclass processing, then stores a summary of the page and indexes its words.
     *
     * @param url     URL the page was fetched from
     * @param content raw page body
     */
    @Override
    protected void processPage(String url, String content) {
        super.processPage(url, content);
        String summary = content.substring(0, Math.min(SUMMARY_LENGTH, content.length()));
        pageIndex.put(url, summary);
        indexContent(url, content);
    }

    /** Strips HTML tags from {@code content} and records {@code url} under each distinct word. */
    private void indexContent(String url, String content) {
        String text = content.replaceAll(HTML_TAG_REGEX, " ");
        // Locale.ROOT keeps case folding independent of the JVM default locale.
        String[] words = text.toLowerCase(Locale.ROOT).split(WORD_SEPARATOR_REGEX);
        // De-duplicate first so a word repeated on the page adds the URL only once.
        Set<String> distinctWords = new HashSet<>(Arrays.asList(words));
        for (String word : distinctWords) {
            if (word.length() >= MIN_WORD_LENGTH) {
                wordIndex.computeIfAbsent(word, k -> new ArrayList<>()).add(url);
            }
        }
    }

    /**
     * Returns the URLs of pages containing at least one of the whitespace-separated
     * keywords in {@code query} (case-insensitive, OR semantics).
     *
     * @param query keywords separated by whitespace; {@code null} or blank yields no results
     * @return matching URLs without duplicates, ordered by first match; never {@code null}
     */
    public List<String> search(String query) {
        if (query == null) {
            return new ArrayList<>();
        }
        String[] keywords = query.trim().toLowerCase(Locale.ROOT).split("\\s+");
        // LinkedHashSet keeps the result order stable between calls.
        Set<String> result = new LinkedHashSet<>();
        for (String keyword : keywords) {
            List<String> urls = wordIndex.get(keyword);
            if (urls != null) {
                result.addAll(urls);
            }
        }
        return new ArrayList<>(result);
    }

    /**
     * Writes both indexes to {@code filePath} using Java serialization.
     *
     * @param filePath destination file; overwritten if it exists
     * @throws IOException if the file cannot be written
     */
    public void saveIndex(String filePath) throws IOException {
        try (ObjectOutputStream oos = new ObjectOutputStream(
                new FileOutputStream(filePath))) {
            oos.writeObject(pageIndex);
            oos.writeObject(wordIndex);
        }
    }

    /**
     * Replaces both indexes with the contents of a file written by {@link #saveIndex(String)}.
     * The current indexes are left untouched if loading fails.
     *
     * <p>NOTE(review): this uses native Java deserialization, which must never be applied
     * to files from an untrusted source; consider a data format such as JSON instead.
     *
     * @param filePath file previously produced by {@link #saveIndex(String)}
     * @throws IOException            if the file cannot be read or does not contain two maps
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    public void loadIndex(String filePath) throws IOException, ClassNotFoundException {
        Object pages;
        Object words;
        try (ObjectInputStream ois = new ObjectInputStream(
                new FileInputStream(filePath))) {
            pages = ois.readObject();
            words = ois.readObject();
        }
        if (!(pages instanceof Map) || !(words instanceof Map)) {
            throw new IOException("索引文件格式不正确: " + filePath);
        }
        // Element types are erased at runtime; the casts rely on the file coming from saveIndex.
        @SuppressWarnings("unchecked")
        Map<String, String> loadedPages = (Map<String, String>) pages;
        @SuppressWarnings("unchecked")
        Map<String, List<String>> loadedWords = (Map<String, List<String>>) words;
        // Assign only after both reads succeeded so the two indexes always match.
        pageIndex = new ConcurrentHashMap<>(loadedPages);
        wordIndex = new ConcurrentHashMap<>(loadedWords);
    }
}
测试代码
/**
 * Command-line demo: runs the basic crawler, then the indexing crawler, and finally
 * queries the index that the second crawl produced.
 */
public class CrawlerDemo {
    private static final String SEED_URL = "https://example.com";
    private static final int PAGE_LIMIT = 10;

    public static void main(String[] args) {
        runBasicCrawl();
        SearchEngineCrawler engineCrawler = runIndexingCrawl();
        printSearchResults(engineCrawler);
    }

    /** Crawls from the seed with the plain crawler. */
    private static void runBasicCrawl() {
        System.out.println("=== 基础爬虫测试 ===");
        new SimpleCrawler(PAGE_LIMIT).start(SEED_URL);
    }

    /** Crawls from the seed with the indexing crawler and returns it for querying. */
    private static SearchEngineCrawler runIndexingCrawl() {
        System.out.println("\n=== 搜索引擎爬虫测试 ===");
        SearchEngineCrawler crawler = new SearchEngineCrawler(PAGE_LIMIT);
        crawler.start(SEED_URL);
        return crawler;
    }

    /** Searches the index for "example" and prints one line per hit. */
    private static void printSearchResults(SearchEngineCrawler crawler) {
        System.out.println("\n=== 搜索测试 ===");
        List<String> hits = crawler.search("example");
        System.out.println("搜索 'example' 结果:");
        hits.forEach(hit -> System.out.println(" - " + hit));
    }
}
多线程版本
import java.util.concurrent.*;
import java.util.concurrent.locks.ReentrantLock;
/**
 * Multi-threaded crawler skeleton: a fixed pool of workers drains a shared URL queue.
 *
 * <p>The page limit passed to {@link #start(String, int)} applies to the crawler as a
 * whole, not to each worker. Workers exit on their own once the limit is reached or no
 * queued or in-progress URL remains. Call {@link #stop()} when finished so the pool
 * threads are released.
 *
 * <p>Subclasses supply the real fetching logic by overriding {@link #crawl(String)}.
 */
public class ConcurrentCrawler {
    /** How long an idle worker waits for a URL before re-checking the exit conditions. */
    private static final long POLL_TIMEOUT_MILLIS = 200;

    private final Set<String> visitedUrlSet = ConcurrentHashMap.newKeySet();
    private final BlockingQueue<String> urlQueue = new LinkedBlockingQueue<>();
    private final Queue<Future<?>> workerFutures = new ConcurrentLinkedQueue<>();
    private final ReentrantLock stateLock = new ReentrantLock();
    private final ExecutorService executorService;
    private final int threadCount;
    private volatile boolean isRunning = true;
    /** Pages claimed for crawling so far; guarded by {@code stateLock}. */
    private int crawledCount;
    /** URLs queued or currently being processed; guarded by {@code stateLock}. */
    private int pendingUrls;

    /**
     * @param threadCount number of worker threads in the pool
     */
    public ConcurrentCrawler(int threadCount) {
        this.threadCount = threadCount;
        this.executorService = Executors.newFixedThreadPool(threadCount);
    }

    /**
     * Queues {@code seedUrl} and launches the workers. Returns immediately; use
     * {@link #awaitCompletion(long, TimeUnit)} to wait for the crawl to finish.
     *
     * @param seedUrl  URL to start from; must not be {@code null}
     * @param maxPages maximum total number of pages crawled by this crawler
     * @throws NullPointerException if {@code seedUrl} is {@code null}
     */
    public void start(String seedUrl, int maxPages) {
        enqueue(Objects.requireNonNull(seedUrl, "seedUrl"));
        for (int i = 0; i < threadCount; i++) {
            workerFutures.add(executorService.submit(new CrawlerTask(maxPages)));
        }
    }

    /** Asks the workers to stop after their current URL and shuts the pool down. */
    public void stop() {
        isRunning = false;
        executorService.shutdown();
    }

    /**
     * Waits until every worker launched so far has finished.
     *
     * @param timeout maximum time to wait
     * @param unit    unit of {@code timeout}
     * @return {@code true} if all workers finished, {@code false} if the timeout elapsed first
     * @throws InterruptedException if the calling thread is interrupted while waiting
     */
    public boolean awaitCompletion(long timeout, TimeUnit unit) throws InterruptedException {
        long deadline = System.nanoTime() + unit.toNanos(timeout);
        for (Future<?> worker : workerFutures) {
            long remaining = Math.max(0L, deadline - System.nanoTime());
            try {
                worker.get(remaining, TimeUnit.NANOSECONDS);
            } catch (TimeoutException e) {
                return false;
            } catch (ExecutionException | CancellationException e) {
                // The worker ended abnormally; for waiting purposes it is finished.
            }
        }
        return true;
    }

    /** Returns the number of pages claimed for crawling so far, across all workers. */
    public int getCrawledCount() {
        stateLock.lock();
        try {
            return crawledCount;
        } finally {
            stateLock.unlock();
        }
    }

    /**
     * Crawls a single URL. The default implementation only logs the URL and finds no
     * links; override it to fetch the page and return the links it contains.
     *
     * @param url URL to crawl
     * @return URLs discovered on the page (may be empty or {@code null})
     */
    protected List<String> crawl(String url) {
        System.out.println(Thread.currentThread().getName() +
                " 正在爬取: " + url);
        return Collections.emptyList();
    }

    /** Queues {@code url} unless it is {@code null} or has already been visited. */
    private void enqueue(String url) {
        if (url == null || visitedUrlSet.contains(url)) {
            return;
        }
        // Count the URL before it becomes visible so pendingUrls never under-reports.
        stateLock.lock();
        try {
            pendingUrls++;
        } finally {
            stateLock.unlock();
        }
        urlQueue.add(url);
    }

    /** Claims one of the {@code maxPages} slots; returns {@code false} when none is left. */
    private boolean tryReserveSlot(int maxPages) {
        stateLock.lock();
        try {
            if (crawledCount >= maxPages) {
                return false;
            }
            crawledCount++;
            return true;
        } finally {
            stateLock.unlock();
        }
    }

    private boolean limitReached(int maxPages) {
        stateLock.lock();
        try {
            return crawledCount >= maxPages;
        } finally {
            stateLock.unlock();
        }
    }

    private boolean hasPendingWork() {
        stateLock.lock();
        try {
            return pendingUrls > 0;
        } finally {
            stateLock.unlock();
        }
    }

    /** Records that one dequeued URL has been fully handled (crawled or skipped). */
    private void markProcessed() {
        stateLock.lock();
        try {
            pendingUrls--;
        } finally {
            stateLock.unlock();
        }
    }

    /** Worker loop: takes URLs from the shared queue until stopped, limited or out of work. */
    private class CrawlerTask implements Runnable {
        private final int maxPages;

        public CrawlerTask(int maxPages) {
            this.maxPages = maxPages;
        }

        @Override
        public void run() {
            while (isRunning && !limitReached(maxPages)) {
                String url;
                try {
                    url = urlQueue.poll(POLL_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                    return;
                }
                if (url == null) {
                    // Queue empty: finish only if no other worker can still add URLs.
                    if (!hasPendingWork()) {
                        return;
                    }
                    continue;
                }
                process(url);
            }
        }

        /** Crawls {@code url} if it is new and a slot is free, then queues its links. */
        private void process(String url) {
            try {
                if (!visitedUrlSet.add(url)) {
                    return;
                }
                if (!tryReserveSlot(maxPages)) {
                    // Not crawled after all; leave it eligible for a later start().
                    visitedUrlSet.remove(url);
                    return;
                }
                List<String> links = crawl(url);
                if (links != null) {
                    for (String link : links) {
                        enqueue(link);
                    }
                }
            } catch (RuntimeException e) {
                // One failing page must not kill the worker.
                System.err.println("抓取 " + url + " 时出错: " + e.getMessage());
            } finally {
                markProcessed();
            }
        }
    }
}
使用建议
- 遵守robots.txt:检查网站的爬虫协议
- 设置请求间隔:避免对服务器造成负担
- 处理异常:完善的错误处理机制
- 去重机制:避免重复爬取
- 存储优化:考虑使用数据库存储索引
这个简单的爬虫可以作为学习参考;实际生产环境还需要考虑更多因素,如分布式爬取、动态页面渲染等。