本文目录导读:

我来提供几个Java实现数据去重的常见案例:
使用Set集合去重(最简单)
import java.util.*;
import java.util.stream.Collectors;
/**
 * Demonstrates three ways of removing duplicate strings from a list:
 * a {@link HashSet}, a {@link LinkedHashSet}, and {@code Stream.distinct()}.
 */
public class SetDeduplication {

    public static void main(String[] args) {
        List<String> source = Arrays.asList("A", "B", "A", "C", "B", "D");

        // Approach 1: HashSet (no ordering guarantee on iteration).
        report("HashSet去重结果: ", new HashSet<>(source));

        // Approach 2: LinkedHashSet (keeps first-seen insertion order).
        report("LinkedHashSet去重结果: ", new LinkedHashSet<>(source));

        // Approach 3: Stream API; distinct() filters repeated elements.
        List<String> viaStream = source.stream().distinct().collect(Collectors.toList());
        report("Stream去重结果: ", viaStream);
    }

    /** Prints the label immediately followed by the collection's string form. */
    private static void report(String label, Collection<String> values) {
        System.out.println(label + values);
    }
}
对象去重(自定义equals和hashCode)
import java.util.*;
import java.util.stream.Collectors;
/**
 * Simple value holder whose equality is defined by both {@code name} and {@code age},
 * so equal instances collapse to one entry in hash-based collections.
 */
class Person {
    private String name;
    private int age;

    public Person(String name, int age) {
        this.name = name;
        this.age = age;
    }

    /**
     * Two persons are equal when they are of exactly the same class and have
     * the same age and (null-safe) the same name.
     */
    @Override
    public boolean equals(Object o) {
        if (o == this) {
            return true;
        }
        if (o == null) {
            return false;
        }
        if (getClass() != o.getClass()) {
            return false;
        }
        Person other = (Person) o;
        if (age != other.age) {
            return false;
        }
        return Objects.equals(name, other.name);
    }

    /** Same value as {@code Objects.hash(name, age)}, written out step by step. */
    @Override
    public int hashCode() {
        int result = 1;
        result = 31 * result + Objects.hashCode(name);
        result = 31 * result + Integer.hashCode(age);
        return result;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Person{name='");
        text.append(name).append("', age=").append(age).append("}");
        return text.toString();
    }
}
/**
 * Removes duplicate {@code Person} objects from a list, once through a
 * {@link LinkedHashSet} and once through {@code Stream.distinct()}.
 * Both approaches rely on the element type's equals/hashCode.
 */
public class ObjectDeduplication {

    public static void main(String[] args) {
        List<Person> people = Arrays.asList(
                new Person("张三", 25),
                new Person("李四", 30),
                new Person("张三", 25),   // duplicate
                new Person("王五", 28),
                new Person("李四", 30));  // duplicate

        // De-duplicate with a Set; LinkedHashSet keeps first-seen order.
        Set<Person> distinctBySet = new LinkedHashSet<>(people);
        System.out.println("对象去重结果:");
        for (Person p : distinctBySet) {
            System.out.println(p);
        }

        // De-duplicate with the Stream API.
        List<Person> distinctByStream = people.stream().distinct().collect(Collectors.toList());
        System.out.println("\nStream去重结果:");
        for (Person p : distinctByStream) {
            System.out.println(p);
        }
    }
}
按对象属性去重(使用Map)
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
 * Data holder for a student record with an id, a name and a grade.
 * It does not override equals/hashCode, so instances compare by identity.
 */
class Student {
    private String id;
    private String name;
    private String grade;

    public Student(String id, String name, String grade) {
        this.id = id;
        this.name = name;
        this.grade = grade;
    }

    public String getId() {
        return id;
    }

    public String getName() {
        return name;
    }

    public String getGrade() {
        return grade;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Student{id='");
        text.append(id);
        text.append("', name='").append(name);
        text.append("', grade='").append(grade);
        text.append("'}");
        return text.toString();
    }
}
/**
 * De-duplicates {@code Student} objects by their id property, keeping the
 * first student seen for each id and preserving the original list order.
 */
public class PropertyDeduplication {

    public static void main(String[] args) {
        List<Student> students = Arrays.asList(
                new Student("001", "张三", "A"),
                new Student("002", "李四", "B"),
                new Student("001", "张三", "A"),   // duplicate id
                new Student("003", "王五", "A"),
                new Student("002", "李四", "B"));  // duplicate id

        // Approach 1: collect into a map keyed by id.
        // The map supplier is given explicitly: the three-argument toMap makes
        // no guarantee about the returned map type, so iteration order could
        // differ from the list order. LinkedHashMap keeps first-seen order.
        Map<String, Student> uniqueMap = students.stream()
                .collect(Collectors.toMap(
                        Student::getId,
                        Function.identity(),
                        (existing, replacement) -> existing,  // keep the first one
                        LinkedHashMap::new));
        System.out.println("按ID去重结果:");
        uniqueMap.values().forEach(System.out::println);

        // Approach 2: plain loop with a helper HashSet of ids already seen.
        // Set.add returns false for an id that is already present, so only
        // the first student per id is copied to the result list.
        List<Student> uniqueList = new ArrayList<>();
        Set<String> idSet = new HashSet<>();
        for (Student student : students) {
            if (idSet.add(student.getId())) {
                uniqueList.add(student);
            }
        }
        System.out.println("\n使用Set辅助去重结果:");
        uniqueList.forEach(System.out::println);
    }
}
文件数据去重(实战案例)
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
 * Writes a small sample file containing repeated lines, then produces a copy
 * of it in which every distinct line appears once, in first-seen order.
 *
 * <p>All file access uses UTF-8 explicitly so the non-ASCII sample data does
 * not depend on the platform default charset.
 */
public class FileDeduplication {

    public static void main(String[] args) {
        String filePath = "data.txt";
        // Create the sample input.
        writeTestData(filePath);
        // Read it back and write the de-duplicated copy.
        deduplicateFile(filePath);
    }

    /**
     * Writes six sample lines (two of them repeated) to the given path,
     * replacing any existing file. I/O errors are reported on stderr.
     *
     * @param filePath path of the file to create
     */
    private static void writeTestData(String filePath) {
        try (BufferedWriter writer = newUtf8Writer(filePath)) {
            writer.write("北京,100\n");
            writer.write("上海,200\n");
            writer.write("北京,100\n"); // duplicate
            writer.write("广州,300\n");
            writer.write("上海,200\n"); // duplicate
            writer.write("深圳,400\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads every line of the given file and writes the distinct lines, in
     * first-seen order, to {@code data_deduplicated.txt}.
     *
     * <p>If the input cannot be read, the error is reported and no output
     * file is written.
     *
     * @param filePath path of the file to read
     */
    private static void deduplicateFile(String filePath) {
        // LinkedHashSet drops repeated lines while keeping insertion order.
        Set<String> uniqueLines = new LinkedHashSet<>();

        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                uniqueLines.add(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
            // Stop here: writing an empty or partial result and then printing
            // the success message would be misleading.
            return;
        }

        String outputPath = "data_deduplicated.txt";
        try (BufferedWriter writer = newUtf8Writer(outputPath)) {
            for (String line : uniqueLines) {
                writer.write(line);
                writer.newLine();
            }
            System.out.println("去重完成,结果已保存到: " + outputPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Opens a buffered UTF-8 writer that truncates or creates the file at {@code path}. */
    private static BufferedWriter newUtf8Writer(String path) throws IOException {
        return new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(path), StandardCharsets.UTF_8));
    }
}
大数据量去重(HashSet、TreeSet与并行流的耗时对比)
import java.util.*;
/**
 * Times three ways of de-duplicating a large list of random integers:
 * {@link HashSet}, {@link TreeSet} and a parallel stream with {@code distinct()}.
 *
 * <p>Usage: {@code java LargeDataDeduplication [size]}. When {@code size} is
 * omitted, 1,000,000 elements are generated.
 */
public class LargeDataDeduplication {

    /** Number of elements generated when no size argument is given. */
    private static final int DEFAULT_SIZE = 1000000;

    /** Exclusive upper bound of the generated random values. */
    private static final int VALUE_BOUND = 100000;

    /**
     * Runs the three timings and prints the elapsed milliseconds and the
     * number of distinct values for each.
     *
     * @param args optional first element: the number of values to generate
     * @throws NumberFormatException if the size argument is not an integer
     */
    public static void main(String[] args) {
        // Simulated large input.
        List<Integer> largeList = args.length > 0
                ? generateLargeData(Integer.parseInt(args[0]))
                : generateLargeData();

        // Elapsed time is taken from System.nanoTime(), which is meant for
        // interval measurement, unlike the wall clock currentTimeMillis().

        // Approach 1: HashSet (unordered).
        long startTime = System.nanoTime();
        Set<Integer> hashSet = new HashSet<>(largeList);
        long elapsedMs = (System.nanoTime() - startTime) / 1000000;
        System.out.println("HashSet去重耗时: " + elapsedMs + "ms");
        System.out.println("去重后数量: " + hashSet.size());

        // Approach 2: TreeSet (sorted).
        startTime = System.nanoTime();
        Set<Integer> treeSet = new TreeSet<>(largeList);
        elapsedMs = (System.nanoTime() - startTime) / 1000000;
        System.out.println("TreeSet去重耗时: " + elapsedMs + "ms");
        System.out.println("去重后数量: " + treeSet.size());

        // Approach 3: parallel stream.
        startTime = System.nanoTime();
        List<Integer> parallelList = largeList.parallelStream()
                .distinct()
                .collect(ArrayList::new, ArrayList::add, ArrayList::addAll);
        elapsedMs = (System.nanoTime() - startTime) / 1000000;
        System.out.println("并行流去重耗时: " + elapsedMs + "ms");
        System.out.println("去重后数量: " + parallelList.size());
    }

    /** Generates the default-sized data set of {@link #DEFAULT_SIZE} values. */
    private static List<Integer> generateLargeData() {
        return generateLargeData(DEFAULT_SIZE);
    }

    /**
     * Generates {@code size} random integers in {@code [0, VALUE_BOUND)}.
     *
     * @param size number of values to generate; must not be negative
     * @return a new mutable list of the generated values
     * @throws IllegalArgumentException if {@code size} is negative
     */
    private static List<Integer> generateLargeData(int size) {
        List<Integer> list = new ArrayList<>(size);
        Random random = new Random();
        for (int i = 0; i < size; i++) {
            list.add(random.nextInt(VALUE_BOUND));
        }
        return list;
    }
}
/**
 * Compares the running time of three de-duplication approaches on the same
 * list of integers: {@link HashSet}, {@code Stream.distinct()} and a manual
 * loop that uses {@code List.contains}.
 *
 * <p>Usage: {@code java PerformanceComparison [size]}. When {@code size} is
 * omitted, 100000 elements are used.
 */
public class PerformanceComparison {

    /** Number of elements used when no size argument is given. */
    private static final int DEFAULT_SIZE = 100000;

    /**
     * Runs the three timings and prints one line per approach with the
     * elapsed milliseconds.
     *
     * @param args optional first element: the number of values to generate
     * @throws NumberFormatException if the size argument is not an integer
     */
    public static void main(String[] args) {
        int size = args.length > 0 ? Integer.parseInt(args[0]) : DEFAULT_SIZE;

        // Test data.
        List<Integer> testData = generateTestData(size);

        // 1. HashSet de-duplication (typically the fastest of the three).
        long start = System.nanoTime();
        Set<Integer> set = new HashSet<>(testData);
        System.out.println("HashSet: " + (System.nanoTime() - start) / 1_000_000 + "ms");

        // 2. Stream distinct.
        start = System.nanoTime();
        List<Integer> streamList = testData.stream().distinct().collect(Collectors.toList());
        System.out.println("Stream: " + (System.nanoTime() - start) / 1_000_000 + "ms");

        // 3. Manual loop. List.contains is a linear scan, so this approach is
        //    quadratic in the worst case; it is kept as the slow baseline.
        start = System.nanoTime();
        List<Integer> manualList = new ArrayList<>();
        for (Integer num : testData) {
            if (!manualList.contains(num)) {
                manualList.add(num);
            }
        }
        System.out.println("Manual: " + (System.nanoTime() - start) / 1_000_000 + "ms");
    }

    /**
     * Generates {@code size} pseudo-random integers that contain duplicates.
     * Values are drawn from {@code [0, max(1, size / 10))} with a fixed seed,
     * so repeated runs measure the same input.
     *
     * @param size number of values to generate; must not be negative
     * @return a new mutable list of the generated values
     * @throws IllegalArgumentException if {@code size} is negative
     */
    private static List<Integer> generateTestData(int size) {
        List<Integer> data = new ArrayList<>(size);
        Random random = new Random(42L);
        // nextInt requires a positive bound, hence the lower limit of 1.
        int bound = Math.max(1, size / 10);
        for (int i = 0; i < size; i++) {
            data.add(random.nextInt(bound));
        }
        return data;
    }
}
推荐使用场景:
- 简单数据去重:使用HashSet或Stream API
- 保持原始顺序:使用LinkedHashSet
- 按属性去重:使用Map或Stream的toMap
- 大数据量:使用HashSet或并行流
- 需要排序:使用TreeSet