如何用Java案例实现数据去重？

wen java案例 2026-06-18 9

本文目录导读：

如何用Java案例实现数据去重？

使用Set集合去重（最简单）
对象去重（自定义equals和hashCode）
按对象属性去重（使用Map）
文件数据去重（实战案例）
大数据量去重（使用TreeSet排序去重）

下面给出几个用Java实现数据去重的常见案例，每个案例都是可以独立编译运行的完整示例：

使用Set集合去重（最简单）

import java.util.*;
import java.util.stream.Collectors;
/**
 * Demonstrates three ways to remove duplicate strings from a list:
 * a HashSet, a LinkedHashSet and Stream.distinct().
 */
public class SetDeduplication {
    /**
     * Runs the three de-duplication approaches on a fixed sample list and
     * prints each result to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        final List<String> source = Arrays.asList("A", "B", "A", "C", "B", "D");

        // Approach 1: a HashSet drops duplicates; it makes no ordering promise.
        Set<String> hashed = new HashSet<>();
        hashed.addAll(source);
        report("HashSet去重结果: ", hashed);

        // Approach 2: a LinkedHashSet drops duplicates and keeps insertion order.
        Set<String> ordered = new LinkedHashSet<>();
        ordered.addAll(source);
        report("LinkedHashSet去重结果: ", ordered);

        // Approach 3: Stream.distinct() collected back into a list.
        List<String> streamed = source.stream().distinct().collect(Collectors.toList());
        report("Stream去重结果: ", streamed);
    }

    /** Prints a label immediately followed by the collection's string form. */
    private static void report(String label, Collection<String> values) {
        System.out.println(label + values);
    }
}

对象去重（自定义equals和hashCode）

import java.util.*;
import java.util.stream.Collectors;
class Person {
    private String name;
    private int age;
    public Person(String name, int age) {
        this.name = name;
        this.age = age;
    }
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;
        Person person = (Person) o;
        return age == person.age && Objects.equals(name, person.name);
    }
    @Override
    public int hashCode() {
        return Objects.hash(name, age);
    }
    @Override
    public String toString() {
        return "Person{name='" + name + "', age=" + age + "}";
    }
}
/**
 * Demonstrates de-duplicating custom objects. Both approaches rely on
 * Person overriding equals() and hashCode().
 */
public class ObjectDeduplication {
    /**
     * De-duplicates a fixed sample list of persons with a LinkedHashSet and
     * with Stream.distinct(), printing both results to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        Person[] sample = {
            new Person("张三", 25),
            new Person("李四", 30),
            new Person("张三", 25),  // duplicate
            new Person("王五", 28),
            new Person("李四", 30)   // duplicate
        };
        List<Person> persons = Arrays.asList(sample);

        // Set-based de-duplication; LinkedHashSet keeps first-seen order.
        Set<Person> viaSet = new LinkedHashSet<>(persons);
        System.out.println("对象去重结果:");
        for (Person person : viaSet) {
            System.out.println(person);
        }

        // Stream-based de-duplication.
        List<Person> viaStream = persons.stream().distinct().collect(Collectors.toList());
        System.out.println("\nStream去重结果:");
        for (Person person : viaStream) {
            System.out.println(person);
        }
    }
}

按对象属性去重（使用Map）

import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
 * Plain data holder for a student. It does not override equals()/hashCode(),
 * so two instances with the same field values are still distinct objects.
 */
class Student {
    private String id;
    private String name;
    private String grade;

    /**
     * Creates a student.
     *
     * @param id    the student ID
     * @param name  the student's name
     * @param grade the student's grade
     */
    public Student(String id, String name, String grade) {
        this.grade = grade;
        this.name = name;
        this.id = id;
    }

    /** @return the student ID */
    public String getId() {
        return id;
    }

    /** @return the student's name */
    public String getName() {
        return name;
    }

    /** @return the student's grade */
    public String getGrade() {
        return grade;
    }

    /** Returns a readable form listing id, name and grade. */
    @Override
    public String toString() {
        return String.format("Student{id='%s', name='%s', grade='%s'}", id, name, grade);
    }
}
/**
 * Demonstrates de-duplicating objects by a single property (the student ID),
 * keeping the first occurrence of each ID in its original position.
 */
public class PropertyDeduplication {
    /**
     * De-duplicates a fixed sample list of students by ID in two ways and
     * prints both results to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        List<Student> students = Arrays.asList(
            new Student("001", "张三", "A"),
            new Student("002", "李四", "B"),
            new Student("001", "张三", "A"),  // duplicate ID
            new Student("003", "王五", "A"),
            new Student("002", "李四", "B")   // duplicate ID
        );

        // Approach 1: collect into a map keyed by ID. The merge function keeps
        // the first student seen for an ID. LinkedHashMap is supplied explicitly
        // because the three-argument toMap gives no guarantee about the map's
        // iteration order, and the result should follow the input order.
        Map<String, Student> uniqueMap = students.stream()
            .collect(Collectors.toMap(
                Student::getId,
                Function.identity(),
                (existing, replacement) -> existing,
                LinkedHashMap::new
            ));
        System.out.println("按ID去重结果:");
        uniqueMap.values().forEach(System.out::println);

        // Approach 2: plain loop with a helper Set of IDs already seen.
        // Set.add returns false when the ID is already present, so only the
        // first student for each ID is appended to the result list.
        List<Student> uniqueList = new ArrayList<>();
        Set<String> idSet = new HashSet<>();
        for (Student student : students) {
            if (idSet.add(student.getId())) {
                uniqueList.add(student);
            }
        }
        System.out.println("\n使用Set辅助去重结果:");
        uniqueList.forEach(System.out::println);
    }
}

文件数据去重（实战案例）

import java.io.*;
import java.util.*;
/**
 * Demonstrates line-level de-duplication of a text file: writes a small
 * sample file, reads it back, and writes the distinct lines (in first-seen
 * order) to a second file.
 */
public class FileDeduplication {
    /**
     * Charset used for every read and write. It is fixed explicitly so the
     * non-ASCII sample data does not depend on the platform default charset.
     */
    private static final java.nio.charset.Charset CHARSET =
        java.nio.charset.StandardCharsets.UTF_8;

    /**
     * Writes the sample file "data.txt" and de-duplicates it into
     * "data_deduplicated.txt".
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String filePath = "data.txt";
        // Create the sample input.
        writeTestData(filePath);
        // Read it back and write the distinct lines.
        deduplicateFile(filePath);
    }

    /**
     * Writes six sample lines, two of which are duplicates, to the given file.
     * An I/O failure is reported via its stack trace.
     *
     * @param filePath path of the file to create or overwrite
     */
    private static void writeTestData(String filePath) {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(filePath), CHARSET))) {
            writer.write("北京,100\n");
            writer.write("上海,200\n");
            writer.write("北京,100\n");  // duplicate
            writer.write("广州,300\n");
            writer.write("上海,200\n");  // duplicate
            writer.write("深圳,400\n");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads every line of the given file, drops repeated lines while keeping
     * first-seen order, and writes the result to "data_deduplicated.txt".
     * If the input cannot be read, nothing is written.
     *
     * @param filePath path of the file to de-duplicate
     */
    private static void deduplicateFile(String filePath) {
        // LinkedHashSet rejects repeated lines and remembers insertion order.
        Set<String> uniqueLines = new LinkedHashSet<>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(filePath), CHARSET))) {
            String line;
            while ((line = reader.readLine()) != null) {
                uniqueLines.add(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
            // Stop here: writing an empty or partial result and then printing
            // the success message would misreport a failed run.
            return;
        }

        String outputPath = "data_deduplicated.txt";
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(outputPath), CHARSET))) {
            for (String line : uniqueLines) {
                writer.write(line);
                writer.newLine();
            }
            System.out.println("去重完成，结果已保存到: " + outputPath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

大数据量去重（使用TreeSet排序去重）

import java.util.*;
/**
 * Rough timing comparison of three ways to de-duplicate a large list of
 * integers: HashSet, TreeSet and a parallel stream. The numbers come from a
 * single un-warmed run and are only indicative.
 */
public class LargeDataDeduplication {
    /** Number of random integers generated for the demo. */
    private static final int DATA_SIZE = 1_000_000;
    /** Exclusive upper bound of the generated values; limits how many distinct values exist. */
    private static final int VALUE_BOUND = 100_000;

    /**
     * Generates the sample data, runs each approach once and prints its
     * elapsed time and the number of distinct values found.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        List<Integer> largeList = generateLargeData();

        // Approach 1: HashSet (no ordering).
        // Elapsed time uses System.nanoTime(), which is meant for measuring
        // intervals; currentTimeMillis() follows the wall clock and can jump.
        long start = System.nanoTime();
        Set<Integer> hashSet = new HashSet<>(largeList);
        System.out.println("HashSet去重耗时: " + elapsedMillis(start) + "ms");
        System.out.println("去重后数量: " + hashSet.size());

        // Approach 2: TreeSet (result is sorted).
        start = System.nanoTime();
        Set<Integer> treeSet = new TreeSet<>(largeList);
        System.out.println("TreeSet去重耗时: " + elapsedMillis(start) + "ms");
        System.out.println("去重后数量: " + treeSet.size());

        // Approach 3: parallel stream with distinct().
        start = System.nanoTime();
        List<Integer> parallelList = largeList.parallelStream()
            .distinct()
            .collect(ArrayList::new, ArrayList::add, ArrayList::addAll);
        System.out.println("并行流去重耗时: " + elapsedMillis(start) + "ms");
        System.out.println("去重后数量: " + parallelList.size());
    }

    /** Returns the whole milliseconds elapsed since the given System.nanoTime() reading. */
    private static long elapsedMillis(long startNanos) {
        return (System.nanoTime() - startNanos) / 1_000_000;
    }

    /**
     * Builds a list of DATA_SIZE random integers in [0, VALUE_BOUND).
     *
     * @return a new mutable list of random values
     */
    private static List<Integer> generateLargeData() {
        // Presized so the backing array is not repeatedly grown while filling.
        List<Integer> list = new ArrayList<>(DATA_SIZE);
        Random random = new Random();
        for (int i = 0; i < DATA_SIZE; i++) {
            list.add(random.nextInt(VALUE_BOUND));
        }
        return list;
    }
}

/**
 * Rough timing comparison of three de-duplication approaches on the same
 * input: HashSet, Stream.distinct() and a manual loop using List.contains().
 * The numbers come from a single un-warmed run and are only indicative.
 */
public class PerformanceComparison {
    /** Average number of times each distinct value is repeated in the generated data. */
    private static final int DUPLICATES_PER_VALUE = 100;

    /**
     * Generates the test data, runs each approach once and prints its
     * elapsed time in milliseconds.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        List<Integer> testData = generateTestData(100000);

        // 1. HashSet de-duplication.
        long start = System.nanoTime();
        Set<Integer> set = new HashSet<>(testData);
        System.out.println("HashSet: " + (System.nanoTime() - start) / 1_000_000 + "ms");

        // 2. Stream distinct. Collectors is fully qualified because this
        // snippet only has java.util.* in scope.
        start = System.nanoTime();
        List<Integer> streamList = testData.stream()
            .distinct()
            .collect(java.util.stream.Collectors.toList());
        System.out.println("Stream: " + (System.nanoTime() - start) / 1_000_000 + "ms");

        // 3. Manual loop. List.contains() is a linear scan, so this is the
        // deliberately slow baseline of the comparison.
        start = System.nanoTime();
        List<Integer> manualList = new ArrayList<>();
        for (Integer num : testData) {
            if (!manualList.contains(num)) {
                manualList.add(num);
            }
        }
        System.out.println("Manual: " + (System.nanoTime() - start) / 1_000_000 + "ms");
    }

    /**
     * Builds a list of random integers containing many duplicates.
     *
     * @param size number of elements to generate; zero or negative yields an empty list
     * @return a new mutable list of {@code size} values drawn from
     *         [0, max(1, size / DUPLICATES_PER_VALUE))
     */
    private static List<Integer> generateTestData(int size) {
        if (size <= 0) {
            return new ArrayList<>();
        }
        // Keep the bound at least 1: Random.nextInt rejects a non-positive bound.
        int bound = Math.max(1, size / DUPLICATES_PER_VALUE);
        Random random = new Random();
        List<Integer> data = new ArrayList<>(size);
        for (int i = 0; i < size; i++) {
            data.add(random.nextInt(bound));
        }
        return data;
    }
}

推荐使用场景：

简单数据去重：使用HashSet或Stream API
保持原始顺序：使用LinkedHashSet
按属性去重：使用Stream的Collectors.toMap（需要保持原始顺序时指定LinkedHashMap），或用HashSet记录已出现的属性值配合循环去重
大数据量：使用HashSet或并行流
需要排序：使用TreeSet