如何用Java案例实现数据去重?

wen java案例 9

本文目录导读:

如何用Java案例实现数据去重?

  1. 使用Set集合去重(最简单)
  2. 对象去重(自定义equals和hashCode)
  3. 按对象属性去重(使用Map)
  4. 文件数据去重(实战案例)
  5. 大数据量去重(使用TreeSet排序去重)

下面给出几个用Java实现数据去重的常见案例:

使用Set集合去重(最简单)

import java.util.*;
import java.util.stream.Collectors;
/**
 * Shows three basic ways to remove duplicate strings from a list:
 * a {@code HashSet}, a {@code LinkedHashSet}, and {@code Stream.distinct()}.
 */
public class SetDeduplication {

    /** Prints the label immediately followed by the string form of the result. */
    private static void report(String label, Collection<String> result) {
        System.out.println(label + result);
    }

    public static void main(String[] args) {
        final List<String> letters = Arrays.asList("A", "B", "A", "C", "B", "D");

        // Approach 1: HashSet drops duplicates; iteration order is not tied to insertion order.
        final Set<String> hashed = new HashSet<>(letters);
        report("HashSet去重结果: ", hashed);

        // Approach 2: LinkedHashSet drops duplicates and iterates in insertion order.
        final Set<String> ordered = new LinkedHashSet<>(letters);
        report("LinkedHashSet去重结果: ", ordered);

        // Approach 3: Stream.distinct() keeps the first occurrence of each element.
        final List<String> distinctLetters = letters.stream()
            .distinct()
            .collect(Collectors.toList());
        report("Stream去重结果: ", distinctLetters);
    }
}

对象去重(自定义equals和hashCode)

import java.util.*;
import java.util.stream.Collectors;
/**
 * Value type identified by name and age, used to demonstrate object
 * deduplication through {@code equals}/{@code hashCode}.
 *
 * <p>Both fields are final: they feed {@link #hashCode()}, so they must not
 * change while an instance is stored in a hash-based collection.
 */
class Person {
    private final String name;
    private final int age;

    /**
     * @param name the person's name; {@code null} is accepted
     * @param age  the person's age
     */
    public Person(String name, int age) {
        this.name = name;
        this.age = age;
    }

    /**
     * Two persons are equal when they are of exactly the same class and have
     * the same name and age.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }
        Person person = (Person) o;
        return age == person.age && Objects.equals(name, person.name);
    }

    /** Hash derived from the same fields that {@link #equals(Object)} compares. */
    @Override
    public int hashCode() {
        return Objects.hash(name, age);
    }

    @Override
    public String toString() {
        return "Person{name='" + name + "', age=" + age + "}";
    }
}
/**
 * Shows deduplication of custom objects. Both approaches rely on the element
 * type's {@code equals}/{@code hashCode} to decide which elements are duplicates.
 */
public class ObjectDeduplication {
    public static void main(String[] args) {
        final List<Person> people = Arrays.asList(
            new Person("张三", 25),
            new Person("李四", 30),
            new Person("张三", 25),  // duplicate
            new Person("王五", 28),
            new Person("李四", 30)   // duplicate
        );

        // Approach 1: a LinkedHashSet drops duplicates and keeps insertion order.
        System.out.println("对象去重结果:");
        for (Person person : new LinkedHashSet<>(people)) {
            System.out.println(person);
        }

        // Approach 2: Stream.distinct(), printed in encounter order.
        System.out.println("\nStream去重结果:");
        people.stream()
            .distinct()
            .forEachOrdered(System.out::println);
    }
}

按对象属性去重(使用Map)

import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
 * Read-only student record (ID, name, grade) used to demonstrate
 * deduplication by a single property.
 *
 * <p>The fields are final because nothing reassigns them after construction.
 * {@code equals}/{@code hashCode} are not overridden, so two instances with
 * the same ID are still distinct objects.
 */
class Student {
    private final String id;
    private final String name;
    private final String grade;

    /**
     * @param id    the student ID
     * @param name  the student's name
     * @param grade the student's grade
     */
    public Student(String id, String name, String grade) {
        this.id = id;
        this.name = name;
        this.grade = grade;
    }

    /** @return the student ID */
    public String getId() {
        return id;
    }

    /** @return the student's name */
    public String getName() {
        return name;
    }

    /** @return the student's grade */
    public String getGrade() {
        return grade;
    }

    @Override
    public String toString() {
        return "Student{id='" + id + "', name='" + name + "', grade='" + grade + "'}";
    }
}
/**
 * Demonstrates deduplicating {@code Student} objects by one property (the ID),
 * keeping the first student seen for each ID and the original encounter order.
 */
public class PropertyDeduplication {
    public static void main(String[] args) {
        List<Student> students = Arrays.asList(
            new Student("001", "张三", "A"),
            new Student("002", "李四", "B"),
            new Student("001", "张三", "A"),  // duplicate ID
            new Student("003", "王五", "A"),
            new Student("002", "李四", "B")   // duplicate ID
        );

        // Approach 1: key a map by ID. The merge function keeps the first student
        // seen for an ID. A LinkedHashMap is requested explicitly because the
        // three-argument toMap makes no guarantee about the returned map's type,
        // and therefore none about the order in which values() iterates.
        Map<String, Student> uniqueMap = students.stream()
            .collect(Collectors.toMap(
                Student::getId,
                Function.identity(),
                (existing, replacement) -> existing,
                LinkedHashMap::new
            ));
        System.out.println("按ID去重结果:");
        uniqueMap.values().forEach(System.out::println);

        // Approach 2: one pass with a HashSet of IDs already seen. Set.add returns
        // false for an ID that is already present, so only the first student per
        // ID is appended to the result list.
        List<Student> uniqueList = new ArrayList<>();
        Set<String> idSet = new HashSet<>();
        for (Student student : students) {
            if (idSet.add(student.getId())) {
                uniqueList.add(student);
            }
        }
        System.out.println("\n使用Set辅助去重结果:");
        uniqueList.forEach(System.out::println);
    }
}

文件数据去重(实战案例)

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
/**
 * Writes a small test file containing duplicate lines, then copies its unique
 * lines (first occurrence kept, original order preserved) to a second file.
 *
 * <p>All file access uses UTF-8 explicitly, so the Chinese test data does not
 * depend on the platform default charset.
 */
public class FileDeduplication {
    /** File that receives the deduplicated lines. */
    private static final String OUTPUT_PATH = "data_deduplicated.txt";

    public static void main(String[] args) {
        String filePath = "data.txt";
        try {
            // Write the test data, then read it back and deduplicate it.
            writeTestData(filePath);
            deduplicateFile(filePath);
        } catch (IOException e) {
            // Stop at the first failure instead of carrying on with partial data.
            System.err.println("文件去重失败: " + e);
        }
    }

    /**
     * Creates (or truncates) the file at {@code filePath} and writes six test
     * lines, two of which are repeated.
     *
     * @throws IOException if the file cannot be written
     */
    private static void writeTestData(String filePath) throws IOException {
        try (BufferedWriter writer =
                 Files.newBufferedWriter(Paths.get(filePath), StandardCharsets.UTF_8)) {
            writer.write("北京,100\n");
            writer.write("上海,200\n");
            writer.write("北京,100\n");  // duplicate
            writer.write("广州,300\n");
            writer.write("上海,200\n");  // duplicate
            writer.write("深圳,400\n");
        }
    }

    /**
     * Reads every line of {@code filePath}, drops repeated lines, and writes the
     * remaining lines to {@link #OUTPUT_PATH} in their original order.
     *
     * <p>All unique lines are held in memory before anything is written.
     *
     * @throws IOException if reading the input or writing the output fails; the
     *                     output file is not touched when the read fails
     */
    private static void deduplicateFile(String filePath) throws IOException {
        // LinkedHashSet ignores repeated lines and remembers first-seen order.
        Set<String> uniqueLines = new LinkedHashSet<>();
        try (BufferedReader reader =
                 Files.newBufferedReader(Paths.get(filePath), StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                uniqueLines.add(line);
            }
        }

        try (BufferedWriter writer =
                 Files.newBufferedWriter(Paths.get(OUTPUT_PATH), StandardCharsets.UTF_8)) {
            for (String line : uniqueLines) {
                writer.write(line);
                writer.newLine();
            }
        }
        System.out.println("去重完成,结果已保存到: " + OUTPUT_PATH);
    }
}

大数据量去重(使用TreeSet排序去重)

import java.util.*;
/**
 * Times three ways of deduplicating a large list of random integers:
 * {@code HashSet}, {@code TreeSet}, and a parallel stream with {@code distinct()}.
 *
 * <p>Elapsed time is taken from {@link System#nanoTime()}, which is intended
 * for measuring intervals, rather than from the wall clock.
 */
public class LargeDataDeduplication {
    /** Number of random values generated for the comparison. */
    private static final int DATA_SIZE = 1_000_000;
    /** Exclusive upper bound of the generated values; smaller than DATA_SIZE so duplicates occur. */
    private static final int VALUE_BOUND = 100_000;
    private static final long NANOS_PER_MILLI = 1_000_000L;

    public static void main(String[] args) {
        // Simulate a large data set.
        List<Integer> largeList = generateLargeData();

        // Approach 1: HashSet (no ordering).
        long startTime = System.nanoTime();
        Set<Integer> hashSet = new HashSet<>(largeList);
        long elapsedMs = (System.nanoTime() - startTime) / NANOS_PER_MILLI;
        System.out.println("HashSet去重耗时: " + elapsedMs + "ms");
        System.out.println("去重后数量: " + hashSet.size());

        // Approach 2: TreeSet (result is sorted).
        startTime = System.nanoTime();
        Set<Integer> treeSet = new TreeSet<>(largeList);
        elapsedMs = (System.nanoTime() - startTime) / NANOS_PER_MILLI;
        System.out.println("TreeSet去重耗时: " + elapsedMs + "ms");
        System.out.println("去重后数量: " + treeSet.size());

        // Approach 3: parallel stream with distinct(), collected into an ArrayList.
        startTime = System.nanoTime();
        List<Integer> parallelList = largeList.parallelStream()
            .distinct()
            .collect(ArrayList::new, ArrayList::add, ArrayList::addAll);
        elapsedMs = (System.nanoTime() - startTime) / NANOS_PER_MILLI;
        System.out.println("并行流去重耗时: " + elapsedMs + "ms");
        System.out.println("去重后数量: " + parallelList.size());
    }

    /**
     * Builds a list of {@link #DATA_SIZE} random integers in
     * {@code [0, VALUE_BOUND)}.
     */
    private static List<Integer> generateLargeData() {
        // Presized so the list does not regrow repeatedly while filling.
        List<Integer> list = new ArrayList<>(DATA_SIZE);
        Random random = new Random();
        for (int i = 0; i < DATA_SIZE; i++) {
            list.add(random.nextInt(VALUE_BOUND));
        }
        return list;
    }
}
/**
 * Compares the running time of three deduplication approaches on the same
 * random data: {@code HashSet}, {@code Stream.distinct()}, and a manual loop
 * based on {@code List.contains}.
 */
public class PerformanceComparison {
    public static void main(String[] args) {
        // Test data shared by all three measurements.
        List<Integer> testData = generateTestData(100000);

        // 1. HashSet.
        long start = System.nanoTime();
        Set<Integer> set = new HashSet<>(testData);
        System.out.println("HashSet: " + (System.nanoTime() - start) / 1_000_000 + "ms");

        // 2. Stream distinct.
        start = System.nanoTime();
        List<Integer> streamList = testData.stream().distinct().collect(Collectors.toList());
        System.out.println("Stream: " + (System.nanoTime() - start) / 1_000_000 + "ms");

        // 3. Manual loop. List.contains scans the result list for every element,
        //    so this is quadratic on purpose: it is the slow baseline.
        start = System.nanoTime();
        List<Integer> manualList = new ArrayList<>();
        for (Integer num : testData) {
            if (!manualList.contains(num)) {
                manualList.add(num);
            }
        }
        System.out.println("Manual: " + (System.nanoTime() - start) / 1_000_000 + "ms");
    }

    /**
     * Builds {@code size} random non-negative integers. Values are drawn from
     * {@code [0, max(1, size / 10))} so that duplicates are frequent.
     * Package-private so it can be exercised directly.
     *
     * @param size number of values to generate; must not be negative
     * @return a new mutable list holding {@code size} values
     * @throws IllegalArgumentException if {@code size} is negative
     */
    static List<Integer> generateTestData(int size) {
        if (size < 0) {
            throw new IllegalArgumentException("size must not be negative: " + size);
        }
        // Random.nextInt(bound) requires a positive bound, hence the floor of 1.
        int bound = Math.max(1, size / 10);
        List<Integer> data = new ArrayList<>(size);
        Random random = new Random();
        for (int i = 0; i < size; i++) {
            data.add(random.nextInt(bound));
        }
        return data;
    }
}

推荐使用场景:

  • 简单数据去重:使用HashSet或Stream API
  • 保持原始顺序:使用LinkedHashSet
  • 按属性去重:使用Collectors.toMap(以该属性作为key),或用Set记录已出现的属性值来辅助判断
  • 大数据量:使用HashSet或并行流
  • 需要排序:使用TreeSet

抱歉,评论功能暂时关闭!