如何用Java案例实现文本相似度?

wen java案例 3

本文目录导读:

如何用Java案例实现文本相似度?

  1. 余弦相似度算法
  2. 编辑距离(Levenshtein距离)算法
  3. Jaccard相似系数
  4. 综合相似度计算器
  5. 使用说明
  6. 实际应用建议

我来为您提供几个Java实现文本相似度计算的案例,涵盖不同算法:

余弦相似度算法

实现代码

import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
public class CosineSimilarity {
    /** Runs of ASCII punctuation and/or whitespace; compiled once instead of on every call. */
    private static final Pattern SEPARATORS = Pattern.compile("[\\p{Punct}\\s]+");

    /**
     * Computes the cosine similarity of the word-frequency vectors of two texts.
     *
     * <p>Texts are lower-cased and split on ASCII punctuation and whitespace. If either
     * text contains no words at all (empty, blank or punctuation only) the result is 0.
     *
     * @param text1 first text, must not be {@code null}
     * @param text2 second text, must not be {@code null}
     * @return a value in {@code [0, 1]}; 1 means identical word-frequency direction
     */
    public static double calculate(String text1, String text2) {
        // Build one term-frequency vector per text.
        Map<String, Integer> freq1 = getWordFrequency(tokenize(text1));
        Map<String, Integer> freq2 = getWordFrequency(tokenize(text2));

        // The vocabulary is the union of the words seen in either text.
        Set<String> allWords = new HashSet<>(freq1.keySet());
        allWords.addAll(freq2.keySet());

        double dotProduct = 0;
        double norm1 = 0;
        double norm2 = 0;
        for (String word : allWords) {
            int count1 = freq1.getOrDefault(word, 0);
            int count2 = freq2.getOrDefault(word, 0);
            // Widen before multiplying so very large counts cannot overflow int.
            dotProduct += (double) count1 * count2;
            norm1 += (double) count1 * count1;
            norm2 += (double) count2 * count2;
        }

        // A text without any word has a zero vector; avoid dividing by zero.
        if (norm1 == 0 || norm2 == 0) {
            return 0;
        }
        // sqrt(a * b) is exact when both vectors are equal, unlike sqrt(a) * sqrt(b);
        // the clamp absorbs any remaining rounding above 1.
        return Math.min(1.0, dotProduct / Math.sqrt(norm1 * norm2));
    }

    /**
     * Splits a text into lower-cased words on ASCII punctuation and whitespace.
     * This is a deliberately simple tokenizer; real applications can use a
     * segmentation library such as HanLP.
     *
     * @param text the text to split
     * @return the words in order of appearance; empty when the text contains no word
     */
    private static List<String> tokenize(String text) {
        String cleaned = SEPARATORS.matcher(text.toLowerCase(Locale.ROOT))
                .replaceAll(" ")
                .trim();
        // "".split(...) would yield a single empty token, which must not count as a word.
        if (cleaned.isEmpty()) {
            return Collections.emptyList();
        }
        // Every separator run has been collapsed to exactly one space above.
        return Arrays.asList(cleaned.split(" "));
    }

    /**
     * Counts how often each word occurs.
     *
     * @param words the words to count
     * @return a map from word to its number of occurrences
     */
    private static Map<String, Integer> getWordFrequency(List<String> words) {
        Map<String, Integer> freq = new HashMap<>();
        for (String word : words) {
            freq.merge(word, 1, Integer::sum);
        }
        return freq;
    }

    /** Prints the similarity of a few sample sentences. */
    public static void main(String[] args) {
        // Sample inputs
        String text1 = "Java is a programming language";
        String text2 = "Java programming language is popular";
        String text3 = "Python is another programming language";
        System.out.println("文本1: " + text1);
        System.out.println("文本2: " + text2);
        System.out.println("文本3: " + text3);
        System.out.println();
        System.out.println("文本1 vs 文本2 余弦相似度: " + 
                String.format("%.2f", calculate(text1, text2)));
        System.out.println("文本1 vs 文本3 余弦相似度: " + 
                String.format("%.2f", calculate(text1, text3)));
    }
}

编辑距离(Levenshtein距离)算法

public class LevenshteinDistance {
    /**
     * Computes the Levenshtein (edit) distance between two strings: the minimum
     * number of single-character insertions, deletions and substitutions needed
     * to turn one into the other.
     *
     * <p>Characters are compared as Unicode code points, so a supplementary
     * character such as an emoji counts as one character rather than two
     * UTF-16 code units.
     *
     * @param s1 first string, must not be {@code null}
     * @param s2 second string, must not be {@code null}
     * @return the edit distance, 0 when the strings are equal
     */
    public static int calculate(String s1, String s2) {
        int[] longer = s1.codePoints().toArray();
        int[] shorter = s2.codePoints().toArray();
        // The distance is symmetric; indexing rows by the shorter input keeps them small.
        if (longer.length < shorter.length) {
            int[] tmp = longer;
            longer = shorter;
            shorter = tmp;
        }

        // Only the previous and the current row of the DP table are ever read.
        int[] prev = new int[shorter.length + 1];
        int[] curr = new int[shorter.length + 1];
        for (int j = 0; j <= shorter.length; j++) {
            prev[j] = j; // distance from the empty prefix: j insertions
        }

        for (int i = 1; i <= longer.length; i++) {
            curr[0] = i; // distance to the empty prefix: i deletions
            for (int j = 1; j <= shorter.length; j++) {
                int substitution = prev[j - 1] + (longer[i - 1] == shorter[j - 1] ? 0 : 1);
                int deletion = prev[j] + 1;
                int insertion = curr[j - 1] + 1;
                curr[j] = Math.min(substitution, Math.min(deletion, insertion));
            }
            int[] tmp = prev;
            prev = curr;
            curr = tmp;
        }
        // After the final swap the last computed row is in prev.
        return prev[shorter.length];
    }

    /**
     * Normalises the edit distance to a similarity score.
     *
     * @param s1 first string, must not be {@code null}
     * @param s2 second string, must not be {@code null}
     * @return {@code 1 - distance / maxLength} in {@code [0, 1]}, where lengths are
     *         counted in code points; 1.0 when both strings are empty
     */
    public static double similarity(String s1, String s2) {
        int maxLen = Math.max(
                s1.codePointCount(0, s1.length()),
                s2.codePointCount(0, s2.length()));
        if (maxLen == 0) {
            return 1.0; // two empty strings are identical
        }
        // When exactly one string is empty the distance equals maxLen, giving 0.0.
        return 1.0 - (double) calculate(s1, s2) / maxLen;
    }

    /** Prints distance and similarity for a few sample words. */
    public static void main(String[] args) {
        String s1 = "kitten";
        String s2 = "sitting";
        String s3 = "kitchen";
        System.out.println("字符串1: " + s1);
        System.out.println("字符串2: " + s2);
        System.out.println("字符串3: " + s3);
        System.out.println();
        System.out.println("编辑距离 (" + s1 + " vs " + s2 + "): " + 
                calculate(s1, s2));
        System.out.println("相似度: " + 
                String.format("%.2f", similarity(s1, s2)));
        System.out.println();
        System.out.println("编辑距离 (" + s1 + " vs " + s3 + "): " + 
                calculate(s1, s3));
        System.out.println("相似度: " + 
                String.format("%.2f", similarity(s1, s3)));
    }
}

Jaccard相似系数

import java.util.*;
public class JaccardSimilarity {
    /**
     * Computes the Jaccard similarity of the word sets of two texts: the size of
     * the intersection divided by the size of the union.
     *
     * <p>Texts are lower-cased and split on whitespace; punctuation is kept as
     * part of the words.
     *
     * @param text1 first text, must not be {@code null}
     * @param text2 second text, must not be {@code null}
     * @return a value in {@code [0, 1]}; 0 when neither text contains a word
     */
    public static double calculate(String text1, String text2) {
        return jaccard(new HashSet<>(tokenize(text1)), new HashSet<>(tokenize(text2)));
    }

    /**
     * Computes the Jaccard similarity of the character n-gram sets of two texts.
     * Whitespace is removed and the text lower-cased before the n-grams are taken,
     * so the score reflects local character order rather than whole words.
     *
     * @param text1 first text, must not be {@code null}
     * @param text2 second text, must not be {@code null}
     * @param n     n-gram length in characters, at least 1
     * @return a value in {@code [0, 1]}; 0 when both texts are shorter than {@code n}
     * @throws IllegalArgumentException if {@code n} is less than 1
     */
    public static double calculateNGram(String text1, String text2, int n) {
        // n == 0 would make every text look identical and n < 0 would fail inside substring.
        if (n < 1) {
            throw new IllegalArgumentException("n must be >= 1, got " + n);
        }
        return jaccard(getNGrams(text1, n), getNGrams(text2, n));
    }

    /** Returns |a ∩ b| / |a ∪ b|, or 0 when both sets are empty. */
    private static double jaccard(Set<String> a, Set<String> b) {
        Set<String> union = new HashSet<>(a);
        union.addAll(b);
        // Guard against division by zero.
        if (union.isEmpty()) {
            return 0;
        }
        Set<String> intersection = new HashSet<>(a);
        intersection.retainAll(b);
        return (double) intersection.size() / union.size();
    }

    /**
     * Collects the distinct character n-grams of a text after removing all
     * whitespace and lower-casing it.
     *
     * @param text the text to slice
     * @param n    n-gram length, expected to be at least 1
     * @return the distinct n-grams; empty when the cleaned text is shorter than {@code n}
     */
    private static Set<String> getNGrams(String text, int n) {
        Set<String> ngrams = new HashSet<>();
        String cleaned = text.replaceAll("\\s+", "").toLowerCase(Locale.ROOT);
        for (int i = 0; i <= cleaned.length() - n; i++) {
            ngrams.add(cleaned.substring(i, i + n));
        }
        return ngrams;
    }

    /**
     * Splits a text into lower-cased, whitespace-separated words.
     *
     * @param text the text to split
     * @return the words in order of appearance; empty for empty or blank text
     */
    private static List<String> tokenize(String text) {
        // Without the trim, split() yields an empty first token for text that starts
        // with whitespace, and a single empty token for empty text.
        String trimmed = text.trim().toLowerCase(Locale.ROOT);
        if (trimmed.isEmpty()) {
            return Collections.emptyList();
        }
        return Arrays.asList(trimmed.split("\\s+"));
    }

    /** Prints word-level and 2-gram similarities for a few sample sentences. */
    public static void main(String[] args) {
        String text1 = "the cat sat on the mat";
        String text2 = "the cat sat on a mat";
        String text3 = "the dog sat on the log";
        System.out.println("文本1: " + text1);
        System.out.println("文本2: " + text2);
        System.out.println("文本3: " + text3);
        System.out.println();
        System.out.println("Jaccard相似度:");
        System.out.println("文本1 vs 文本2: " + 
                String.format("%.2f", calculate(text1, text2)));
        System.out.println("文本1 vs 文本3: " + 
                String.format("%.2f", calculate(text1, text3)));
        System.out.println();
        System.out.println("2-gram Jaccard相似度:");
        System.out.println("文本1 vs 文本2: " + 
                String.format("%.2f", calculateNGram(text1, text2, 2)));
        System.out.println("文本1 vs 文本3: " + 
                String.format("%.2f", calculateNGram(text1, text3, 2)));
    }
}

综合相似度计算器

import java.util.*;
import java.util.stream.Collectors;
public class TextSimilarityAnalyzer {
    /**
     * Names of the similarity measures combined by {@link #analyze}.
     * The enum is not referenced anywhere else in this class.
     */
    public enum Algorithm {
        COSINE, LEVENSHTEIN, JACCARD, JACCARD_NGRAM
    }

    /**
     * Runs all four similarity measures on a pair of texts and combines them
     * into a weighted composite score.
     *
     * @param text1 first text
     * @param text2 second text
     * @return the individual scores together with the composite score
     */
    public static SimilarityResult analyze(String text1, String text2) {
        double cosine = CosineSimilarity.calculate(text1, text2);
        double levenshtein = LevenshteinDistance.similarity(text1, text2);
        double jaccard = JaccardSimilarity.calculate(text1, text2);
        double bigramJaccard = JaccardSimilarity.calculateNGram(text1, text2, 2);

        SimilarityResult outcome = new SimilarityResult();
        outcome.cosineSimilarity = cosine;
        outcome.levenshteinSimilarity = levenshtein;
        outcome.jaccardSimilarity = jaccard;
        outcome.jaccardNGramSimilarity = bigramJaccard;
        // Weighted blend of the four scores; the weights sum to 1 and can be tuned.
        outcome.compositeScore = (cosine * 0.3
                + levenshtein * 0.2
                + jaccard * 0.3
                + bigramJaccard * 0.2);
        return outcome;
    }

    /**
     * Plain holder for the scores produced by {@link #analyze}.
     */
    public static class SimilarityResult {
        public double cosineSimilarity;
        public double levenshteinSimilarity;
        public double jaccardSimilarity;
        public double jaccardNGramSimilarity;
        public double compositeScore;

        /** Renders all five scores, one per line, with two decimal places. */
        @Override
        public String toString() {
            String template = String.join("\n",
                "相似度分析结果:",
                "  余弦相似度: %.2f",
                "  编辑距离相似度: %.2f",
                "  Jaccard相似度: %.2f",
                "  2-gram Jaccard相似度: %.2f",
                "  综合评分: %.2f");
            return String.format(template,
                cosineSimilarity, levenshteinSimilarity,
                jaccardSimilarity, jaccardNGramSimilarity,
                compositeScore);
        }
    }

    /** Analyses a fixed list of sample text pairs and prints each report. */
    public static void main(String[] args) {
        // Sample text pairs: { text A, text B }
        String[][] testCases = {
            {"Java is a programming language", "Java programming language is popular"},
            {"The quick brown fox", "The quick brown fox jumps over the lazy dog"},
            {"Hello World", "Hi World"},
            {"完全相同的文本", "完全相同的文本"},
            {"毫无关系的文本", "完全不同的话题"}
        };
        int caseNumber = 0;
        for (String[] pair : testCases) {
            caseNumber++;
            String textA = pair[0];
            String textB = pair[1];
            System.out.println("=== 测试案例 " + caseNumber + " ===");
            System.out.println("文本A: " + textA);
            System.out.println("文本B: " + textB);
            System.out.println();
            System.out.println(analyze(textA, textB));
            System.out.println();
        }
    }
}

使用说明

  1. 余弦相似度:适用于长文本比较,考虑词频
  2. 编辑距离:适用于短文本或字符串的近似(模糊)匹配,例如拼写纠错
  3. Jaccard相似度:适用于关键词匹配,考虑词集
  4. N-gram Jaccard:基于字符N-gram,能反映局部字符顺序,适合比较字面结构相近的文本

实际应用建议

// 在实际项目中,推荐使用专门的分词库
// Maven依赖示例:
/*
<dependency>
    <groupId>com.hankcs</groupId>
    <artifactId>hanlp</artifactId>
    <version>portable-1.8.4</version>
</dependency>
*/
// 使用HanLP分词示例(需要导入HanLP库)
public class AdvancedTextSimilarity {
    /**
     * Segments both texts with HanLP, re-joins the resulting words with single
     * spaces and scores the two space-separated strings with
     * {@code CosineSimilarity.calculate}.
     *
     * @param text1 first text
     * @param text2 second text
     * @return the value returned by {@code CosineSimilarity.calculate}
     */
    public static double calculateWithHanLP(String text1, String text2) {
        List<String> firstWords = segmentWords(text1);
        List<String> secondWords = segmentWords(text2);
        // The cosine implementation splits on whitespace, so hand it space-separated words.
        String firstJoined = String.join(" ", firstWords);
        String secondJoined = String.join(" ", secondWords);
        return CosineSimilarity.calculate(firstJoined, secondJoined);
    }

    /** Segments a text with HanLP and returns the word of every resulting term. */
    private static List<String> segmentWords(String text) {
        return HanLP.segment(text).stream()
                .map(term -> term.word)
                .collect(Collectors.toList());
    }
}

这些实现可以直接用于以空格分词的文本(如英文)的相似度计算;处理中文时,应先使用HanLP等分词库进行分词,否则整句会被当作一个词参与计算。

抱歉,评论功能暂时关闭!