本文目录导读:

我来为您提供几个Java实现文本相似度计算的案例,涵盖不同算法:
余弦相似度算法
实现代码
import java.util.*;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Bag-of-words cosine similarity between two texts.
 *
 * <p>Each text is lower-cased, split on runs of punctuation/whitespace, and turned into a
 * term-frequency vector; the score is the cosine of the angle between the two vectors,
 * in the range {@code [0, 1]}.
 */
public class CosineSimilarity {
    /**
     * Token separators: any run of ASCII punctuation/symbols ({@code \p{Punct}}),
     * Unicode punctuation ({@code \p{P}}, e.g. full-width commas) or whitespace.
     * Compiled once instead of on every call.
     */
    private static final Pattern SEPARATORS = Pattern.compile("[\\p{Punct}\\p{P}\\s]+");

    /**
     * Computes the cosine similarity of two texts.
     *
     * @param text1 first text, must not be {@code null}
     * @param text2 second text, must not be {@code null}
     * @return similarity in {@code [0, 1]}; {@code 0} when either text contains no tokens
     * @throws NullPointerException if either argument is {@code null}
     */
    public static double calculate(String text1, String text2) {
        Objects.requireNonNull(text1, "text1");
        Objects.requireNonNull(text2, "text2");

        Map<String, Integer> freq1 = getWordFrequency(tokenize(text1));
        Map<String, Integer> freq2 = getWordFrequency(tokenize(text2));

        // A text without tokens has a zero vector; the cosine is undefined, so report 0.
        if (freq1.isEmpty() || freq2.isEmpty()) {
            return 0;
        }

        // Only words present in text1 can contribute to the dot product, so one pass over
        // freq1 yields both the dot product and the first squared norm.
        double dotProduct = 0;
        double norm1 = 0;
        for (Map.Entry<String, Integer> entry : freq1.entrySet()) {
            double count1 = entry.getValue();
            double count2 = freq2.getOrDefault(entry.getKey(), 0);
            dotProduct += count1 * count2; // multiplied as doubles: no int overflow
            norm1 += count1 * count1;
        }
        double norm2 = 0;
        for (int count : freq2.values()) {
            norm2 += (double) count * count;
        }

        // sqrt(a * b) instead of sqrt(a) * sqrt(b) keeps identical texts at exactly 1.0;
        // the clamp guards against rounding pushing the ratio marginally above 1.
        return Math.min(1.0, dotProduct / Math.sqrt(norm1 * norm2));
    }

    /**
     * Naive tokenizer: lower-cases the text and splits on punctuation/whitespace.
     * Text without separators (e.g. unsegmented Chinese) becomes a single token, so a real
     * word segmenter such as HanLP should be applied first in that case.
     *
     * @return the non-empty tokens in order of appearance; empty if the text has none
     */
    private static List<String> tokenize(String text) {
        List<String> tokens = new ArrayList<>();
        // Locale.ROOT keeps case folding independent of the JVM's default locale.
        for (String piece : SEPARATORS.split(text.toLowerCase(Locale.ROOT))) {
            // split() yields an empty leading piece when the text starts with a separator
            // (and a single "" for empty input); those are not words.
            if (!piece.isEmpty()) {
                tokens.add(piece);
            }
        }
        return tokens;
    }

    /**
     * Counts how many times each word occurs.
     *
     * @return map from word to its occurrence count
     */
    private static Map<String, Integer> getWordFrequency(List<String> words) {
        Map<String, Integer> freq = new HashMap<>();
        for (String word : words) {
            freq.merge(word, 1, Integer::sum);
        }
        return freq;
    }

    /** Small demo comparing three English sentences. */
    public static void main(String[] args) {
        String text1 = "Java is a programming language";
        String text2 = "Java programming language is popular";
        String text3 = "Python is another programming language";
        System.out.println("文本1: " + text1);
        System.out.println("文本2: " + text2);
        System.out.println("文本3: " + text3);
        System.out.println();
        System.out.println("文本1 vs 文本2 余弦相似度: " +
                String.format("%.2f", calculate(text1, text2)));
        System.out.println("文本1 vs 文本3 余弦相似度: " +
                String.format("%.2f", calculate(text1, text3)));
    }
}
编辑距离(Levenshtein距离)算法
/**
 * Levenshtein (edit) distance between two strings, plus a normalized similarity score.
 * Comparison is done per UTF-16 {@code char}.
 */
public class LevenshteinDistance {
    /**
     * Returns the minimum number of single-character insertions, deletions and
     * substitutions needed to turn {@code s1} into {@code s2}.
     *
     * @param s1 source string
     * @param s2 target string
     * @return the edit distance, {@code 0} when the strings are equal
     */
    public static int calculate(String s1, String s2) {
        final int rows = s1.length();
        final int cols = s2.length();
        // cost[r][c] = distance between the first r chars of s1 and the first c chars of s2.
        final int[][] cost = new int[rows + 1][cols + 1];

        // Turning a prefix into the empty string (or back) costs its length.
        for (int r = 0; r <= rows; r++) {
            cost[r][0] = r;
        }
        for (int c = 0; c <= cols; c++) {
            cost[0][c] = c;
        }

        for (int r = 1; r <= rows; r++) {
            final char current = s1.charAt(r - 1);
            for (int c = 1; c <= cols; c++) {
                final int diagonal = cost[r - 1][c - 1];
                if (current == s2.charAt(c - 1)) {
                    // Matching characters carry the previous cost forward unchanged.
                    cost[r][c] = diagonal;
                    continue;
                }
                final int deletion = cost[r - 1][c];
                final int insertion = cost[r][c - 1];
                // One extra edit on top of the cheapest of delete / insert / substitute.
                cost[r][c] = Math.min(deletion, Math.min(insertion, diagonal)) + 1;
            }
        }
        return cost[rows][cols];
    }

    /**
     * Normalizes the edit distance into a similarity score.
     *
     * @return {@code 1.0} when both strings are empty, {@code 0.0} when exactly one is
     *     empty, otherwise {@code 1 - distance / max(length)}
     */
    public static double similarity(String s1, String s2) {
        final boolean firstEmpty = s1.isEmpty();
        final boolean secondEmpty = s2.isEmpty();
        if (firstEmpty || secondEmpty) {
            return (firstEmpty && secondEmpty) ? 1.0 : 0.0;
        }
        final double longest = Math.max(s1.length(), s2.length());
        return 1.0 - calculate(s1, s2) / longest;
    }

    /** Prints the distance and similarity lines for one pair of strings. */
    private static void report(String left, String right) {
        System.out.println("编辑距离 (" + left + " vs " + right + "): " +
                calculate(left, right));
        System.out.println("相似度: " +
                String.format("%.2f", similarity(left, right)));
    }

    /** Small demo using the classic kitten / sitting example. */
    public static void main(String[] args) {
        final String s1 = "kitten";
        final String s2 = "sitting";
        final String s3 = "kitchen";
        System.out.println("字符串1: " + s1);
        System.out.println("字符串2: " + s2);
        System.out.println("字符串3: " + s3);
        System.out.println();
        report(s1, s2);
        System.out.println();
        report(s1, s3);
    }
}
Jaccard相似系数
import java.util.*;
/**
 * Jaccard similarity (|A ∩ B| / |A ∪ B|) over word sets or character n-gram sets.
 */
public class JaccardSimilarity {
    /**
     * Computes the Jaccard similarity of the two texts' word sets. Words are the
     * whitespace-separated, lower-cased tokens; punctuation is not stripped.
     *
     * @param text1 first text
     * @param text2 second text
     * @return similarity in {@code [0, 1]}; {@code 0} when neither text contains a word
     */
    public static double calculate(String text1, String text2) {
        return jaccard(new HashSet<>(tokenize(text1)), new HashSet<>(tokenize(text2)));
    }

    /**
     * Computes the Jaccard similarity of the two texts' character n-gram sets.
     * Whitespace is removed before the n-grams are cut, so the score reflects local
     * character order rather than word order.
     *
     * @param text1 first text
     * @param text2 second text
     * @param n n-gram length in characters, must be at least 1
     * @return similarity in {@code [0, 1]}; {@code 0} when both texts are shorter than {@code n}
     * @throws IllegalArgumentException if {@code n < 1}
     */
    public static double calculateNGram(String text1, String text2, int n) {
        return jaccard(getNGrams(text1, n), getNGrams(text2, n));
    }

    /** Shared |A ∩ B| / |A ∪ B| computation; returns 0 when both sets are empty. */
    private static double jaccard(Set<String> first, Set<String> second) {
        if (first.isEmpty() && second.isEmpty()) {
            return 0; // empty union: avoid 0/0
        }
        Set<String> intersection = new HashSet<>(first);
        intersection.retainAll(second);
        // |A ∪ B| = |A| + |B| - |A ∩ B|, no need to materialize the union.
        int unionSize = first.size() + second.size() - intersection.size();
        return (double) intersection.size() / unionSize;
    }

    /**
     * Builds the set of distinct character n-grams of the lower-cased text with all
     * whitespace removed.
     *
     * @throws IllegalArgumentException if {@code n < 1}
     */
    private static Set<String> getNGrams(String text, int n) {
        // n == 0 would make every n-gram the empty string, and a negative n would
        // fail inside substring(); reject both up front with a clear message.
        if (n < 1) {
            throw new IllegalArgumentException("n must be >= 1, got " + n);
        }
        Set<String> ngrams = new HashSet<>();
        String cleaned = text.replaceAll("\\s+", "").toLowerCase(Locale.ROOT);
        for (int i = 0; i + n <= cleaned.length(); i++) {
            ngrams.add(cleaned.substring(i, i + n));
        }
        return ngrams;
    }

    /**
     * Splits the text into lower-cased, whitespace-separated words.
     *
     * @return the words in order; an empty list for blank input
     */
    private static List<String> tokenize(String text) {
        // Trim first: split() would otherwise emit "" for empty input or for text with
        // leading whitespace, and that phantom token would count as a shared word.
        String normalized = text.trim().toLowerCase(Locale.ROOT);
        if (normalized.isEmpty()) {
            return Collections.emptyList();
        }
        return Arrays.asList(normalized.split("\\s+"));
    }

    /** Small demo comparing three short sentences with both variants. */
    public static void main(String[] args) {
        String text1 = "the cat sat on the mat";
        String text2 = "the cat sat on a mat";
        String text3 = "the dog sat on the log";
        System.out.println("文本1: " + text1);
        System.out.println("文本2: " + text2);
        System.out.println("文本3: " + text3);
        System.out.println();
        System.out.println("Jaccard相似度:");
        System.out.println("文本1 vs 文本2: " +
                String.format("%.2f", calculate(text1, text2)));
        System.out.println("文本1 vs 文本3: " +
                String.format("%.2f", calculate(text1, text3)));
        System.out.println();
        System.out.println("2-gram Jaccard相似度:");
        System.out.println("文本1 vs 文本2: " +
                String.format("%.2f", calculateNGram(text1, text2, 2)));
        System.out.println("文本1 vs 文本3: " +
                String.format("%.2f", calculateNGram(text1, text3, 2)));
    }
}
综合相似度计算器
import java.util.*;
import java.util.stream.Collectors;
/**
 * Runs every similarity algorithm on a pair of texts and combines the individual
 * scores into one weighted composite score.
 */
public class TextSimilarityAnalyzer {
    /** The similarity algorithms covered by this analyzer. */
    public enum Algorithm {
        COSINE, LEVENSHTEIN, JACCARD, JACCARD_NGRAM
    }

    /**
     * Scores two texts with all four algorithms.
     *
     * @param text1 first text
     * @param text2 second text
     * @return the individual scores plus a composite weighted 0.3 cosine, 0.2 edit
     *     distance, 0.3 word Jaccard and 0.2 2-gram Jaccard
     */
    public static SimilarityResult analyze(String text1, String text2) {
        final double cosine = CosineSimilarity.calculate(text1, text2);
        final double editDistance = LevenshteinDistance.similarity(text1, text2);
        final double wordJaccard = JaccardSimilarity.calculate(text1, text2);
        final double bigramJaccard = JaccardSimilarity.calculateNGram(text1, text2, 2);

        final SimilarityResult scores = new SimilarityResult();
        scores.cosineSimilarity = cosine;
        scores.levenshteinSimilarity = editDistance;
        scores.jaccardSimilarity = wordJaccard;
        scores.jaccardNGramSimilarity = bigramJaccard;
        // Weights sum to 1.0; adjust them to suit the use case.
        scores.compositeScore = cosine * 0.3
                + editDistance * 0.2
                + wordJaccard * 0.3
                + bigramJaccard * 0.2;
        return scores;
    }

    /** Plain holder for the individual scores and the composite score. */
    public static class SimilarityResult {
        /** Layout of the multi-line report produced by {@link #toString()}. */
        private static final String REPORT_TEMPLATE =
                "相似度分析结果:\n" +
                " 余弦相似度: %.2f\n" +
                " 编辑距离相似度: %.2f\n" +
                " Jaccard相似度: %.2f\n" +
                " 2-gram Jaccard相似度: %.2f\n" +
                " 综合评分: %.2f";

        public double cosineSimilarity;
        public double levenshteinSimilarity;
        public double jaccardSimilarity;
        public double jaccardNGramSimilarity;
        public double compositeScore;

        /** Renders all five scores, one per line, with two decimal places. */
        @Override
        public String toString() {
            return String.format(
                    REPORT_TEMPLATE,
                    cosineSimilarity,
                    levenshteinSimilarity,
                    jaccardSimilarity,
                    jaccardNGramSimilarity,
                    compositeScore);
        }
    }

    /** Runs the analyzer over a handful of sample text pairs and prints each report. */
    public static void main(String[] args) {
        final String[][] testCases = {
            {"Java is a programming language", "Java programming language is popular"},
            {"The quick brown fox", "The quick brown fox jumps over the lazy dog"},
            {"Hello World", "Hi World"},
            {"完全相同的文本", "完全相同的文本"},
            {"毫无关系的文本", "完全不同的话题"}
        };

        int caseNumber = 0;
        for (String[] pair : testCases) {
            caseNumber++;
            final String textA = pair[0];
            final String textB = pair[1];
            System.out.println("=== 测试案例 " + caseNumber + " ===");
            System.out.println("文本A: " + textA);
            System.out.println("文本B: " + textB);
            System.out.println();
            System.out.println(analyze(textA, textB));
            System.out.println();
        }
    }
}
使用说明
- 余弦相似度:适用于长文本比较,考虑词频
- 编辑距离:适用于短文本或字符串的模糊匹配(如拼写纠错、近似去重)
- Jaccard相似度:适用于关键词匹配,考虑词集
- N-gram Jaccard:基于字符N-gram(去除空白后切分),能反映局部字符顺序,适合文本结构比较
实际应用建议
// 在实际项目中,推荐使用专门的分词库
// Maven依赖示例:
/*
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.4</version>
</dependency>
*/
// 使用HanLP分词示例(需要导入HanLP库)
/**
 * Cosine similarity on text that is first segmented into words by HanLP.
 * Requires the HanLP library on the classpath.
 */
public class AdvancedTextSimilarity {
    /**
     * Segments both texts with HanLP, then scores the space-joined words with
     * {@code CosineSimilarity.calculate}.
     *
     * @param text1 first text
     * @param text2 second text
     * @return the cosine similarity of the segmented texts
     */
    public static double calculateWithHanLP(String text1, String text2) {
        final String segmented1 = segmentToSpaced(text1);
        final String segmented2 = segmentToSpaced(text2);
        return CosineSimilarity.calculate(segmented1, segmented2);
    }

    /** Segments the text with HanLP and joins the resulting words with single spaces. */
    private static String segmentToSpaced(String text) {
        return HanLP.segment(text).stream()
                .map(term -> term.word)
                .collect(Collectors.joining(" "));
    }
}
这些实现可以作为文本相似度计算业务场景的基础;处理中文文本时,需先接入分词库(如HanLP)进行分词。