本文目录导读:
- 基础案例:使用朴素贝叶斯分类新闻
- 实战案例:情感分析(好评/差评)
- 进阶案例:使用深度学习(PyTorch)
- 完整项目:垃圾短信分类
- 使用建议

我来介绍几个用Python实现文本分类的实用案例,从简单到复杂逐步展开。
基础案例:使用朴素贝叶斯分类新闻
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Basic example: classify short Chinese news headlines with TF-IDF features
# and a multinomial naive-Bayes model.

# Sample data: eight headlines and their category labels.
data = {
    'text': [
        '苹果发布新款iPhone手机',
        '华为推出5G折叠屏手机',
        '中国男足击败日本队',
        'NBA总决赛湖人夺冠',
        '人工智能改变生活方式',
        '量子计算最新突破',
        '长三角经济快速发展',
        '央行调整存款利率'
    ],
    'category': ['科技', '科技', '体育', '体育', '科技', '科技', '经济', '经济']
}
df = pd.DataFrame(data)

# Split the RAW texts first so the vectorizer is fitted on training data only
# (fitting it on the full corpus leaks test-set vocabulary/IDF statistics).
# Stratifying keeps every category represented in the tiny training split.
texts_train, texts_test, y_train, y_test = train_test_split(
    df['text'], df['category'],
    test_size=0.3, random_state=42, stratify=df['category']
)

# Feature extraction.
# Chinese text has no spaces between words, so the default word analyzer
# would treat each unsegmented headline as (almost) a single token and new
# texts would share no features with the training data. Character 1-2 grams
# work without an external word segmenter.
vectorizer = TfidfVectorizer(
    max_features=1000, analyzer='char', ngram_range=(1, 2)
)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)

# Train the model.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict on the held-out headlines.
y_pred = classifier.predict(X_test)

# Evaluate. zero_division=0 avoids undefined-metric warnings when a class
# is never predicted on such a small test split.
print("分类报告:")
print(classification_report(y_test, y_pred, zero_division=0))

# Predict unseen headlines with the already-fitted vectorizer.
new_texts = ['北京冬奥会中国获金牌', '特斯拉股价上涨']
X_new = vectorizer.transform(new_texts)
predictions = classifier.predict(X_new)
print("\n新文本预测结果:")
for text, pred in zip(new_texts, predictions):
    print(f"'{text}' -> {pred}")
实战案例:情感分析(好评/差评)
import nltk
import re
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
# Fetch the NLTK 'stopwords' corpus so that nltk.corpus.stopwords can be used
nltk.download('stopwords')
class TextClassifier:
    """TF-IDF + logistic-regression text classifier built on a sklearn Pipeline.

    Texts are cleaned by :meth:`preprocess` and then vectorized with
    character 1-2 grams. Character n-grams are used because Chinese text is
    not whitespace-delimited: a word-level analyzer would turn every
    unsegmented sentence into one unique token, leaving training and test
    texts with no features in common. (A dedicated word segmenter would be
    the alternative; none is imported by this file.) A word-level stop-word
    list is not passed because it is not applied by a non-word analyzer.
    """

    # Matches every character that is NOT a CJK ideograph in U+4E00-U+9FA5,
    # an ASCII letter, or whitespace. Compiled once instead of per call.
    _NON_TEXT_RE = re.compile(r'[^\u4e00-\u9fa5a-zA-Z\s]')

    def __init__(self):
        self.pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(
                max_features=5000,
                analyzer='char',
                ngram_range=(1, 2)
            )),
            ('classifier', LogisticRegression(
                C=1.0,
                max_iter=1000
            ))
        ])

    def preprocess(self, texts):
        """Return a list of cleaned copies of ``texts``.

        Each text has digits, punctuation and other symbols removed (see
        ``_NON_TEXT_RE``) and is then lower-cased. An empty iterable yields
        an empty list.
        """
        return [self._NON_TEXT_RE.sub('', text).lower() for text in texts]

    def train(self, texts, labels):
        """Fit the pipeline on preprocessed ``texts`` and their ``labels``."""
        self.pipeline.fit(self.preprocess(texts), labels)
        print("模型训练完成!")

    def predict(self, texts):
        """Return the pipeline's predicted label for each text."""
        return self.pipeline.predict(self.preprocess(texts))

    def predict_proba(self, texts):
        """Return the pipeline's per-class probabilities for each text."""
        return self.pipeline.predict_proba(self.preprocess(texts))
# Example: sentiment analysis of movie reviews
def sentiment_demo():
    """Train a TextClassifier on eight labelled reviews and print predictions.

    Labels: 1 = positive review, 0 = negative review. For each of three
    unseen reviews the predicted sentiment and the largest class
    probability (as a percentage) are printed.
    """
    # (review, label) pairs used for training
    labelled_reviews = [
        ('这部电影太精彩了,演员演技很棒', 1),
        ('剧情很无聊,浪费了两个小时', 0),
        ('非常感人的故事,推荐观看', 1),
        ('特效很烂,剧情逻辑不通', 0),
        ('值得反复观看的好电影', 1),
        ('导演水平太差,完全不推荐', 0),
        ('画面精美,配乐动听', 1),
        ('演员表演极其做作,看不下去', 0),
    ]
    train_texts = [review for review, _ in labelled_reviews]
    train_labels = [flag for _, flag in labelled_reviews]

    # Build and fit the classifier
    model = TextClassifier()
    model.train(train_texts, train_labels)

    # Unseen reviews to score
    test_texts = [
        '非常棒的观影体验',
        '这部电影差强人意',
        '演技浮夸,剧情老套'
    ]
    predicted = model.predict(test_texts)
    class_probs = model.predict_proba(test_texts)

    print("\n情感分析结果:")
    for text, pred, prob in zip(test_texts, predicted, class_probs):
        if pred == 1:
            sentiment = '好评'
        else:
            sentiment = '差评'
        confidence = max(prob) * 100
        print(f"'{text}' -> {sentiment} (置信度: {confidence:.1f}%)")
# Script entry point: run the sentiment demo only when executed directly.
if __name__ == "__main__":
    sentiment_demo()
进阶案例:使用深度学习(PyTorch)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    ``tokenizer`` is called with ``truncation=True``,
    ``padding='max_length'``, ``max_length=max_length`` and
    ``return_tensors='pt'``; its ``'input_ids'`` and ``'attention_mask'``
    entries are flattened before being returned.
    """

    # Keys copied (flattened) from the tokenizer output into each sample
    _ENCODING_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``label``."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]
        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        sample = {key: encoded[key].flatten() for key in self._ENCODING_KEYS}
        sample['label'] = torch.tensor(target, dtype=torch.long)
        return sample
class BERTClassifier(nn.Module):
    """Pretrained 'bert-base-chinese' encoder followed by dropout and a linear head.

    ``n_classes`` sets the output size of the linear layer.
    """

    def __init__(self, n_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the linear head's output for the encoder's ``pooler_output``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoded.pooler_output)
        return self.classifier(regularized)
def train_deep_learning_model():
    """Fine-tune BERTClassifier on four toy sentences, then classify one more.

    Runs three epochs with AdamW (lr=2e-5) and cross-entropy loss, printing
    the mean batch loss per epoch, and finally prints the predicted class
    for a single test sentence.
    """
    # Tokenizer and two-class model
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
    model = BERTClassifier(n_classes=2)

    # Toy data: label 1 is treated as "tech", label 0 as "non-tech"
    texts = [
        '苹果发布新款iPhone手机',
        '华为推出5G折叠屏手机',
        '这部电影太精彩了',
        '剧情很无聊,浪费时间'
    ]
    labels = [1, 1, 0, 0]

    dataloader = DataLoader(
        TextClassificationDataset(texts, labels, tokenizer),
        batch_size=2,
        shuffle=True,
    )

    # Training setup
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=2e-5)

    # Training loop
    model.train()
    for epoch_no in range(1, 4):
        batch_losses = []
        for batch in dataloader:
            batch_ids = batch['input_ids'].to(device)
            batch_mask = batch['attention_mask'].to(device)
            batch_labels = batch['label'].to(device)

            optimizer.zero_grad()
            logits = model(batch_ids, batch_mask)
            loss = criterion(logits, batch_labels)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        print(f'Epoch {epoch_no}, Loss: {sum(batch_losses)/len(dataloader):.4f}')
    print("深度学习模型训练完成!")

    # Single-sentence prediction
    model.eval()
    test_text = "三星发布新款手机"
    encoding = tokenizer(
        test_text,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors='pt'
    )
    with torch.no_grad():
        logits = model(
            encoding['input_ids'].to(device),
            encoding['attention_mask'].to(device),
        )
        prediction = logits.argmax(dim=1)
    print(f"预测类别: {'科技' if prediction[0] == 1 else '非科技'}")
# Script entry point: print a banner per case and run the two demo functions.
if __name__ == "__main__":
    # Basic case: only the banner is printed here; the naive-Bayes code
    # itself is the module-level script earlier in the file.
    print("=== 基础案例:朴素贝叶斯分类 ===")
    for banner, run_demo in (
        ("\n=== 情感分析案例 ===", sentiment_demo),
        ("\n=== 深度学习案例 ===", train_deep_learning_model),
    ):
        print(banner)
        run_demo()
完整项目:垃圾短信分类
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
class SpamClassifier:
    """Spam-SMS classifier: character n-gram TF-IDF features + linear SVC.

    Args:
        max_features: Upper bound on the TF-IDF vocabulary size.
        min_df: Minimum document frequency for a term to be kept. Defaults
            to 1 so that small corpora are not pruned down to an empty
            vocabulary; raise it for large corpora.
        max_df: Maximum document frequency for a term to be kept.

    Character 1-2 grams are used because Chinese messages are not
    whitespace-delimited; a word-level analyzer would treat each unsegmented
    clause as one token, which (combined with a ``min_df`` above 1) leaves
    no terms at all on a small corpus.
    """

    def __init__(self, max_features=3000, min_df=1, max_df=1.0):
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            analyzer='char',
            ngram_range=(1, 2),
            min_df=min_df,
            max_df=max_df
        )
        self.classifier = SVC(kernel='linear', probability=True)

    def train(self, X_train, y_train):
        """Fit the vectorizer and the SVC on raw texts and their labels."""
        X_train_tfidf = self.vectorizer.fit_transform(X_train)
        self.classifier.fit(X_train_tfidf, y_train)
        print(f"训练完成,训练集大小: {len(X_train)}")

    def evaluate(self, X_test, y_test):
        """Print accuracy and the confusion matrix, plot it, return accuracy."""
        X_test_tfidf = self.vectorizer.transform(X_test)
        y_pred = self.classifier.predict(X_test_tfidf)
        accuracy = accuracy_score(y_test, y_pred)
        # Pin the matrix to the trained classes; otherwise a test split that
        # contains a single class yields a 1x1 matrix and the cell lookups
        # below raise IndexError.
        cm = confusion_matrix(y_test, y_pred, labels=self.classifier.classes_)
        print(f"准确率: {accuracy:.2%}")
        print("\n混淆矩阵:")
        if cm.shape == (2, 2):
            print(f"TN: {cm[0,0]}, FP: {cm[0,1]}")
            print(f"FN: {cm[1,0]}, TP: {cm[1,1]}")
        else:
            # TN/FP/FN/TP are only defined for the binary case.
            print(cm)
        # Visualize the confusion matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title('混淆矩阵')
        plt.ylabel('真实标签')
        plt.xlabel('预测标签')
        plt.show()
        return accuracy

    def predict(self, texts):
        """Classify new messages.

        Returns:
            A list with one dict per message: ``text`` (the input),
            ``is_spam`` (``bool`` of the predicted label) and ``confidence``
            (the estimated probability of the predicted label). An empty
            input returns an empty list.
        """
        if len(texts) == 0:
            return []
        texts_tfidf = self.vectorizer.transform(texts)
        predictions = self.classifier.predict(texts_tfidf)
        probabilities = self.classifier.predict_proba(texts_tfidf)
        # predict_proba columns follow classes_. Report the probability of
        # the label that was actually predicted: with SVC(probability=True)
        # the arg-max of predict_proba can disagree with predict, so
        # max(prob) could describe the other class.
        column_of = {
            label: col for col, label in enumerate(self.classifier.classes_)
        }
        return [
            {
                'text': text,
                'is_spam': bool(pred),
                'confidence': float(prob[column_of[pred]])
            }
            for text, pred, prob in zip(texts, predictions, probabilities)
        ]
# Usage example
def spam_detection_demo():
    """Train SpamClassifier on six toy messages, evaluate it and score new ones.

    Labels: 0 = normal message, 1 = spam. Prints the evaluation output and a
    per-message verdict with its confidence for three unseen messages.
    """
    # Sample data
    ham_texts = [
        '明天下午三点开会,请准时参加',
        '晚上一起吃饭吧',
        '作业已经提交,请查收'
    ]
    spam_texts = [
        '恭喜您中奖了!点击链接领取奖品',
        '只需支付99元,即可获得万元大礼包',
        '您的账户异常,请立即点击链接验证'
    ]
    all_texts = ham_texts + spam_texts
    all_labels = [0] * len(ham_texts) + [1] * len(spam_texts)

    # Stratify the split: with only six samples an unstratified split can
    # put a single class into the test set, which makes the evaluation
    # meaningless (and the confusion matrix degenerate).
    X_train, X_test, y_train, y_test = train_test_split(
        all_texts, all_labels,
        test_size=0.3, random_state=42, stratify=all_labels
    )

    # Train the classifier
    classifier = SpamClassifier()
    classifier.train(X_train, y_train)

    # Evaluate only when there is held-out data
    if len(X_test) > 0:
        classifier.evaluate(X_test, y_test)

    # Score unseen messages
    new_messages = [
        '亲爱的用户,您的积分即将过期',
        '项目进度汇报:已完成80%',
        '加微信领红包,速速联系'
    ]
    results = classifier.predict(new_messages)
    print("\n短信检测结果:")
    for result in results:
        status = "🚫 垃圾短信" if result['is_spam'] else "✅ 正常短信"
        print(f"[{status}] {result['text']} (确信度: {result['confidence']:.1%})")
# Script entry point: print the banner, then run the spam-detection demo.
if __name__ == "__main__":
    print("=== 垃圾短信分类器 ===")
    spam_detection_demo()
使用建议
- 数据准备:确保有足够的标注数据(每个类别至少100条)
- 特征选择:根据文本语言选择合适的分词工具(中文文本需先用jieba等工具分词,或改用字符级n-gram特征,否则TfidfVectorizer会把没有空格的整段文字当作一个词)
- 模型选择:
- 小数据集:朴素贝叶斯、SVM
- 大数据集:深度学习模型
- 评估指标:使用准确率、精确率、召回率、F1分数
- 优化策略:
- 调整文本预处理参数
- 优化特征提取参数
- 尝试不同模型
- 使用交叉验证
这些案例覆盖了从基础到进阶的文本分类实现,你可以根据实际需求选择合适的方案。