本文目录导读:

为PHP项目实现数据抽样,可以根据不同需求选择多种方法,以下是几种常见的实现方案:
SQL层面的随机抽样(推荐)
使用MySQL的RAND()函数
// 简单随机抽样 - 从10000条数据中随机抽取100条
function simpleRandomSample($table, $sampleSize = 100, $total = null) {
if ($total === null) {
$stmt = $pdo->query("SELECT COUNT(*) FROM $table");
$total = $stmt->fetchColumn();
}
// 如果数据量小于样本量,返回所有数据
if ($total <= $sampleSize) {
return $pdo->query("SELECT * FROM $table")->fetchAll();
}
// 使用ORDER BY RAND()随机排序
$sql = "SELECT * FROM $table ORDER BY RAND() LIMIT ?";
$stmt = $pdo->prepare($sql);
$stmt->execute([$sampleSize]);
return $stmt->fetchAll();
}
更高效的SQL抽样(大表时推荐)
// 使用TABLESAMPLE (MySQL 8.0+)
function sampleUsingTablesample($table, $percentage = 10) {
// MySQL 8.0.1+ 支持 TABLESAMPLE
$sql = "SELECT * FROM $table TABLESAMPLE SYSTEM($percentage PERCENT)";
return $pdo->query($sql)->fetchAll();
}
// 使用WHERE条件抽样
function sampleUsingWhereClause($table, $sampleSize) {
$total = $pdo->query("SELECT COUNT(*) FROM $table")->fetchColumn();
$step = max(1, floor($total / $sampleSize));
$sql = "SELECT * FROM $table WHERE id % ? = 0 LIMIT ?";
$stmt = $pdo->prepare($sql);
$stmt->execute([$step, $sampleSize]);
return $stmt->fetchAll();
}
PHP端抽样(适用于中小型数据集)
class DataSampler {
private $data;
public function __construct(array $data) {
$this->data = $data;
}
// 简单随机抽样
public function simpleRandomSample($size) {
if ($size >= count($this->data)) {
return $this->data;
}
$keys = array_rand($this->data, $size);
if ($size === 1) {
$keys = [$keys];
}
return array_intersect_key($this->data, array_flip($keys));
}
// 系统抽样(等距抽样)
public function systematicSample($size) {
$total = count($this->data);
if ($size >= $total) return $this->data;
$interval = floor($total / $size);
$sample = [];
$start = rand(0, $interval - 1);
for ($i = $start; $i < $total; $i += $interval) {
$sample[] = $this->data[$i];
}
return array_slice($sample, 0, $size);
}
// 分层抽样
public function stratifiedSample($size, $groupBy) {
$groups = [];
foreach ($this->data as $item) {
$groupKey = $item[$groupBy];
$groups[$groupKey][] = $item;
}
$sample = [];
$total = count($this->data);
foreach ($groups as $key => $group) {
$groupSize = count($group);
// 按照比例分配样本数
$samplesForGroup = max(1, floor($size * ($groupSize / $total)));
$sampler = new self($group);
$sample = array_merge($sample, $sampler->simpleRandomSample($samplesForGroup));
}
return $sample;
}
// 蓄水池抽样(适用于未知大小的流数据)
public function reservoirSample($k) {
$reservoir = [];
$i = 0;
foreach ($this->data as $item) {
if ($i < $k) {
$reservoir[] = $item;
} else {
$j = rand(0, $i);
if ($j < $k) {
$reservoir[$j] = $item;
}
}
$i++;
}
return $reservoir;
}
}
大数据集的高效抽样
class BigDataSampler {
// 处理百万级数据
public function sampleLargeDataset($filePath, $sampleSize) {
$sample = [];
$lineCount = 0;
$handle = fopen($filePath, "r");
if (!$handle) return [];
// 先计算总行数
while (fgets($handle) !== false) {
$lineCount++;
}
// 生成随机位置
$positions = [];
for ($i = 0; $i < $sampleSize; $i++) {
$positions[] = rand(1, $lineCount);
}
sort($positions);
// 读取指定行
rewind($handle);
$currentLine = 0;
$posIndex = 0;
while (($line = fgets($handle)) !== false && $posIndex < $sampleSize) {
$currentLine++;
if ($currentLine == $positions[$posIndex]) {
$sample[] = $line;
$posIndex++;
}
}
fclose($handle);
return $sample;
}
// 分批处理大表数据
public function sampleFromDatabase($pdo, $table, $sampleSize, $chunkSize = 10000) {
$sample = [];
$total = $pdo->query("SELECT COUNT(*) FROM $table")->fetchColumn();
$chunks = ceil($total / $chunkSize);
for ($i = 0; $i < $chunks; $i++) {
$offset = $i * $chunkSize;
$chunkData = $pdo->query("SELECT * FROM $table LIMIT $offset, $chunkSize")->fetchAll();
$sampler = new DataSampler($chunkData);
$samplesPerChunk = ceil($sampleSize / $chunks);
$sample = array_merge($sample, $sampler->simpleRandomSample($samplesPerChunk));
}
return array_slice($sample, 0, $sampleSize);
}
}
实用工具类
class DataSamplingUtil {
// 按百分比抽样
public static function percentageSample($arr, $percentage) {
$size = count($arr);
$sampleSize = ceil($size * $percentage / 100);
return (new DataSampler($arr))->simpleRandomSample($sampleSize);
}
// 加权随机抽样
public static function weightedSample($items, $weights, $count) {
$totalWeight = array_sum($weights);
$sample = [];
$itemsWithWeights = [];
foreach ($items as $key => $item) {
$itemsWithWeights[] = ['item' => $item, 'weight' => $weights[$key] / $totalWeight];
}
for ($i = 0; $i < $count; $i++) {
$random = mt_rand() / mt_getrandmax();
$cumulative = 0;
foreach ($itemsWithWeights as $entry) {
$cumulative += $entry['weight'];
if ($random <= $cumulative) {
$sample[] = $entry['item'];
break;
}
}
}
return $sample;
}
// 种子随机抽样(可重现的随机抽样)
public static function seededSample($arr, $size, $seed) {
mt_srand($seed);
$indexes = range(0, count($arr) - 1);
shuffle($indexes);
$sample = [];
for ($i = 0; $i < min($size, count($arr)); $i++) {
$sample[] = $arr[$indexes[$i]];
}
mt_srand(); // 重置随机种子
return $sample;
}
}
使用示例
// 数据库抽样
$sample = simpleRandomSample('users', 100);
// PHP数组抽样
$sampler = new DataSampler($usersArray);
$sample = $sampler->simpleRandomSample(50);
$sample = $sampler->stratifiedSample(100, 'department');
// 百分比抽样
$sample = DataSamplingUtil::percentageSample($data, 20);
// 加权抽样
$items = ['A', 'B', 'C'];
$weights = [5, 3, 2];
$sample = DataSamplingUtil::weightedSample($items, $weights, 10);
// 可重现抽样
$sample = DataSamplingUtil::seededSample($data, 50, 12345);
性能建议
- 小数据集 (< 10000条):使用PHP端抽样
- 中等数据集 (10000-100000条):使用SQL ORDER BY RAND()
- 大数据集 (> 100000条):使用TABLESAMPLE或分批处理
- 实时系统:预计算并维护抽样数据
- 质量要求高:使用分层抽样保证代表性
选择哪种方法取决于数据量、性能要求、以及是否需要保证抽样的统计代表性。