Python案例如何实现手势识别？

wen python案例 2026-06-17 3

本文目录导读：

Python案例如何实现手势识别？

使用OpenCV + MediaPipe（推荐方案）
使用OpenCV + 图像处理（基于轮廓）
使用TensorFlow训练手势分类模型
安装依赖
实用技巧和注意事项

我来介绍几种Python实现手势识别的方法，从简单到复杂：

使用OpenCV + MediaPipe（推荐方案）

这是最简单且准确率高的方法，适合新手：

import cv2
import mediapipe as mp
class HandGestureRecognizer:
    """Recognise a few static hand gestures from MediaPipe Hands landmarks.

    The 21 hand landmarks are reduced to five 0/1 finger states in the order
    [thumb, index, middle, ring, pinky]; the resulting pattern is looked up in
    a small table.
    """

    # Finger pattern -> (label returned by recognize_gesture, ASCII overlay text).
    # The ASCII text is needed because cv2.putText's Hershey fonts cannot draw
    # CJK characters or emoji (they come out as question marks).
    _GESTURE_TABLE = {
        (0, 1, 1, 0, 0): ("剪刀✌️", "Scissors"),
        (0, 0, 0, 0, 0): ("拳头✊", "Fist"),
        (1, 1, 1, 1, 1): ("手掌🖐️", "Palm"),
        # Thumbs-up means "thumb only"; index-only is a pointing gesture.
        (1, 0, 0, 0, 0): ("点赞👍", "Thumbs up"),
        (0, 1, 0, 0, 0): ("指向☝️", "Pointing"),
    }
    _UNKNOWN_GESTURE = ("其他手势", "Other")
    _OVERLAY_TEXT = {label: text for label, text in _GESTURE_TABLE.values()}

    def __init__(self):
        """Create the MediaPipe Hands tracker and the drawing helper."""
        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=2,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5
        )
        self.mp_draw = mp.solutions.drawing_utils

    def recognize_gesture(self, hand_landmarks, handedness="Right"):
        """Return the gesture label for one detected hand.

        Args:
            hand_landmarks: object whose ``landmark`` sequence holds the 21
                hand points, each with normalised ``x`` / ``y`` attributes.
            handedness: "Right" (default, the original behaviour) or "Left".
                The thumb extends in opposite x directions for the two hands,
                so the thumb test is mirrored for "Left".

        Returns:
            One of the labels in the gesture table, or "其他手势" when the
            finger pattern is not recognised.
        """
        points = hand_landmarks.landmark

        # Thumb: compare the tip (4) with the joint below it (3) along x.
        # NOTE(review): this assumes the palm faces the camera in a mirrored
        # frame; with the back of the hand towards the camera it is inverted.
        if handedness == "Left":
            thumb_extended = points[4].x > points[3].x
        else:
            thumb_extended = points[4].x < points[3].x
        fingers = [int(thumb_extended)]

        # Other four fingers: the tip is above (smaller y than) the joint two
        # landmarks below it when the finger is stretched out.
        fingers.extend(
            int(points[tip].y < points[tip - 2].y) for tip in (8, 12, 16, 20)
        )

        label, _ = self._GESTURE_TABLE.get(tuple(fingers), self._UNKNOWN_GESTURE)
        return label

    def run_camera(self):
        """Run live gesture recognition on camera 0 until 'q' is pressed."""
        cap = cv2.VideoCapture(0)
        try:
            while True:
                success, img = cap.read()
                if not success:
                    break
                # Mirror the frame so it behaves like a selfie view.
                img = cv2.flip(img, 1)
                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                results = self.hands.process(img_rgb)
                if results.multi_hand_landmarks:
                    sides = results.multi_handedness or []
                    for idx, hand_landmarks in enumerate(
                        results.multi_hand_landmarks
                    ):
                        self.mp_draw.draw_landmarks(
                            img, hand_landmarks, self.mp_hands.HAND_CONNECTIONS
                        )
                        if idx < len(sides):
                            side = sides[idx].classification[0].label
                        else:
                            side = "Right"
                        gesture = self.recognize_gesture(hand_landmarks, side)
                        overlay = self._OVERLAY_TEXT.get(
                            gesture, self._UNKNOWN_GESTURE[1]
                        )
                        # One text row per hand so two hands do not overwrite
                        # each other's label.
                        cv2.putText(img, f"{side}: {overlay}",
                                    (10, 50 + 40 * idx),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.imshow("Gesture Recognition", img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Always free the camera and windows, even if a frame raised.
            cap.release()
            cv2.destroyAllWindows()
# Usage example: start the webcam loop when this snippet is run as a script.
if __name__ == "__main__":
    recognizer = HandGestureRecognizer()
    recognizer.run_camera()

使用OpenCV + 图像处理（基于轮廓）

更基础的方法，除了OpenCV和NumPy之外不依赖其他第三方库：

import cv2
import numpy as np
class SimpleGestureRecognizer:
    """Count raised fingers using skin-colour segmentation and convexity defects."""

    def __init__(self):
        """Open the default camera (device 0)."""
        self.cap = cv2.VideoCapture(0)

    def process_image(self, img):
        """Return a binary mask of skin-coloured pixels in a BGR frame."""
        img_ycrcb = cv2.cvtColor(img, cv2.COLOR_BGR2YCrCb)
        # Skin range in YCrCb: any Y, Cr in [133, 173], Cb in [77, 127].
        lower = np.array([0, 133, 77])
        upper = np.array([255, 173, 127])
        mask = cv2.inRange(img_ycrcb, lower, upper)
        # Opening removes speckle noise, closing fills small holes.
        kernel = np.ones((5, 5), np.uint8)
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel)
        mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
        return mask

    def find_contours(self, mask):
        """Return the largest external contour, or None if it is too small."""
        contours, _ = cv2.findContours(
            mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        if contours:
            max_contour = max(contours, key=cv2.contourArea)
            if cv2.contourArea(max_contour) > 3000:  # minimum area threshold
                return max_contour
        return None

    @staticmethod
    def _is_finger_gap(start, end, far):
        """Return True when the angle at ``far`` between ``start`` and ``end`` is acute.

        The angle is below 90 degrees exactly when the dot product of the two
        edge vectors is positive, so no division or arccos is needed. A
        degenerate defect (``far`` coinciding with ``start`` or ``end``) gives
        a zero dot product and is rejected instead of producing NaN.
        """
        apex = np.asarray(far, dtype=np.float64)
        to_start = np.asarray(start, dtype=np.float64) - apex
        to_end = np.asarray(end, dtype=np.float64) - apex
        return bool(np.dot(to_start, to_end) > 0.0)

    def count_fingers(self, contour):
        """Estimate the number of raised fingers for a hand contour.

        Returns 0 when no convexity defects can be computed; otherwise the
        number of acute defects plus one (so the result is at least 1 whenever
        defects exist, even for a closed fist).
        """
        hull = cv2.convexHull(contour, returnPoints=False)
        try:
            defects = cv2.convexityDefects(contour, hull)
        except cv2.error:
            # OpenCV rejects hulls with non-monotonic indices, which can occur
            # on self-intersecting contours; skip this frame rather than crash.
            return 0
        if defects is None:
            return 0

        gaps = 0
        for start_idx, end_idx, far_idx, _depth in defects[:, 0]:
            if self._is_finger_gap(
                contour[start_idx][0], contour[end_idx][0], contour[far_idx][0]
            ):
                gaps += 1
        return gaps + 1  # fingers = gaps between fingers + 1

    def run(self):
        """Run the recognition loop until 'q' is pressed or the camera fails."""
        try:
            while True:
                ret, frame = self.cap.read()
                if not ret:
                    break
                frame = cv2.flip(frame, 1)
                mask = self.process_image(frame)
                contour = self.find_contours(mask)
                if contour is not None:
                    cv2.drawContours(frame, [contour], -1, (0, 255, 0), 2)
                    finger_count = self.count_fingers(contour)
                    cv2.putText(frame, f"Fingers: {finger_count}", (10, 50),
                                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.imshow("Gesture Recognition", frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Always free the camera and windows, even if a frame raised.
            self.cap.release()
            cv2.destroyAllWindows()
# Usage example: start the webcam loop when this snippet is run as a script.
if __name__ == "__main__":
    recognizer = SimpleGestureRecognizer()
    recognizer.run()

使用TensorFlow训练手势分类模型

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import cv2
import numpy as np
import os
class GestureModelTrainer:
    """Build and train a small CNN that classifies gesture images."""

    # Single source of truth for the input resolution and batch size, shared
    # by the model definition and the data loaders.
    IMAGE_SIZE = (64, 64)
    BATCH_SIZE = 32

    def __init__(self, num_classes=5):
        """Store the number of gesture classes the model will output."""
        self.num_classes = num_classes

    def create_model(self):
        """Create and compile the CNN (3 conv layers + dense softmax head)."""
        model = keras.Sequential([
            layers.Conv2D(32, (3, 3), activation='relu',
                          input_shape=(*self.IMAGE_SIZE, 3)),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.MaxPooling2D((2, 2)),
            layers.Conv2D(64, (3, 3), activation='relu'),
            layers.Flatten(),
            layers.Dense(64, activation='relu'),
            layers.Dropout(0.5),
            layers.Dense(self.num_classes, activation='softmax')
        ])
        model.compile(
            optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy']
        )
        return model

    def train(self, data_dir, epochs=10, model_path='gesture_model.h5'):
        """Train on images under ``data_dir`` and save the model.

        Args:
            data_dir: directory with one sub-directory per gesture class.
            epochs: number of training epochs.
            model_path: where the trained model is written.

        Returns:
            The trained Keras model.

        Raises:
            ValueError: if the number of class directories differs from
                ``num_classes``.
        """
        validation_split = 0.2
        # Augmentation is applied to the training subset only.
        train_datagen = keras.preprocessing.image.ImageDataGenerator(
            rescale=1./255,
            rotation_range=20,
            width_shift_range=0.2,
            height_shift_range=0.2,
            horizontal_flip=True,
            validation_split=validation_split
        )
        # Validation images are only rescaled, so the validation metrics are
        # measured on unaugmented data. Using the same validation_split keeps
        # the two subsets disjoint.
        val_datagen = keras.preprocessing.image.ImageDataGenerator(
            rescale=1./255,
            validation_split=validation_split
        )
        train_gen = train_datagen.flow_from_directory(
            data_dir,
            target_size=self.IMAGE_SIZE,
            batch_size=self.BATCH_SIZE,
            class_mode='categorical',
            subset='training'
        )
        val_gen = val_datagen.flow_from_directory(
            data_dir,
            target_size=self.IMAGE_SIZE,
            batch_size=self.BATCH_SIZE,
            class_mode='categorical',
            subset='validation'
        )
        # Fail early with a readable message instead of a shape error in fit().
        if train_gen.num_classes != self.num_classes:
            raise ValueError(
                f"{data_dir!r} contains {train_gen.num_classes} class "
                f"directories but num_classes={self.num_classes}"
            )
        model = self.create_model()
        model.fit(
            train_gen,
            validation_data=val_gen,
            epochs=epochs
        )
        model.save(model_path)
        return model
class GesturePredictor:
    """Load a trained gesture model and classify OpenCV frames with it."""

    def __init__(self, model_path='gesture_model.h5', gesture_names=None):
        """Load the model and set the class-index -> name mapping.

        Args:
            model_path: path of the saved Keras model.
            gesture_names: optional sequence of class names indexed by the
                model's output position. Defaults to the original five names.
                NOTE(review): Keras ``flow_from_directory`` numbers classes in
                alphanumeric order of the sub-directory names; confirm this
                list matches the training directories.
        """
        self.model = keras.models.load_model(model_path)
        if gesture_names is None:
            gesture_names = ['fist', 'palm', 'thumb', 'peace', 'ok']
        self.gesture_names = list(gesture_names)

    def _label_for(self, probabilities):
        """Map one probability vector to ``(gesture_name, confidence)``.

        Falls back to ``"class_<index>"`` when the model has more outputs than
        there are names, instead of raising IndexError.
        """
        class_idx = int(np.argmax(probabilities))
        confidence = probabilities[class_idx]
        if class_idx < len(self.gesture_names):
            return self.gesture_names[class_idx], confidence
        return f"class_{class_idx}", confidence

    def predict(self, image):
        """Classify a single BGR image (OpenCV channel order).

        Returns:
            ``(gesture_name, confidence)`` for the highest-scoring class.
        """
        img = cv2.resize(image, (64, 64))
        # Training images are loaded as RGB by Keras, while OpenCV frames are
        # BGR; convert so inference sees the same channel order as training.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = img / 255.0
        batch = np.expand_dims(img, axis=0)
        # verbose=0 suppresses the per-call progress bar in the frame loop.
        prediction = self.model.predict(batch, verbose=0)
        return self._label_for(prediction[0])

    def run_realtime(self):
        """Classify frames from camera 0 until 'q' is pressed."""
        cap = cv2.VideoCapture(0)
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.flip(frame, 1)
                gesture, confidence = self.predict(frame)
                cv2.putText(frame, f"{gesture} ({confidence:.2f})", (10, 50),
                            cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                cv2.imshow("Gesture Recognition", frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Always free the camera and windows, even if a frame raised.
            cap.release()
            cv2.destroyAllWindows()

安装依赖

# Core dependencies
pip install opencv-python mediapipe numpy
# Only needed for the TensorFlow approach
pip install tensorflow
# Other useful libraries (the PyPI package that provides `sklearn` is scikit-learn)
pip install scikit-learn matplotlib

实用技巧和注意事项

手势识别优化建议：

class GestureOptimizer:
    """Stateless image helpers for improving gesture-recognition input."""

    @staticmethod
    def preprocess_image(img):
        """Denoise a BGR image and equalise its lightness channel.

        The image is Gaussian-blurred with a 5x5 kernel, converted to LAB,
        channel 0 is histogram-equalised, and the result is converted back
        to BGR.
        """
        lab_pixels = cv2.cvtColor(
            cv2.GaussianBlur(img, (5, 5), 0), cv2.COLOR_BGR2LAB
        )
        equalized_channel = cv2.equalizeHist(lab_pixels[:, :, 0])
        lab_pixels[:, :, 0] = equalized_channel
        return cv2.cvtColor(lab_pixels, cv2.COLOR_LAB2BGR)

    @staticmethod
    def detect_hand_region(img):
        """Return the bounding box of the largest skin-coloured area.

        A plain HSV threshold is used for the skin mask.

        Returns:
            ``(x, y, w, h)`` of the largest external contour in the mask, or
            None when the mask contains no contour.
        """
        hsv_pixels = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        skin_low = np.array([0, 20, 70], dtype=np.uint8)
        skin_high = np.array([20, 255, 255], dtype=np.uint8)
        skin_mask = cv2.inRange(hsv_pixels, skin_low, skin_high)

        outlines, _ = cv2.findContours(
            skin_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
        )
        if not outlines:
            return None

        largest = max(outlines, key=cv2.contourArea)
        left, top, width, height = cv2.boundingRect(largest)
        return (left, top, width, height)

完整示例：综合应用

import cv2
import mediapipe as mp
import numpy as np
from collections import deque
class AdvancedGestureRecognizer:
    """MediaPipe-based recogniser with gesture labels and a pinch "volume" readout."""

    def __init__(self):
        """Create the MediaPipe Hands tracker, drawing helper and state fields."""
        self.mp_hands = mp.solutions.hands
        self.hands = self.mp_hands.Hands(
            static_image_mode=False,
            max_num_hands=2,
            min_detection_confidence=0.7,
            min_tracking_confidence=0.5
        )
        self.mp_draw = mp.solutions.drawing_utils
        # Gesture state. These fields are initialised here but are not read
        # by any method of this class.
        self.gesture_history = deque(maxlen=5)
        self.current_gesture = None
        self.gesture_count = 0

    def get_gesture(self, landmarks, img_shape):
        """Classify a hand from its landmark sequence.

        Args:
            landmarks: sequence of 21 points with normalised ``x`` / ``y``.
            img_shape: frame shape; accepted for interface compatibility but
                not needed because the coordinates are already normalised.

        Returns:
            "fist", "palm", "peace", "point", "thumb_up" or "unknown".
        """
        finger_tips = (4, 8, 12, 16, 20)   # fingertip landmarks
        finger_mcp = (2, 5, 9, 13, 17)     # base joints used as reference
        # A finger counts as raised when its tip is above (smaller y than)
        # its reference joint. Order: thumb, index, middle, ring, pinky.
        fingers = [
            int(landmarks[tip].y < landmarks[base].y)
            for tip, base in zip(finger_tips, finger_mcp)
        ]
        raised = sum(fingers)
        if raised == 0:
            return "fist"
        if raised == 5:
            return "palm"
        # Peace sign is index + middle raised (not thumb + index).
        if fingers == [0, 1, 1, 0, 0]:
            return "peace"
        if fingers == [0, 1, 0, 0, 0]:
            return "point"
        if fingers == [1, 0, 0, 0, 0]:
            return "thumb_up"
        return "unknown"

    def recognize_volume(self, landmarks, img_shape):
        """Map the thumb-tip / index-tip distance to a 0-100 volume (example).

        Distances of 0.05 and below give 0, distances of 0.3 and above give
        100, and values in between are interpolated linearly. ``img_shape`` is
        accepted for interface compatibility and not used.
        """
        thumb_tip = landmarks[4]
        index_tip = landmarks[8]
        distance = np.hypot(thumb_tip.x - index_tip.x,
                            thumb_tip.y - index_tip.y)
        # np.interp already clamps to the end values outside [0.05, 0.3].
        return float(np.interp(distance, [0.05, 0.3], [0, 100]))

    def run_with_feedback(self):
        """Run the camera loop with gesture and volume overlays until 'q'."""
        cap = cv2.VideoCapture(0)
        try:
            while True:
                success, img = cap.read()
                if not success:
                    break
                img = cv2.flip(img, 1)
                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                results = self.hands.process(img_rgb)
                if results.multi_hand_landmarks:
                    for idx, hand_landmarks in enumerate(
                        results.multi_hand_landmarks
                    ):
                        self.mp_draw.draw_landmarks(
                            img, hand_landmarks, self.mp_hands.HAND_CONNECTIONS
                        )
                        gesture = self.get_gesture(
                            hand_landmarks.landmark, img.shape
                        )
                        volume = self.recognize_volume(
                            hand_landmarks.landmark, img.shape
                        )
                        # Stack each hand's overlay in its own band so two
                        # hands do not draw on top of each other.
                        offset = 110 * idx
                        cv2.putText(img, f"Gesture: {gesture}",
                                    (10, 50 + offset),
                                    cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                        cv2.rectangle(img, (50, 100 + offset),
                                      (50 + int(volume * 2), 130 + offset),
                                      (0, 255, 0), -1)
                        cv2.putText(img, f"Volume: {int(volume)}%",
                                    (50, 90 + offset),
                                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 1)
                cv2.imshow("Advanced Gesture Recognition", img)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
        finally:
            # Always free the camera and windows, even if a frame raised.
            cap.release()
            cv2.destroyAllWindows()
# Run the advanced gesture recogniser when this snippet is executed as a script.
if __name__ == "__main__":
    recognizer = AdvancedGestureRecognizer()
    recognizer.run_with_feedback()

这些方法分别适用于不同的场景：

MediaPipe方案：最简单，准确率高，适合快速开发
OpenCV传统方法：只依赖OpenCV和NumPy，无需额外的模型或框架，但准确率较低
深度学习方案：需要训练数据，但可定制性强
综合应用：结合多种技术实现更复杂的功能

建议从MediaPipe方案开始，它最容易上手且效果最好。