第七节:CNN练习2手势识别
Posted 快乐江湖
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了第七节:CNN练习2手势识别相关的知识,希望对你有一定的参考价值。
文章目录
一:Sebastien Marcel Static Hand Posture Database(静态手势数据集)介绍
Sebastien Marcel Static Hand Posture Database提供了6种手势姿势,如下图,分别代表
- A
- B
- C
- five
- point
- V
图片格式为.ppm
- PBM 是位图(bitmap),仅有黑与白,没有灰
- PGM 是灰度图(grayscale)
- PPM 是通过RGB三种颜色显现的图像(pixmaps)
数据下载并解压后,格式如下
二:模型设计
设计网络结构如下
其中MTB结构如下图所示
三:代码编写
(1)数据预处理文件
preprocess.py
:原始数据集每个文件夹图片数据均不一样,并且在测试集中还存在两类测试图片,分别表示不同环境拍摄(complex
和uniform
),所以这里对所有数据集进行整合操作,将训练集和测试集进行合并,然后再按照8:1:1
的比例分别分配给训练集、验证集和测试集。最终会生成train_hand_gesture.txt
、eval_hand_gesture.txt
、test_hand_gesture.txt
三个文件,后续所有操作将会针对这三个文件进行,而不在原始数据集上进行
以train_hand_gesture.txt
为例,每一行表示图片的路径和其所属类别
- “A”:0
- “B”:1
- “C”:2
- “Five”:3
- “Point”:4
- “V”:5
0|./data/Marcel-Train\\A\\A-train1043.ppm
0|./data/Marcel-Train\\A\\A-train0577.ppm
2|./data/Marcel-Train\\C\\C-train455.ppm
0|./data/Marcel-Train\\A\\A-train1291.ppm
4|./data/Marcel-Train\\Point\\Point-train0220.ppm
0|./data/Marcel-Train\\A\\A-train1072.ppm
0|./data/Marcel-Train\\A\\A-train0599.ppm
4|./data/Marcel-Train\\Point\\Point-train0060.ppm
2|./data/Marcel-Test\\C\\complex\\C-complex07.ppm
4|./data/Marcel-Train\\Point\\Point-train0944.ppm
4|./data/Marcel-Train\\Point\\Point-train1333.ppm
4|./data/Marcel-Train\\Point\\Point-train0778.ppm
0|./data/Marcel-Train\\A\\A-train0795.ppm
3|./data/Marcel-Train\\Five\\Five-train148.ppm
3|./data/Marcel-Train\\Five\\Five-train394.ppm
3|./data/Marcel-Train\\Five\\Five-train441.ppm
...
代码如下,关键之处有注释说明
import os
import json
from Parameters import parameters
import random
from PIL import Image
random.seed(parameters.seed)
# 获取某个文件夹下面所有后缀名为suffix的文件,并返回其path的list
import os
import json
from Parameters import parameters
import random
from PIL import Image
random.seed(parameters.seed)
# 获取某个文件夹下面所有后缀名为suffix的文件,并返回其path的list
def recursive_fetching(root, suffix):
    """Recursively collect all files under *root* whose extension matches.

    Parameters
    ----------
    root : str
        Directory to walk.
    suffix : str or iterable of str
        Extension(s) to keep, without the leading dot (case-insensitive).

    Returns
    -------
    list[str]
        Paths of every matching file found anywhere under *root*.
    """
    # Accept either a single extension string or any iterable of extensions.
    # Using a set also fixes the substring pitfall of `ext in "ppm"`, which
    # would wrongly accept e.g. a ".pm" file.
    if isinstance(suffix, str):
        wanted = {suffix.lower()}
    else:
        wanted = {s.lower() for s in suffix}

    all_file_path = []

    # get_all_files is called recursively on each sub-directory
    def get_all_files(path):
        for entry in os.listdir(path):
            filepath = os.path.join(path, entry)
            if os.path.isdir(filepath):
                # Descend into sub-directories
                get_all_files(filepath)
            elif os.path.isfile(filepath):
                # Record plain files; filtering happens afterwards
                all_file_path.append(filepath)

    get_all_files(root)
    # Keep only files whose extension is one of the wanted ones
    return [p for p in all_file_path
            if os.path.split(p)[-1].split('.')[-1].lower() in wanted]
# 加载meta文件
def load_meta(meta_path):
    """Parse a metafile into a list of [cls_id, path] string pairs."""
    with open(meta_path, 'r') as handle:
        raw_lines = handle.readlines()
    return [entry.strip().split('|') for entry in raw_lines]
# 加载图片
def load_image(image_path):
    """Open *image_path* and return it as a PIL Image object."""
    img = Image.open(image_path)
    return img
# 构建类别到id的映射
cls_mapper =
"clsToid": "A": 0, "B": 1, "C": 2, "Five": 3, "Point": 4, "V": 5,
"idTocls": 0: "A", 1: "B", 2: "C", 3: "Five", 4: "Point", 5: "V"
if not os.path.exists(parameters.cls_mapper_path):
json.dump(cls_mapper, open(parameters.cls_mapper_path, 'w'))
train_items = recursive_fetching(parameters.train_data_root, 'ppm') # 获取Marcel-Train文件夹下数据路径
test_items = recursive_fetching(parameters.test_data_root, 'ppm') # 获取Marcel-Test文件夹下数据路径
dataset = train_items + test_items # 合并
random.shuffle(dataset) # 打乱数据集
dataset_num = len(dataset)
print("数据集总数目:", dataset_num)
"""
最终dataset_dict大概长这样子
dataset_dict =
0: ["./data/Marcel-Test/A/complex/A-complex32.ppm", "./data/Marcel-Test/A/complex/A-complex31.ppm", ...]
1: ["./data/Marcel-Train/B/B-train119.ppm", "./data/Marcel-Test/B/uniform/B-uniform04.ppm", ...]
...
5: [...]
"""
dataset_dict =
for it in dataset:
# 例如"./data/Marcel-Train/B/B-train119.ppm",cls_name就是B, cls_id就是1
cls_name = os.path.split(it)[-1].split('-')[0]
cls_id = cls_mapper["clsToid"][cls_name]
# 例如,把所有属于B的训练数据和图片数据放到一个列表中,该列表的k值为1
if cls_id not in dataset_dict:
dataset_dict[cls_id] = [it]
else:
dataset_dict[cls_id].append(it)
# 每个列表按照比例分配到train、eval、test中
train_ratio, eval_ratio, test_ratio = 0.8, 0.1, 0.1
train_set, eval_set, test_set = [], [], []
for idx, set_list in dataset_dict.items():
length = len(set_list)
train_num, eval_num = int(length*train_ratio), int(length*eval_ratio)
test_num = length - train_num - eval_num
random.shuffle(set_list)
train_set.extend(set_list[:train_num])
eval_set.extend(set_list[train_num:train_num+eval_num])
test_set.extend(set_list[train_num+eval_num:])
random.shuffle(train_set)
random.shuffle(eval_set)
random.shuffle(test_set)
# print(train_set)
# print(eval_set)
# print(test_set)
# print(len(train_set) + len(eval_set) + len(test_set))
# 写入metafile
with open(parameters.metadata_train_path, 'w') as fw:
for path in train_set:
cls_name = os.path.split(path)[-1].split('-')[0]
cls_id = cls_mapper["clsToid"][cls_name]
fw.write("%d|%s\\n" % (cls_id, path))
with open(parameters.metadata_eval_path, 'w') as fw:
for path in eval_set:
cls_name = os.path.split(path)[-1].split('-')[0]
cls_id = cls_mapper["clsToid"][cls_name]
fw.write("%d|%s\\n" % (cls_id, path))
with open(parameters.metadata_test_path, 'w') as fw:
for path in test_set:
cls_name = os.path.split(path)[-1].split('-')[0]
cls_id = cls_mapper["clsToid"][cls_name]
fw.write("%d|%s\\n" % (cls_id, path))
# 测试,看一下所有图片的颜色模式和对应大小
mode_set, size_set = [], []
for _, path in load_meta(parameters.metadata_train_path):
img = load_image(path)
mode_set.append(img.mode)
size_set.append(img.size)
print(set(mode_set), set(size_set))
结果显示这批图片颜色模式均为RGB,但是大小各异,所以在后面加载的时候需要统一图片大小
(2)MyDataset编写
Pytorch中的DataLoader
在需要传入你自己封装好的MyDataset
,它是torch.utils.data.Dataset
的子类,至少重写__getitem__(self, idx)
、__len__(self)
方法。另外在编写MyDataset
时我们还要引入数据增强,也即transforms.Compose
import torch
from torch.utils.data import DataLoader
from Parameters import parameters
# Fixed: the blog had "from preporcess import ..." — the module is preprocess.py
from preprocess import load_meta, load_image
from torchvision import transforms

# Augmenting pipeline applied to train/eval samples
transform_train = transforms.Compose(
    [
        transforms.Resize((112, 112)),                # force 112x112 input
        transforms.RandomRotation(degrees=45),        # robustness to tilted hands
        transforms.GaussianBlur(kernel_size=(3, 3)),  # robustness to blurry images
        transforms.RandomHorizontalFlip(),            # left vs right hands
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),  # ImageNet stats
    ]
)

# Deterministic pipeline for test samples: resize + normalize only, no augmentation
transform_test = transforms.Compose(
    [
        transforms.Resize((112, 112)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
)
class MyDataset(torch.utils.data.Dataset):
    """Dataset over a metafile of "cls_id|image_path" lines."""

    def __init__(self, metadata_path):
        # Parsed records: [(cls_id_str, image_path), ...]
        self.dataset = load_meta(metadata_path)
        self.metadata_path = metadata_path

    def __getitem__(self, idx):
        record = self.dataset[idx]
        label, path = int(record[0]), record[1]
        image = load_image(path)
        # Train and eval samples are augmented; test samples are not
        augmented = (self.metadata_path == parameters.metadata_train_path
                     or self.metadata_path == parameters.metadata_eval_path)
        pipeline = transform_train if augmented else transform_test
        return pipeline(image), label

    def __len__(self):
        return len(self.dataset)
(3)模型编写
import torch
import torch.nn as nn
import torch.nn.functional as F
from Parameters import parameters
# 定义mish激活函数
def mish(x):
    """Mish activation: x * tanh(softplus(x))."""
    gate = torch.tanh(F.softplus(x))
    return gate * x
# 封装mish激活函数
class Mish(nn.Module):
    """nn.Module wrapper so mish can sit inside nn.Sequential."""

    def __init__(self):
        super(Mish, self).__init__()

    def forward(self, tensor_in):
        # Delegate to the module-level mish() helper
        return mish(tensor_in)
# 深度可分离卷积
class DSConv2d(nn.Module):
    """Depthwise-separable convolution: a depthwise kxk conv followed by a
    1x1 pointwise conv. 'Same' padding keeps the spatial size unchanged.

    Parameters
    ----------
    in_channels : int
    out_channels : int
    kernel_size : int
        Must be odd so that symmetric padding preserves the spatial size.
    """

    def __init__(self, in_channels, out_channels, kernel_size):
        super(DSConv2d, self).__init__()
        # kernel_size must be odd for symmetric 'same' padding
        assert kernel_size % 2 == 1, "kernel_size必须为奇数"
        # Depthwise stage: one filter per input channel, so it must output
        # in_channels. The original used out_channels here, which breaks
        # (channel mismatch into the pointwise conv) whenever
        # out_channels != in_channels.
        self.depth_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=in_channels,
            kernel_size=kernel_size,
            padding=(kernel_size // 2, kernel_size // 2),
            groups=in_channels
        )
        # Pointwise 1x1 stage mixes channels and maps to out_channels
        self.pointwise_conv = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1
        )

    def forward(self, input_x):
        out = self.depth_conv(input_x)
        out = self.pointwise_conv(out)
        return out
# 编写MTB模块(残差网络)
class MTB(nn.Module):
    """Two-branch residual block.

    Left branch:  1x1 conv -> BN -> Mish -> depthwise-separable 3x3 -> BN
                  -> Mish -> 7x7 conv.
    Right branch: the same layers in the reverse order (7x7 first, 1x1 last).
    Output: mish(left + right + input). All convs keep the channel count and
    spatial size unchanged.
    """

    def __init__(self, in_channels):
        super(MTB, self).__init__()
        ch = in_channels
        self.left_flow = nn.Sequential(
            # pointwise conv
            nn.Conv2d(in_channels=ch, out_channels=ch, kernel_size=1),
            nn.BatchNorm2d(ch),
            Mish(),
            # depthwise-separable conv
            DSConv2d(in_channels=ch, out_channels=ch, kernel_size=3),
            nn.BatchNorm2d(ch),
            Mish(),
            # 7x7 conv with 'same' padding
            nn.Conv2d(in_channels=ch, out_channels=ch, kernel_size=7, padding=(7 // 2, 7 // 2)),
        )
        self.right_flow = nn.Sequential(
            # 7x7 conv with 'same' padding
            nn.Conv2d(in_channels=ch, out_channels=ch, kernel_size=7, padding=(7 // 2, 7 // 2)),
            nn.BatchNorm2d(ch),
            Mish(),
            # depthwise-separable conv
            DSConv2d(in_channels=ch, out_channels=ch, kernel_size=3),
            nn.BatchNorm2d(ch),
            Mish(),
            # pointwise conv
            nn.Conv2d(in_channels=ch, out_channels=ch, kernel_size=1),
        )

    def forward(self, input_ft):
        residual = input_ft
        merged = self.left_flow(input_ft) + self.right_flow(input_ft) + residual
        return mish(merged)
# [N, 3, 112, 112] -> [N, 256, 7, 7]
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(in_channels=parameters.data_channels, out_channels=64, kernel_size=3, padding=(3//2, 3//2)),
nn.BatchNorm2d(64),
Mish(),
# MTB模块
MTB(in_channels=64),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=(3 // 2, 3 // 2)),
nn.BatchNorm2d(128),
Mish(),
# MTB模块
MTB(in_channels=128),
nn.MaxPool2d(kernel_size=2, stride=2),
nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=(3 // 2, 3 // 2)),
nn.BatchNorm2d(256),
Mish(),
# MTB模块
MTB(in_channels=256),
nn.MaxPool2d(kernel_size=2, stride=2),
MTB(in_channels=256),
MTB(in_channels以上是关于第七节:CNN练习2手势识别的主要内容,如果未能解决你的问题,请参考以下文章