原始数据划分以及TFrecords实战

Posted 2020-11-06 stoner
tags:
篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了原始数据划分以及TFrecords实战相关的知识，希望对你有一定的参考价值。
1. 前言

　　本次代码实现对自己的数据进行训练集、验证集和测试集的划分，并将这三个集合分别制作成 .tfrecords 文件。其中原始图片是从 Kaggle 经典的猫狗大战训练集中猫、狗各抽出 100 张图片组合而成的。
　　图片总数为 200 张，训练集设定为总数据的 70%，验证集为总数据的 20%，测试集为总数据的 10%。
2. 实际代码

#_*_ coding:utf-8 _*_
"""
@author:Stoner
@time:2018/5/1722:14
"""
import os
import numpy as np
import math
from sklearn.model_selection import train_test_split
import tensorflow as tf
from PIL import Image
import matplotlib.pyplot as plt

# Directory that holds the source images.
data_path = 'Cat_vs_Dog/'
# Fractions of the data used for the training and validation sets;
# whatever is left over becomes the test set.
train_size, val_size = 0.7, 0.2
# Directory in which the generated TFRecords files are stored.
tfrecords_path = 'Train_val_test_tfrecords/'
# Available splits, and the one that is converted to / read from TFRecords.
tfrecords_list = ['train', 'val', 'test']
tfrecords_choise = tfrecords_list[-1]
# Full path of the .tfrecords file for the selected split.
filename = os.path.join(tfrecords_path, '{}.tfrecords'.format(tfrecords_choise))
# Number of examples per batch when reading the records back.
BATCH_SIZE = 4


def getDatafile(data_path,train_size,val_size):
    """Collect the images under ``data_path`` and split them into three sets.

    Every file found below ``data_path`` (recursively) is treated as an image.
    Its label is the first character of its path relative to ``data_path``,
    so that character must be a digit (for example a class sub-folder named
    ``0``/``1`` or a file name starting with the class id).

    Args:
        data_path: Root directory that is walked for image files.
        train_size: Fraction of all samples assigned to the training set.
        val_size: Fraction of all samples assigned to the validation set.
            The samples that remain after both slices form the test set.

    Returns:
        A dict with the keys ``train_img``, ``train_labels``, ``val_img``,
        ``val_labels``, ``test_img`` and ``test_labels``. The ``*_img``
        values are NumPy arrays of path strings, the ``*_labels`` values are
        lists of ints aligned with them.

    Raises:
        ValueError: If the label character of a path is not a digit.
    """
    # Walk the directory that was actually requested; the label lookup below
    # is relative to it, so any directory name works.
    images_path = []
    for root, _sub_folders, files in os.walk(data_path):
        for name in files:
            images_path.append(os.path.join(root, name))

    # Label = first character after the data_path prefix (separator-agnostic).
    labels = []
    for image_path in images_path:
        relative = os.path.relpath(image_path, data_path).replace('\\', '/')
        labels.append(int(relative[0]))
    print('labels:\n',labels)

    # Stack paths and labels, then transpose so each row is (path, label).
    # NumPy stores both columns as strings, hence the int() conversion later.
    stacked = np.array([images_path, labels])
    print('np.array([images_path, labels]):\n',stacked)
    temp = stacked.transpose()
    print('temp:\n',temp)
    # Shuffle the rows in place so the splits are random.
    np.random.shuffle(temp)

    images_data_list = temp[:, 0]    # image paths
    labels_data_list = temp[:, 1]    # labels (as strings)

    # Split sizes are rounded up; the test set takes whatever is left.
    train_num = math.ceil(len(temp) * train_size)
    val_num = math.ceil(len(temp) * val_size)
    val_end = train_num + val_num

    train_img = images_data_list[:train_num]
    train_labels = [int(i) for i in labels_data_list[:train_num]]

    val_img = images_data_list[train_num:val_end]
    val_labels = [int(i) for i in labels_data_list[train_num:val_end]]

    test_img = images_data_list[val_end:]
    test_labels = [int(i) for i in labels_data_list[val_end:]]

    print('训练集数据：\n',len(train_img))
    print('测试集标签：\n',len(test_labels))

    # Bundle the three splits in one dict so callers can pick one by name.
    data = {
        'train_img':train_img,
        'train_labels':train_labels,
        'val_img':val_img,
        'val_labels':val_labels,
        'test_img':test_img,
        'test_labels':test_labels
    }
    return data


def _int64_feature(value):
    """Wrap a single integer ``value`` in a ``tf.train.Feature`` (int64_list)."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)
def _bytes_feature(value):
    """Wrap a single bytes ``value`` in a ``tf.train.Feature`` (bytes_list)."""
    bytes_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=bytes_list)


def create_record(data,data_path,tfrecords_path,tfrecords_choise):
    """Write one split of ``data`` to ``<tfrecords_path>/<split>.tfrecords``.

    Each image is opened, converted to RGB, resized to 208x208 and stored as
    raw bytes under ``img_raw`` together with its integer ``label``.

    Args:
        data: Dict holding ``<split>_img`` (array of image paths) and
            ``<split>_labels`` (list of ints) for the chosen split.
        data_path: Unused; kept so existing callers keep working.
        tfrecords_path: Directory that receives the .tfrecords file. It is
            created if it does not exist.
        tfrecords_choise: Name of the split to write, e.g. ``'train'``.

    Raises:
        KeyError: If ``data`` has no entry for the chosen split.
    """
    # Look the split up first so a bad name does not leave an empty file.
    choice_data = data[tfrecords_choise + '_img']
    choice_labels = data[tfrecords_choise + '_labels']

    # Show which split was selected and how large it is.
    print('选择的数据集是：',tfrecords_choise)
    print('%s集的大小为：'%tfrecords_choise,choice_data.shape)

    if tfrecords_path:
        os.makedirs(tfrecords_path, exist_ok=True)
    # os.path.join matches how the module-level ``filename`` is built and
    # works whether or not tfrecords_path ends with a separator.
    record_file = os.path.join(tfrecords_path, tfrecords_choise + '.tfrecords')

    # The context manager closes the writer even if an image fails to load.
    with tf.python_io.TFRecordWriter(record_file) as writer:
        for img_path, label in zip(choice_data, choice_labels):
            with Image.open(img_path) as img:
                # Force three channels: the reader reshapes every record to
                # [208, 208, 3], so grayscale/RGBA/palette images would
                # otherwise produce a byte string of the wrong length.
                resized = img.convert('RGB').resize((208, 208))
            img_raw = resized.tobytes()  # raw pixel bytes
            example = tf.train.Example(features = tf.train.Features(feature = {
                                        "label": _int64_feature(label),
                                        "img_raw": _bytes_feature(img_raw),
                                        }))
            writer.write(example.SerializeToString())


def read_and_decode(filename, batch_size):
    """Build the graph ops that read shuffled batches from a .tfrecords file.

    Args:
        filename: Path of the .tfrecords file to read.
        batch_size: Number of examples per batch.

    Returns:
        A tuple ``(img_batch, label_batch)`` of tensors. ``img_batch`` holds
        float32 images of shape [208, 208, 3] scaled by 1/255, and
        ``label_batch`` holds int32 labels reshaped to ``[batch_size]``.
    """
    # Queue of input file names feeding the record reader.
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TFRecordReader()
    # read() returns (key, value); only the serialized example is needed.
    _, serialized_example = reader.read(filename_queue)
    # Parse the two features written to each record: 'label' and 'img_raw'.
    features = tf.parse_single_example(serialized_example,
                        features={
                        'label': tf.FixedLenFeature([], tf.int64),
                        'img_raw' : tf.FixedLenFeature([], tf.string),
                        })

    # Decode the raw byte string back into uint8 pixels and restore the shape.
    img = tf.decode_raw(features['img_raw'], tf.uint8)
    img = tf.reshape(img, [208, 208, 3])
    # Normalise to [0, 1]. The original assigned this to a misspelled name
    # (`ima`), so the normalised tensor was silently dropped.
    img = tf.cast(img, tf.float32) * (1.0 / 255)
    label = tf.cast(features['label'], tf.int32)
    # Assemble shuffled batches.
    img_batch, label_batch = tf.train.shuffle_batch([img, label],
                                            batch_size= batch_size,
                                            num_threads=64,
                                            capacity=2000,
                                            min_after_dequeue=1500,
                                            )
    return img_batch, tf.reshape(label_batch,[batch_size])

# Script entry point: split the data, write the selected split to a
# TFRecords file, then read one batch back and display it.
if __name__ == '__main__':

    # Dict with the training, validation and test splits.
    data = getDatafile(data_path, train_size, val_size)
    # tfrecords_choise selects which split is written to a TFRecords file.
    create_record(data,data_path, tfrecords_path,tfrecords_choise)
    # Ops that yield batches read back from that file.
    image_batch, label_batch = read_and_decode(filename, BATCH_SIZE)
    # Print the batch tensors (shows their shape and dtype).
    print('image_batch.type:',image_batch)
    print('label_batch.type:', label_batch)

    with tf.Session()  as sess:
        i = 0
        # Start the queue-runner threads that feed the input pipeline.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop() and i<1:
                # Evaluate one batch of images and labels.
                images, labels = sess.run([image_batch, label_batch])
                print('image.type:',type(images))
                print('image.shape:',images.shape)
                plt.figure(figsize=(10, 8))
                # Two rows; the column count must be an int (true division
                # yields a float, which matplotlib rejects) and is rounded
                # up so odd batch sizes still fit.
                n_images = images.shape[0]
                n_cols = (n_images + 1) // 2
                for j in range(n_images):
                    plt.subplot(2, n_cols, j + 1)
                    plt.title('label: %d' % labels[j],fontsize = 16)
                    plt.imshow(images[j,:,:,:])
                plt.show()
                i+=1
        except tf.errors.OutOfRangeError:
            print('done!')
        finally:
            # Ask the threads to stop and wait for them, even on errors.
            coord.request_stop()
            coord.join(threads)
以上是关于原始数据划分以及TFrecords实战的主要内容，如果未能解决你的问题，请参考以下文章