将主数据目录拆分为训练/验证/测试集

Posted 2023-02-16

技术标签:

【中文标题】将主数据目录拆分为训练/验证/测试集【英文标题】：To split the main data directory into Train/validation/test Set 【发布时间】：2021-04-19 07:03:14 【问题描述】：

我正在研究 X 射线图像分类，我的数据存储在 1 个目录中，我需要将其分为训练集、验证集和测试集。我已经设法使用 ImageDataGenerator 分离出了训练集和验证集，但在分离测试集时遇到了麻烦。这是我的代码。

# Script from the question: builds training and validation generators from a
# single image directory with Keras' ImageDataGenerator and trains a small CNN.
# No test set is produced anywhere in this script.
# NOTE(review): `os`, `tf`, `keras`, `Sequential`, `Conv2D`, `MaxPooling2D`,
# `Flatten` and `Dense` are used below but not imported in this snippet --
# presumably the imports were omitted when the question was posted.
import split  # in this snippet only the commented-out split.ratio call uses it

# Path
Images = 'data_processed_cropped_32'
data_set = os.path.join(r'C:\Users\320067835\Desktop\Thesis\Data\png', Images)

# Disabled attempt at an 80/10/10 train/val/test split into an "output" folder.
#split.ratio('data_processed_cropped_32', output="output", seed=1337, ratio=(0.8, 0.1,0.1))

# Image size
img_width = 32
img_height = 32

# Data augmentation
# validation_split=0.2 is what makes the 'training' / 'validation' subsets
# below available; both subsets come from this same generator, so the
# rescaling and horizontal flip settings apply to both.
data_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale = 1/255, horizontal_flip = True,
                                                            rotation_range = 0,validation_split=0.2)

# NOTE(review): Keras documents target_size as (height, width); it is passed
# as (img_width, img_height) here, which is harmless only because both are 32.
train_set = data_gen.flow_from_directory(data_set, target_size = (img_width, img_height), color_mode = 'grayscale',
                                        class_mode = 'categorical', batch_size = 32, interpolation = 'nearest',
                                        subset ='training')

validation_set = data_gen.flow_from_directory(data_set, target_size= (img_width,img_height), color_mode='grayscale',
                                              batch_size=32, class_mode='categorical', interpolation= 'nearest',
                                              subset='validation')
# Build a model
cnn = Sequential()

# Single-channel 32x32 input, matching color_mode='grayscale' and the target size.
# NOTE(review): the input_shape arguments on the Conv2D layers repeat what
# keras.Input already declares -- looks redundant, confirm before removing.
cnn.add(keras.Input(shape = (32,32,1)))
cnn.add(Conv2D(16,(3,3), padding = 'same', activation = 'relu', input_shape= (img_width,img_height,1)))
cnn.add(MaxPooling2D(2,2))
cnn.add(Conv2D(32,(3,3), padding = 'same',activation = 'relu', input_shape= (img_width, img_height,1)))
cnn.add(MaxPooling2D(2,2))

cnn.add(Flatten())

cnn.add(Dense(units = 100, activation = 'relu'))
cnn.add(Dense(units = 50, activation = 'relu'))
# 23 softmax outputs; the asker mentions having 23 classes in the discussion.
cnn.add(Dense(units=23, activation = 'softmax'))
cnn.summary()
cnn.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

# Train for 20 epochs, evaluating on the validation subset after each epoch.
cnn.fit(train_set,validation_data = validation_set,epochs = 20)

我尝试过使用 split-folders 库，但没有成功。我想多半是我用得不对，因为我不知道拆分数据后该如何访问生成的 3 个文件夹。或者，有没有其他方法可以拆分出测试集？

【问题讨论】：

This 可能会有所帮助。我看过了，但我总共有 23 个类别，我不确定用这种方式拆分训练集和测试集是否合适。对于类别较少的情况，这似乎还是可行的。 【参考方案1】：

我经常需要这样做，我开发了一个彻底的函数来完成拆分。它相当冗长，因为它做了很多检查等。代码发布在下面。

import os
import shutil
from tqdm import tqdm
from sklearn.model_selection import train_test_split

def _prepare_split_dir(d_path):
    """Make sure the split directory *d_path* exists and is ready to receive files.

    A missing directory is created. If the directory already holds content the
    user is asked interactively whether to delete it (D), keep it (C) or quit (Q).

    Returns:
        False when the user chose to quit, True otherwise.
    """
    if not os.path.isdir(d_path):
        os.mkdir(d_path)
        return True
    d_list = os.listdir(d_path)
    if not d_list:
        return True
    print('*** WARNING***  there is content in ', d_path)
    while True:
        ans = input(' enter D to delete content, C to continue and keep content or Q to Quit ')
        if ans in ('D', 'd', 'C', 'c', 'Q', 'q'):
            break
        print('your response ', ans, ' was not a  D, C or Q, try again')
    if ans in ('Q', 'q'):
        print('**** PROCESS TERMINATED BY USER ****')
        return False
    if ans in ('D', 'd'):
        print(' Removing all files and sub directories in ', d_path)
        for f in d_list:
            f_path = os.path.join(d_path, f)
            if os.path.isdir(f_path):
                shutil.rmtree(f_path)
            else:
                os.remove(f_path)
    return True


def _split_pair(items, fraction):
    """Split *items* into ``(first, rest)`` with ``floor(fraction * len(items))`` in *first*.

    Degenerate cases (nothing or everything lands in *first*) are handled here
    because ``train_test_split`` raises ValueError for them; every other case is
    delegated to ``train_test_split`` with the fixed seed 123.
    """
    n_first = int(fraction * len(items))
    if n_first <= 0:
        return [], list(items)
    if n_first >= len(items):
        return list(items), []
    # An absolute count selects the same number of items as the float fraction
    # would, without a second floating point rounding inside sklearn.
    first, rest = train_test_split(items, train_size=n_first, random_state=123)
    return first, rest


def tr_te_val_split(s_dir, dest_dir, train_size, test_size):
    """Copy the class folders of *s_dir* into train/test/valid folders in *dest_dir*.

    Every sub directory of *s_dir* is treated as one class. Its files are
    partitioned with a fixed random seed and copied to
    ``dest_dir/train/<class>``, ``dest_dir/test/<class>`` and
    ``dest_dir/valid/<class>``. Whatever is not assigned to train or test goes
    to valid. Entries of *s_dir* that are not directories, and sub directories
    inside a class directory, are ignored.

    Per class the train share is ``floor(train_size * n)`` files, so a class
    with very few files may end up with an empty train or test folder.

    Invalid arguments are reported with a printed message and the function
    returns without touching *dest_dir*. If a split directory already has
    content the user is prompted to delete it, keep it or quit.

    Args:
        s_dir: path of the directory holding one sub directory per class.
        dest_dir: path of the destination directory; created if missing.
        train_size: fraction (0.0 - 1.0) of each class used for training.
        test_size: fraction (0.0 - 1.0) of each class used for testing;
            ``train_size + test_size`` must not exceed 1.

    Returns:
        None. A summary with the number of files per split is printed.
    """
    if train_size < 0 or train_size > 1:
        print('*** Train size must be a float between 0.0 and 1.0, process terminated ***')
        return
    if test_size < 0 or test_size > 1:
        print('*** Test size must be a float between 0.0 and 1.0, process terminated ***')
        return
    if test_size + train_size > 1:
        print('*** The sum of the train size plus the test size must be <= 1, process terminating ***')
        return
    # Check the source before the destination is created or cleared, so a typo
    # in s_dir cannot cost the user the content of an existing destination.
    if not os.path.isdir(s_dir):
        print('*** The source directory ', s_dir, ' does not exist, process terminated ***')
        return

    # The test files are taken from what is left after the train split, so the
    # test fraction is rescaled to that remainder. Rounding and clamping remove
    # float noise such as 0.2 / (1 - 0.8) == 1.0000000000000002.
    remainder = 1 - train_size
    if remainder > 0:
        test_fraction = min(1.0, round(test_size / remainder, 12))
    else:
        test_fraction = 0.0  # everything goes to train, nothing is left to split

    splits = ('train', 'test', 'valid')
    if not os.path.isdir(dest_dir):
        os.makedirs(dest_dir)
        print('The dest_dir you specified ', dest_dir, ' does not exist, created it for you ')
    for d in splits:
        if not _prepare_split_dir(os.path.join(dest_dir, d)):
            return

    class_list = [k for k in os.listdir(s_dir) if os.path.isdir(os.path.join(s_dir, k))]
    for klass in tqdm(class_list):
        klass_path = os.path.join(s_dir, klass)
        f_list = [f for f in os.listdir(klass_path)
                  if os.path.isfile(os.path.join(klass_path, f))]
        ftrain, ftv = _split_pair(f_list, train_size)
        ftest, fvalid = _split_pair(ftv, test_fraction)
        for d, fx in zip(splits, (ftrain, ftest, fvalid)):
            d_class_path = os.path.join(dest_dir, d, klass)
            os.makedirs(d_class_path, exist_ok=True)
            for f in fx:
                shutil.copy(os.path.join(klass_path, f), os.path.join(d_class_path, f))

    # Count what is actually on disk, which includes files kept from an
    # earlier run when the user answered C. Every split gets a count, even
    # when it holds no class directories at all.
    counts = {}
    for d in splits:
        d_path = os.path.join(dest_dir, d)
        total = 0
        for klass in os.listdir(d_path):
            klass_path = os.path.join(d_path, klass)
            if os.path.isdir(klass_path):
                total += len(os.listdir(klass_path))
        counts[d] = total
    print('Process Completed ', counts['train'], ' training files ', counts['test'],
          ' test files and ', counts['valid'], ' validation files were partitioned')

此函数将 s_dir 中的文件拆分为训练、测试和验证文件，并存储在 dest_dir 中。s_dir 是包含待拆分文件的目录的完整路径；dest_dir 是目标目录的完整路径，如果它不存在，则会创建它。train_size 是介于 0.0 和 1.0 之间的浮点数，表示分配为训练文件的文件比例；test_size 是介于 0.0 和 1.0 之间的浮点数，表示分配为测试文件的文件比例。函数会在 dest_dir 中创建三个子目录“train”、“test”和“valid”，分别用于存储训练文件、测试文件和验证文件。如果这些子目录已经存在，则检查其中是否已有内容；如果有，则打印相应的提示，然后提示用户输入“D”删除内容、“Q”终止程序执行或“C”继续。如果选择“C”，则不会删除已有内容，但若已有文件与新添加到子目录的文件同名，则可能被覆盖。注意：如果 test、train 和 valid 目录已存在且有内容，而用户选择“C”继续，则 s_dir 中的子目录和文件会被追加到 dest_dir 下 test、train 和 valid 子目录的现有内容中。此函数使用了 tqdm 和 sklearn，它们必须已安装在您的工作环境中。

【讨论】：

以上是关于将主数据目录拆分为训练/验证/测试集的主要内容，如果未能解决你的问题，请参考以下文章