将主数据目录拆分为训练/验证/测试集
Posted
技术标签:
【中文标题】将主数据目录拆分为训练/验证/测试集【英文标题】:To split the main data directory into Train/validation/test Set 【发布时间】:2021-04-19 07:03:14 【问题描述】:我正在研究 X 射线图像分类,我的数据存储在 1 个目录中,我需要将其分为训练集、验证集和测试集。我确实设法使用 ImageDataGenerator 分离了训练集和验证集,但在分离测试集时遇到了麻烦。这是我的代码。
# NOTE: the split-folders package is imported as `splitfolders`
# (pip install split-folders); `import split` refers to an unrelated module.
import splitfolders

# Path to the single source directory that holds one sub-folder per class.
Images = 'data_processed_cropped_32'
data_set = os.path.join(r'C:\Users\320067835\Desktop\Thesis\Data\png', Images)

# One-time 80/10/10 split into output/train, output/val and output/test.
# Run once, then point flow_from_directory at those three folders.
# splitfolders.ratio('data_processed_cropped_32', output="output", seed=1337, ratio=(0.8, 0.1, 0.1))

# Image size
img_width = 32
img_height = 32

# Data augmentation; validation_split reserves 20% of the images so the same
# generator can serve both the 'training' and 'validation' subsets below.
data_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1 / 255, horizontal_flip=True,
                                                           rotation_range=0, validation_split=0.2)
train_set = data_gen.flow_from_directory(data_set, target_size=(img_width, img_height), color_mode='grayscale',
                                         class_mode='categorical', batch_size=32, interpolation='nearest',
                                         subset='training')
validation_set = data_gen.flow_from_directory(data_set, target_size=(img_width, img_height), color_mode='grayscale',
                                              batch_size=32, class_mode='categorical', interpolation='nearest',
                                              subset='validation')

# Build a small CNN: two conv/pool stages followed by a three-layer classifier head.
cnn = Sequential()
cnn.add(keras.Input(shape=(img_width, img_height, 1)))
# input_shape is only honoured on the first layer, so it is omitted on the Conv2D
# layers (the explicit keras.Input above already fixes the input shape).
cnn.add(Conv2D(16, (3, 3), padding='same', activation='relu'))
cnn.add(MaxPooling2D(2, 2))
cnn.add(Conv2D(32, (3, 3), padding='same', activation='relu'))
cnn.add(MaxPooling2D(2, 2))
cnn.add(Flatten())
cnn.add(Dense(units=100, activation='relu'))
cnn.add(Dense(units=50, activation='relu'))
cnn.add(Dense(units=23, activation='softmax'))  # 23 classes in the dataset
cnn.summary()
cnn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn.fit(train_set, validation_data=validation_set, epochs=20)
我尝试过使用 split-folders(拆分文件夹)库,但它不起作用。我认为很可能是我没有正确使用它,因为我不知道在拆分数据后如何访问这 3 个文件夹。或者有没有其他方法可以拆分出我的测试集?
【问题讨论】:
This 可能会有所帮助。 我看过了,但我总共有 23 个类别,我不确定以这种方式拆分训练集和测试集是否有意义。不过对于少数几个类别来说,这似乎仍然可行。【参考方案1】:我经常需要这样做,因此开发了一个完善的函数来完成拆分。它相当冗长,因为它做了很多检查等。代码发布在下面。
import os
import shutil
from tqdm import tqdm
from sklearn.model_selection import train_test_split
def tr_te_val_split(s_dir, dest_dir, train_size, test_size):
    """Copy the files of s_dir into train/test/valid directory trees under dest_dir.

    s_dir      : full path of a source directory containing one sub-directory per class
    dest_dir   : full path of the destination directory; created if it does not exist
    train_size : float in [0.0, 1.0) - fraction of each class's files placed in 'train'
    test_size  : float in [0.0, 1.0] - fraction of each class's files placed in 'test';
                 the remainder (1 - train_size - test_size) is placed in 'valid'

    Creates dest_dir/train, dest_dir/test and dest_dir/valid, each mirroring the
    class sub-directories of s_dir.  Source files are copied, never moved.  If any
    of the three target directories already contain files, the user is prompted to
    Delete the content, Continue (possibly overwriting same-named files), or Quit.
    Prints a summary of the file counts per split on completion; returns None.
    Requires tqdm and scikit-learn to be installed.
    """
    if train_size < 0 or train_size > 1:
        print('*** Train size must be a float between 0.0 and 1.0, process terminated ***')
        return
    if test_size < 0 or test_size > 1:
        print('*** Test size must be a float between 0.0 and 1.0, process terminated ***')
        return
    if test_size + train_size > 1:
        print('*** The sum of the train size plus the test size must be <= 1, process terminating ***')
        return
    remainder = 1 - train_size  # fraction available for test and validation
    if remainder == 0:
        # train_size == 1.0 passed the checks above but would divide by zero
        # below and leave nothing for the test/valid splits
        print('*** Train size must be less than 1.0 so files remain for the test and validation sets, process terminated ***')
        return
    test_size = test_size / remainder  # test's share of the non-train remainder
    if test_size >= 1:
        # train_size + test_size == 1 leaves an empty validation split, which
        # the second train_test_split below cannot produce
        print('*** The sum of the train size plus the test size must leave room for a validation set, process terminated ***')
        return
    if not os.path.isdir(dest_dir):
        os.mkdir(dest_dir)
        print('The dest_dir you specified ', dest_dir, ' does not exist, created it for you ')
    dest_list = os.listdir(dest_dir)  # list content of destination directory
    for d in ['train', 'test', 'valid']:
        d_path = os.path.join(dest_dir, d)
        if d not in dest_list:
            os.mkdir(d_path)  # create train, test and valid directories in the destination directory
        else:  # check to see if there are any files in these directories
            d_list = os.listdir(d_path)
            if len(d_list) > 0:  # there are files or directories in d
                cycle = True
                print('*** WARNING*** there is content in ', d_path)
                while cycle:
                    ans = input(' enter D to delete content, C to continue and keep content or Q to Quit ')
                    if ans not in ['D', 'd', 'C', 'c', 'Q', 'q']:
                        print('your response ', ans, ' was not a D, C or Q, try again')
                    else:
                        cycle = False
                if ans in ['Q', 'q']:
                    print('**** PROCESS TERMINATED BY USER ****')
                    return
                elif ans in ['D', 'd']:
                    print(' Removing all files and sub directories in ', d_path)
                    for f in d_list:
                        f_path = os.path.join(d_path, f)
                        if os.path.isdir(f_path):
                            shutil.rmtree(f_path)
                        else:
                            os.remove(f_path)
    class_list = os.listdir(s_dir)  # list of classes
    for klass in tqdm(class_list):  # iterate through the classes
        klass_path = os.path.join(s_dir, klass)  # path to class directory
        f_list = os.listdir(klass_path)  # get the list of file names
        # carve off the train fraction first, then split the leftover into test/valid
        ftrain, ftv = train_test_split(f_list, train_size=train_size, random_state=123)
        ftest, fvalid = train_test_split(ftv, train_size=test_size, random_state=123)
        for d in ['train', 'test', 'valid']:
            d_path = os.path.join(dest_dir, d)
            d_class_path = os.path.join(d_path, klass)
            if not os.path.isdir(d_class_path):
                os.mkdir(d_class_path)
            if d == 'train':
                fx = ftrain
            elif d == 'test':
                fx = ftest
            else:
                fx = fvalid
            for f in fx:
                f_path = os.path.join(klass_path, f)
                d_f_path = os.path.join(d_class_path, f)
                shutil.copy(f_path, d_f_path)  # copy, leaving the source intact
    # tally and report the number of files placed in each split
    for d in ['train', 'test', 'valid']:
        file_count = 0
        d_path = os.path.join(dest_dir, d)
        d_list = os.listdir(d_path)
        for klass in d_list:
            klass_path = os.path.join(d_path, klass)
            klass_list = os.listdir(klass_path)
            d_count = len(klass_list)
            file_count = file_count + d_count
        if d == 'train':
            tr_count = file_count
        elif d == 'test':
            te_count = file_count
        else:
            tv_count = file_count
    print('Process Completed ', tr_count, ' training files ', te_count, ' test files and ', tv_count, ' validation files were partitioned')
此函数将 s_dir 中的文件拆分为存储在 dest_dir 中的训练、测试和验证文件。s_dir 是包含要拆分的文件的目录的完整路径;dest_dir 是目标目录的完整路径,如果它不存在,则会创建它。train_size 是介于 0.0 和 1.0 之间的浮点数,表示要分配为训练文件的文件比例;test_size 是介于 0.0 和 1.0 之间的浮点数,表示要分配为测试文件的文件比例。在 dest_dir 中会创建三个子目录“train”、“test”和“valid”,分别用于存储训练文件、测试文件和验证文件。如果这些子目录已经存在,则检查其现有内容;如果找到内容,则会打印相应的通知,然后提示用户输入“D”删除内容、“Q”终止程序执行或“C”继续。如果选择了“C”,内容不会被删除,但如果已有文件与新加入子目录的文件同名,则可能被覆盖。注意:若 test、train 和 valid 目录已存在且有内容,并且用户选择“C”继续,则 s_dir 中的子目录和文件会被追加到 dest_dir 下 test、train 和 valid 子目录的现有内容中。此函数使用了 tqdm 和 sklearn,二者必须已安装在您的工作环境中。
【讨论】:
以上是关于将主数据目录拆分为训练/验证/测试集的主要内容,如果未能解决你的问题,请参考以下文章