Ray[tune] for pytorch TypeError: ray.cloudpickle.dumps
Posted
技术标签:
【中文标题】Ray[tune] for pytorch TypeError: ray.cloudpickle.dumps【英文标题】: 【发布时间】:2021-11-05 16:01:05 【问题描述】:我在开始使用 Ray Tune 时遇到了问题。我有一个要训练的 PyTorch 模型,我正尝试使用这个库对它进行微调。我是 Ray Tune 的新手,所以请多多包涵,并帮助我了解错误的根源。
我的训练函数:
> # Training and Validation
def train(resnet18, start_epoch=0, end_epoch=500, checkpoint_dir=None, optimizer=None, train_dataloader=None, val_dataloader=None):
    """Train and validate ``resnet18`` and return the best validation accuracy.

    The function relies on module-level names defined elsewhere in the file:
    ``device`` (target device for tensors), ``criterion`` (loss callable) and
    ``sw`` (an object exposing ``add_scalar(tag, value, step)``).

    Args:
        resnet18: Model to train; updated in place.
        start_epoch: First epoch index. When a checkpoint is loaded, training
            resumes from the epoch after the stored one if that is later.
        end_epoch: Exclusive upper bound of the epoch range.
        checkpoint_dir: Path handed to ``torch.load``; falsy means "start
            from scratch".
        optimizer: Optimizer stepping the model parameters.
        train_dataloader: Iterable of ``(inputs, labels)`` training batches.
        val_dataloader: Iterable of ``(inputs, labels)`` validation batches.

    Returns:
        The best validation accuracy seen, as a Python ``float``.
    """
    since = time.time()
    best_acc = 0.0

    # Load checkpoint.  The condition must test ``checkpoint_dir``: testing
    # ``checkpoint`` reads a local that is only assigned inside this branch.
    if checkpoint_dir:
        checkpoint = torch.load(checkpoint_dir, map_location=device)
        resnet18.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        # Resume after the stored epoch instead of discarding it.
        start_epoch = max(start_epoch, checkpoint['epoch'] + 1)
        # float() accepts both a plain number and a one-element tensor.
        best_acc = float(checkpoint['best_val_acc'])
        # NOTE(review): the pasted source lost its indentation; this reset is
        # assumed to belong to the resume branch - confirm.  It overrides the
        # learning rate restored from the checkpoint.
        optimizer.param_groups[0]['lr'] = 0.001

    for epoch in range(start_epoch, end_epoch):
        print(f'Epoch {epoch}/{end_epoch - 1}')

        # ---- training pass ----
        resnet18.train()
        running_loss, running_corrects, n_seen = 0.0, 0, 0
        for inputs, labels in train_dataloader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = resnet18(inputs)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)
            loss.backward()
            optimizer.step()
            batch_size = inputs.size(0)
            running_loss += loss.item() * batch_size
            running_corrects += torch.sum(preds == labels).item()
            n_seen += batch_size
        # Normalise by the samples actually processed so the figures stay
        # correct when a sampler draws a different number of items than the
        # dataset holds; max() guards against an empty loader.
        epoch_loss = running_loss / max(n_seen, 1)
        epoch_acc = running_corrects / max(n_seen, 1)
        sw.add_scalar("train/loss", epoch_loss, epoch)
        sw.add_scalar("train/Acc", epoch_acc, epoch)

        # ---- validation pass ----
        resnet18.eval()
        running_loss, running_corrects, n_seen = 0.0, 0, 0
        with torch.no_grad():  # no gradients are needed for evaluation
            for inputs, labels in val_dataloader:
                # Batches must live on the same device as the model.
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = resnet18(inputs)
                loss = criterion(outputs, labels)
                _, preds = torch.max(outputs, 1)
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += torch.sum(preds == labels).item()
                n_seen += batch_size
        epoch_val_loss = running_loss / max(n_seen, 1)
        epoch_val_acc = running_corrects / max(n_seen, 1)
        sw.add_scalar("val/loss", epoch_val_loss, epoch)
        sw.add_scalar("val/Acc", epoch_val_acc, epoch)

        if epoch_val_acc > best_acc:
            best_acc = epoch_val_acc
            torch.save(resnet18.state_dict(), "best_res18_1.pt")

        # NOTE(review): assumed to run every epoch (not only on improvement);
        # the original indentation was lost - confirm.
        torch.save({
            'epoch': epoch,
            'model_state_dict': resnet18.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'train_loss': epoch_loss,
            'val_loss': epoch_val_loss,
            'best_val_acc': best_acc,
        }, "checkpoint_res18.pt")
        # sw.add_scalar("learning_rate", lr_update.get_last_lr()[0], epoch)
        # lr_update.step()
        print(f'train loss: {epoch_loss} train Acc: {epoch_acc} val loss: {epoch_val_loss} val Acc: {epoch_val_acc}')
        print('-'*10)

    print(f'training time: {time.time() - since}')
    return best_acc
下面是我尝试用 Ray Tune 进行调参的代码:
def training_function(config):
    """Ray Tune trainable: build loaders, model and optimizer, then train.

    Args:
        config: Mapping with the keys ``'lr'``, ``'mom'`` and ``'nest'``,
            forwarded to ``optim.SGD`` as learning rate, momentum and the
            Nesterov flag.

    NOTE(review): this function reads module-level objects (``train_dataset``,
    ``val_dataset``, ``weightedSampler``, ``device``) and calls ``train``.
    Ray has to serialise whatever the trainable captures, so a non-picklable
    global is a plausible cause of the ``ray.cloudpickle.dumps`` failure -
    confirm by re-running with ``RAY_PICKLE_VERBOSE_DEBUG=1``.
    """
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=False, sampler=weightedSampler)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=64, shuffle=True)

    resnet18 = models.resnet18(pretrained=False).to(device)
    resnet18.fc = nn.Linear(512, 29).to(device)

    # The optimizer must be created after the model exists and after ``fc``
    # has been replaced; otherwise ``resnet18`` is an unbound local here and
    # the new head's parameters would not be registered with the optimizer.
    sgd = optim.SGD(resnet18.parameters(), lr=config['lr'], momentum=config['mom'], nesterov=config['nest'])

    for _ in range(10):
        best_acc = train(resnet18=resnet18, optimizer=sgd, train_dataloader=train_dataloader, val_dataloader=val_dataloader)
        # Report a plain float rather than a (possibly device-resident) tensor.
        tune.report(mean_accuracy=float(best_acc))
# Search space: grid over learning rate and momentum; the Nesterov flag is
# sampled for each grid point.
config = {
    'lr': tune.grid_search([0.01, 0.001, 0.0001, 0.00001]),
    'mom': tune.grid_search([0.98, 0.9, 0.8]),
    'nest': tune.choice([True, False]),
}
# Pass the search space by keyword: the second positional parameter of
# ``tune.run`` is ``name``, so a positional ``config`` would be taken as the
# experiment name.
analysis = tune.run(training_function, config=config)
以及我得到的错误:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-8-e74d4c4e9dfc> in <module>
13 'nest': tune.choice([True, False])
14
---> 15 analysis = tune.run(training_function, config)
~/anaconda3/envs/py37-start/lib/python3.7/site-packages/ray/tune/tune.py in run(run_or_experiment, name, metric, mode, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, search_alg, scheduler, keep_checkpoints_num, checkpoint_score_attr, checkpoint_freq, checkpoint_at_end, verbose, progress_reporter, log_to_file, trial_name_creator, trial_dirname_creator, sync_config, export_formats, max_failures, fail_fast, restore, server_port, resume, queue_trials, reuse_actors, trial_executor, raise_on_failed_trial, callbacks, loggers, ray_auto_init, run_errored_only, global_checkpoint_period, with_server, upload_dir, sync_to_cloud, sync_to_driver, sync_on_checkpoint, _remote)
415 export_formats=export_formats,
416 max_failures=max_failures,
--> 417 restore=restore)
418 else:
419 logger.debug("Ignoring some parameters passed into tune.run.")
~/anaconda3/envs/py37-start/lib/python3.7/site-packages/ray/tune/experiment.py in __init__(self, name, run, stop, time_budget_s, config, resources_per_trial, num_samples, local_dir, upload_dir, trial_name_creator, trial_dirname_creator, loggers, log_to_file, sync_to_driver, sync_to_cloud, checkpoint_freq, checkpoint_at_end, sync_on_checkpoint, keep_checkpoints_num, checkpoint_score_attr, export_formats, max_failures, restore)
150 "checkpointable function. You can specify checkpoints "
151 "within your trainable function.")
--> 152 self._run_identifier = Experiment.register_if_needed(run)
153 self.name = name or self._run_identifier
154
~/anaconda3/envs/py37-start/lib/python3.7/site-packages/ray/tune/experiment.py in register_if_needed(cls, run_object)
303 "\n-If the error is typing-related, try removing "
304 "the type annotations and try again.")
--> 305 raise type(e)(str(e) + " " + extra_msg) from None
306 return name
307 else:
TypeError: ray.cloudpickle.dumps(<class 'ray.tune.function_runner.wrap_function.<locals>.ImplicitFunc'>) failed.
To check which non-serializable variables are captured in scope, re-run the ray script with 'RAY_PICKLE_VERBOSE_DEBUG=1'. Other options:
-Try reproducing the issue by calling `pickle.dumps(trainable)`.
-If the error is typing-related, try removing the type annotations and try again.
我的猜测是我遗漏了一些明显的东西,我在 SO 或 github 上找不到这个错误。 谢谢。
【问题讨论】:
【参考方案1】:我尝试运行您的代码。没有遇到任何序列化问题。这是结果(错误只是因为我没有包含每个 pytorch 依赖项)
| Trial name | status | loc | lr | mom | nest |
|-------------------------------+----------+-------+--------+-------+--------|
| training_function_dc6c3_00000 | ERROR | | 0.01 | 0.98 | False |
| training_function_dc6c3_00001 | ERROR | | 0.001 | 0.98 | True |
| training_function_dc6c3_00002 | ERROR | | 0.0001 | 0.98 | False |
| training_function_dc6c3_00003 | ERROR | | 1e-05 | 0.98 | True |
| training_function_dc6c3_00004 | ERROR | | 0.01 | 0.9 | False |
| training_function_dc6c3_00005 | ERROR | | 0.001 | 0.9 | True |
| training_function_dc6c3_00006 | ERROR | | 0.0001 | 0.9 | True |
| training_function_dc6c3_00007 | ERROR | | 1e-05 | 0.9 | True |
| training_function_dc6c3_00008 | ERROR | | 0.01 | 0.8 | False |
| training_function_dc6c3_00009 | ERROR | | 0.001 | 0.8 | False |
| training_function_dc6c3_00010 | ERROR | | 0.0001 | 0.8 | True |
| training_function_dc6c3_00011 | ERROR | | 1e-05 | 0.8 | False |
+-------------------------------+----------+-------+--------+-------+--------+
Number of errored trials: 12
+-------------------------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name | # failures | error file |
|-------------------------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------|
| training_function_dc6c3_00000 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00000_0_lr=0.01,mom=0.98,nest=False_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00001 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00001_1_lr=0.001,mom=0.98,nest=True_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00002 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00002_2_lr=0.0001,mom=0.98,nest=False_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00003 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00003_3_lr=1e-05,mom=0.98,nest=True_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00004 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00004_4_lr=0.01,mom=0.9,nest=False_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00005 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00005_5_lr=0.001,mom=0.9,nest=True_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00006 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00006_6_lr=0.0001,mom=0.9,nest=True_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00007 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00007_7_lr=1e-05,mom=0.9,nest=True_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00008 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00008_8_lr=0.01,mom=0.8,nest=False_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00009 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00009_9_lr=0.001,mom=0.8,nest=False_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00010 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00010_10_lr=0.0001,mom=0.8,nest=True_2021-09-11_07-52-19/error.txt |
| training_function_dc6c3_00011 | 1 | /Users/xwjiang/ray_results/training_function_2021-09-11_07-52-18/training_function_dc6c3_00011_11_lr=1e-05,mom=0.8,nest=False_2021-09-11_07-52-19/error.txt |
顺便说一句,调用 tune.run 时应写成 tune.run(train_func, config=config)。否则你传入的配置会被误认为 name。
【讨论】:
以上是关于Ray[tune] for pytorch TypeError: ray.cloudpickle.dumps的主要内容,如果未能解决你的问题,请参考以下文章
[Ray.Tune] [已解决] TypeError: ray.cloudpickle.dumps
[Ray.Tune] [已解决] TypeError: ray.cloudpickle.dumps