TensorBoard 中的 hp_metric 是啥以及如何摆脱它?
Posted
技术标签:
【中文标题】TensorBoard 中的 hp_metric 是啥以及如何摆脱它?【英文标题】:What is hp_metric in TensorBoard and how to get rid of it?TensorBoard 中的 hp_metric 是什么以及如何摆脱它? 【发布时间】:2021-04-03 15:16:54 【问题描述】:我是 Tensorboard 的新手。
我正在使用相当简单的代码运行一个实验,这是输出:
我不记得要求hp_metric
图表,但它在这里。
它是什么以及如何摆脱它?
使用 Pytorch Lightning 重现完整代码(不是我认为任何人都应该重现此问题才能回答):
请注意唯一取消引用 TensorBoard 的行是
self.logger.experiment.add_scalars("losses", "train_loss": loss, global_step=self.current_epoch)
import torch
from torch import nn
import torch.nn.functional as F
from typing import List, Optional
from pytorch_lightning.core.lightning import LightningModule
from Testing.Research.toy_datasets.ClustersDataset import ClustersDataset
from torch.utils.data import DataLoader
from Testing.Research.config.ConfigProvider import ConfigProvider
from pytorch_lightning import Trainer, seed_everything
from torch import optim
import os
from pytorch_lightning.loggers import TensorBoardLogger
class VAEFC(LightningModule):
# see https://towardsdatascience.com/understanding-variational-autoencoders-vaes-f70510919f73
# for possible upgrades, see https://arxiv.org/pdf/1602.02282.pdf
# https://stats.stackexchange.com/questions/332179/how-to-weight-kld-loss-vs-reconstruction-loss-in-variational-auto-encoder
def __init__(self, encoder_layer_sizes: List, decoder_layer_sizes: List, config):
super(VAEFC, self).__init__()
self._config = config
self.logger: Optional[TensorBoardLogger] = None
assert len(encoder_layer_sizes) >= 3, "must have at least 3 layers (2 hidden)"
# encoder layers
self._encoder_layers = nn.ModuleList()
for i in range(1, len(encoder_layer_sizes) - 1):
enc_layer = nn.Linear(encoder_layer_sizes[i - 1], encoder_layer_sizes[i])
self._encoder_layers.append(enc_layer)
# predict mean and covariance vectors
self._mean_layer = nn.Linear(encoder_layer_sizes[
len(encoder_layer_sizes) - 2],
encoder_layer_sizes[len(encoder_layer_sizes) - 1])
self._logvar_layer = nn.Linear(encoder_layer_sizes[
len(encoder_layer_sizes) - 2],
encoder_layer_sizes[len(encoder_layer_sizes) - 1])
# decoder layers
self._decoder_layers = nn.ModuleList()
for i in range(1, len(decoder_layer_sizes)):
dec_layer = nn.Linear(decoder_layer_sizes[i - 1], decoder_layer_sizes[i])
self._decoder_layers.append(dec_layer)
self._recon_function = nn.MSELoss(reduction='mean')
def _encode(self, x):
for i in range(len(self._encoder_layers)):
layer = self._encoder_layers[i]
x = F.relu(layer(x))
mean_output = self._mean_layer(x)
logvar_output = self._logvar_layer(x)
return mean_output, logvar_output
def _reparametrize(self, mu, logvar):
if not self.training:
return mu
std = logvar.mul(0.5).exp_()
if std.is_cuda:
eps = torch.cuda.FloatTensor(std.size()).normal_()
else:
eps = torch.FloatTensor(std.size()).normal_()
reparameterized = eps.mul(std).add_(mu)
return reparameterized
def _decode(self, z):
for i in range(len(self._decoder_layers) - 1):
layer = self._decoder_layers[i]
z = F.relu((layer(z)))
decoded = self._decoder_layers[len(self._decoder_layers) - 1](z)
# decoded = F.sigmoid(self._decoder_layers[len(self._decoder_layers)-1](z))
return decoded
def _loss_function(self, recon_x, x, mu, logvar, reconstruction_function):
"""
recon_x: generating images
x: origin images
mu: latent mean
logvar: latent log variance
"""
binary_cross_entropy = reconstruction_function(recon_x, x) # mse loss TODO see if mse or cross entropy
# loss = 0.5 * sum(1 + log(sigma^2) - mu^2 - sigma^2)
kld_element = mu.pow(2).add_(logvar.exp()).mul_(-1).add_(1).add_(logvar)
kld = torch.sum(kld_element).mul_(-0.5)
# KL divergence Kullback–Leibler divergence, regularization term for VAE
# It is a measure of how different two probability distributions are different from each other.
# We are trying to force the distributions closer while keeping the reconstruction loss low.
# see https://towardsdatascience.com/understanding-variational-autoencoders-vaes-f70510919f73
# read on weighting the regularization term here:
# https://stats.stackexchange.com/questions/332179/how-to-weight-kld-loss-vs-reconstruction-loss-in-variational
# -auto-encoder
return binary_cross_entropy + kld * self._config.regularization_factor
def training_step(self, batch, batch_index):
orig_batch, noisy_batch, _ = batch
noisy_batch = noisy_batch.view(noisy_batch.size(0), -1)
recon_batch, mu, logvar = self.forward(noisy_batch)
loss = self._loss_function(
recon_batch,
orig_batch, mu, logvar,
reconstruction_function=self._recon_function
)
# self.logger.experiment.add_scalars("losses", "train_loss": loss)
self.logger.experiment.add_scalars("losses", "train_loss": loss, global_step=self.current_epoch)
# self.logger.experiment.add_scalar("train_loss", loss, self.current_epoch)
self.logger.experiment.flush()
return loss
def train_dataloader(self):
default_dataset, train_dataset, test_dataset = ClustersDataset.clusters_dataset_by_config()
train_dataloader = DataLoader(train_dataset, batch_size=self._config.batch_size, shuffle=True)
return train_dataloader
def test_dataloader(self):
default_dataset, train_dataset, test_dataset = ClustersDataset.clusters_dataset_by_config()
test_dataloader = DataLoader(test_dataset, batch_size=self._config.batch_size, shuffle=True)
return test_dataloader
def configure_optimizers(self):
optimizer = optim.Adam(model.parameters(), lr=self._config.learning_rate)
return optimizer
def forward(self, x):
mu, logvar = self._encode(x)
z = self._reparametrize(mu, logvar)
decoded = self._decode(z)
return decoded, mu, logvar
if __name__ == "__main__":
config = ConfigProvider.get_config()
seed_everything(config.random_seed)
latent_dim = config.latent_dim
enc_layer_sizes = config.enc_layer_sizes + [latent_dim]
dec_layer_sizes = [latent_dim] + config.dec_layer_sizes
model = VAEFC(config=config, encoder_layer_sizes=enc_layer_sizes, decoder_layer_sizes=dec_layer_sizes)
logger = TensorBoardLogger(save_dir='tb_logs', name='VAEFC')
logger.hparams = config # TODO only put here relevant stuff
# trainer = Trainer(gpus=1)
trainer = Trainer(deterministic=config.is_deterministic,
#auto_lr_find=config.auto_lr_find,
#log_gpu_memory='all',
# min_epochs=99999,
max_epochs=config.num_epochs,
default_root_dir=os.getcwd(),
logger=logger
)
# trainer.tune(model)
trainer.fit(model)
print("done training vae with lightning")
ClustersDataset.py
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import torch
import numpy as np
from Testing.Research.config.ConfigProvider import ConfigProvider
class ClustersDataset(Dataset):
__default_dataset = None
__default_dataset_train = None
__default_dataset_test = None
def __init__(self, cluster_size: int, noise_factor: float = 0, transform=None, n_clusters=2, centers_radius=4.0):
super(ClustersDataset, self).__init__()
self._cluster_size = cluster_size
self._noise_factor = noise_factor
self._n_clusters = n_clusters
self._centers_radius = centers_radius
# self._transform = transform
self._size = self._cluster_size * self._n_clusters
self._create_data_clusters()
self._combine_clusters_to_array()
self._normalize_data()
self._add_noise()
# self._plot()
pass
@staticmethod
def clusters_dataset_by_config():
if ClustersDataset.__default_dataset is not None:
return \
ClustersDataset.__default_dataset, \
ClustersDataset.__default_dataset_train, \
ClustersDataset.__default_dataset_test
config = ConfigProvider.get_config()
default_dataset = ClustersDataset(
cluster_size=config.cluster_size,
noise_factor=config.noise_factor,
transform=None,
n_clusters=config.n_clusters,
centers_radius=config.centers_radius
)
train_size = int(config.train_size * len(default_dataset))
test_size = len(default_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(default_dataset, [train_size, test_size])
ClustersDataset.__default_dataset = default_dataset
ClustersDataset.__default_dataset_train = train_dataset
ClustersDataset.__default_dataset_test = test_dataset
return default_dataset, train_dataset, test_dataset
def _create_data_clusters(self):
self._clusters = [torch.zeros((self._cluster_size, 2)) for _ in range(self._n_clusters)]
centers_radius = self._centers_radius
for i, c in enumerate(self._clusters):
r, x, y = 3.0, centers_radius * np.cos(i * np.pi * 2 / self._n_clusters), centers_radius * np.sin(
i * np.pi * 2 / self._n_clusters)
cluster_length = 1.1
cluster_start = i * 2 * np.pi / self._n_clusters
cluster_end = cluster_length * (i + 1) * 2 * np.pi / self._n_clusters
cluster_inds = torch.linspace(start=cluster_start, end=cluster_end, steps=self._cluster_size,
dtype=torch.float)
c[:, 0] = r * torch.sin(cluster_inds) + y
c[:, 1] = r * torch.cos(cluster_inds) + x
def _plot(self):
plt.figure()
plt.scatter(self._noisy_values[:, 0], self._noisy_values[:, 1], s=1, color='b', label="noisy_values")
plt.scatter(self._values[:, 0], self._values[:, 1], s=1, color='r', label="values")
plt.legend(loc="upper left")
plt.show()
def _combine_clusters_to_array(self):
size = self._size
self._values = torch.zeros(size, 2)
self._labels = torch.zeros(size, dtype=torch.long)
for i, c in enumerate(self._clusters):
self._values[i * self._cluster_size: (i + 1) * self._cluster_size, :] = self._clusters[i]
self._labels[i * self._cluster_size: (i + 1) * self._cluster_size] = i
def _add_noise(self):
size = self._size
mean = torch.zeros(size, 2)
std = torch.ones(size, 2)
noise = torch.normal(mean, std)
self._noisy_values = torch.zeros(size, 2)
self._noisy_values[:] = self._values
self._noisy_values = self._noisy_values + noise * self._noise_factor
def _normalize_data(self):
values_min, values_max = torch.min(self._values), torch.max(self._values)
self._values = (self._values - values_min) / (values_max - values_min)
self._values = self._values * 2 - 1
def __len__(self):
return self._size # number of samples in the dataset
def __getitem__(self, index):
item = self._values[index, :]
noisy_item = self._noisy_values[index, :]
# if self._transform is not None:
# noisy_item = self._transform(item)
return item, noisy_item, self._labels[index]
@property
def values(self):
return self._values
@property
def noisy_values(self):
return self._noisy_values
配置值(ConfigProvider 只是将它们作为对象返回)
num_epochs: 15
batch_size: 128
learning_rate: 0.0001
auto_lr_find: False
noise_factor: 0.1
regularization_factor: 0.0
cluster_size: 5000
n_clusters: 5
centers_radius: 4.0
train_size: 0.8
latent_dim: 8
enc_layer_sizes: [2, 200, 200, 200]
dec_layer_sizes: [200, 200, 200, 2]
retrain_vae: False
random_seed: 11
is_deterministic: True
【问题讨论】:
【参考方案1】:这是pytorch Lightning中tensorboard的默认设置。您可以将 default_hp_metric
设置为 false 以摆脱此指标。
TensorBoardLogger(save_dir='tb_logs', name='VAEFC', default_hp_metric=False)
hp_metric
可帮助您跟踪不同超参数的模型性能。您可以在tensorboard 中的hparams
处查看。
【讨论】:
“帮助你”是什么意思?我能看到的只是一个单点图。 如果我们不显式创建记录器,而是使用来自子类化pl.LightningModule
的默认self.logger
怎么办?另外,这应该做什么?我所有的跑步都是-1。
>“帮助你”是什么意思?您可以将指标设置为记录在 pytorch-lightning.readthedocs.io/en/latest/extensions/… 中,然后,您可以查看您的超参数,并根据您选择的指标查看哪个结果最好。
@Pete 如果您制定一个独立的答案,我可能会接受它【参考方案2】:
hp_metric
(超参数指标)帮助您调整超参数。
您可以将此指标设置为您喜欢的任何值,如 pytorch official docs 中所述。
然后,您可以查看您的超参数,并根据您选择的指标查看哪个结果最好。
或者,如果您不想要它,您可以按照@joe32140 的回答中的建议禁用它:
您可以将 default_hp_metric 设置为 false 以摆脱该指标。
TensorBoardLogger(save_dir='tb_logs', name='VAEFC', default_hp_metric=False)
【讨论】:
我不明白这种情况是否或如何在多次运行中持续存在 @Gulzar 因为您使用的是 pytorch-lightning,您应该会发现每次进行训练运行时,都会在 Lightning_logs 目录中创建一个新目录。通常这是 version_0、version_1 等。当您运行 tensorboard 并将 --log_dir 设置为 Lightning_logs 的路径时,您应该会在 tensorboard 中看到所有运行。如果您转到 HParams 页面,每次运行都会列出您使用的 HParams(假设它们作为 hparams dict/Namespace 传递给模型)以及记录的 hp_metric 结果。见docs 我知道,但这并不能解释图表是什么。如果每次运行都有一个标量,那么图形是什么? 您将在每次进行验证时记录 hp_metric,因此该图表将显示您的指标如何随步数变化。在您的问题图表中,它没有被记录,因此您只有一个点,但是如果您按照描述进行设置,您将在该运行的整个训练过程中看到指标的时间序列。以上是关于TensorBoard 中的 hp_metric 是啥以及如何摆脱它?的主要内容,如果未能解决你的问题,请参考以下文章
如何从 virtualenv 中的 python 脚本运行 Tensorboard?
TensorBoard 中的 Tensorflow 混淆矩阵