pytorch神经网络对Excel数据集进行处理(读取,转为tensor格式,归一化),并且以鸢尾花(iris)数据集为例,实现BP神经网络
Posted <编程路上>
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了pytorch神经网络对Excel数据集进行处理(读取,转为tensor格式,归一化),并且以鸢尾花(iris)数据集为例,实现BP神经网络相关的知识,希望对你有一定的参考价值。
最近跟导师做的项目是关于BP、LSTM神经网络的,数据集是一些Excel表格。我使用pytorch进行训练,每次读取Excel表格数据时都要做同样的处理,所以我把这些处理封装成函数,以后处理其它数据集时直接调用函数即可,这样就方便多了。
我将以鸢尾花数据集作为例子进行展示:
我已经编写了2.0版本,方法更加集成化,建议使用2.0版本:2.0
可以看到鸢尾花数据集有四个特征,分别是0,1,2,3,label是鸢尾花种类,共三种,分别以0,1,2表示。
首先第一部分是读取Excel数据(需要注意的是,标签必须放在最后一列:函数默认最后一列为标签,前面各列为特征值):
def open_excel(filename):
    """
    Load a dataset from an Excel workbook and split it into features and labels.

    The last column of the sheet is taken as the label; every column before
    it is taken as a feature.

    :param filename: workbook path without the ``.xlsx`` extension, e.g. ``'iris'``
    :return: ``(data, target)``; ``data`` is a float64 array with one row per
        sheet row and one column per feature, ``target`` is the last column
    """
    # Interpolate the argument: the former f-string had no placeholder, so it
    # always opened the literal file "filename.xlsx" and ignored `filename`.
    readbook = pd.read_excel(f'{filename}.xlsx', engine='openpyxl')
    # Transpose so that each entry of nplist is one column of the sheet.
    nplist = readbook.T.to_numpy()
    data = np.float64(nplist[0:-1].T)
    target = nplist[-1]
    return data, target
def open_csv(filename):
    """
    Load a dataset from a CSV file and split it into features and labels.

    The last column is taken as the label; every column before it is taken
    as a feature.

    :param filename: file path without the ``.csv`` extension, e.g. ``'iris'``
    :return: ``(data, target)``; ``data`` is a float64 array with one row per
        record and one column per feature, ``target`` is the last column
    """
    # Interpolate the argument: the former f-string had no placeholder, so it
    # always opened the literal file "filename.csv" and ignored `filename`.
    readbook = pd.read_csv(f'{filename}.csv')
    # Transpose so that each entry of nplist is one column of the file.
    nplist = readbook.T.to_numpy()
    data = np.float64(nplist[0:-1].T)
    target = nplist[-1]
    return data, target
使用方法为 feature, label = open_excel('iris'),输入为Excel文件名(不含扩展名),返回值为numpy类型的特征值和标签。
第二个函数是将数据划分为训练集和测试集:
def random_number(data_size, key):
    """
    Build the index list ``0 .. data_size - 1``.

    :param data_size: number of indices to generate
    :param key: when equal to 1 the list is shuffled with ``random.shuffle``;
        any other value leaves it in ascending order
    :return: list of indices
    """
    indices = list(range(data_size))
    if key == 1:
        random.shuffle(indices)
    return indices
def split_data_set(data_set, target_set, rate, ifsuf):
    """
    Partition features and labels into a training part and a test part.

    :param data_set: feature array; it is indexed with a list of row indices
    :param target_set: label array, indexed with the same row indices
    :param rate: fraction of the samples assigned to the test set
    :param ifsuf: forwarded to ``random_number`` as its ``key`` (1 shuffles)
    :return: x_train, x_test, y_train, y_test
    """
    total = len(data_set)
    # Everything that is not test data goes to the training set.
    train_size = int((1 - rate) * total)
    order = random_number(total, ifsuf)
    train_idx, test_idx = order[:train_size], order[train_size:]

    x_train, x_test = data_set[train_idx], data_set[test_idx]
    y_train, y_test = target_set[train_idx], target_set[test_idx]
    return x_train, x_test, y_train, y_test
使用方法很简单,输入为特征值,标签,划分比例,是否打乱,返回值为训练集,测试集的特征值和标签。
# Split the data into a training set and a test set, optionally shuffling first
split = 0.3 # fraction of the whole dataset that becomes the test set
ifshuffle = 1 # 1 shuffles the dataset, 0 keeps the original order
x_train, x_test, y_train, y_test = split_data_set(feature, label, split, ifshuffle)
第三个函数为numpy转为tensor:
def inputtotensor(inputtensor, labeltensor):
    """
    Convert dataset inputs and labels to tensors.

    :param inputtensor: dataset inputs (anything ``np.array`` accepts)
    :param labeltensor: dataset labels (anything ``np.array`` accepts)
    :return: inputs as a ``torch.FloatTensor``, labels as a ``torch.LongTensor``
    """
    features = torch.FloatTensor(np.array(inputtensor))
    # Labels pass through float first, then are stored as integers.
    label_values = np.array(labeltensor).astype(float)
    labels = torch.LongTensor(label_values)
    return features, labels
输入为numpy的特征值和标签,返回值为tensor格式的特征值和标签。
# Convert the training and test splits to tensors
traininput, trainlabel = inputtotensor(x_train, y_train)
testinput, testlabel = inputtotensor(x_test, y_test)
第四部分是归一化处理,使用的是torch中的nn
# Normalization. nn.functional.normalize is called with its defaults, which
# per the PyTorch docs (p=2, dim=1) rescale each row (sample) to unit L2 norm.
# NOTE(review): this is per-sample scaling, not per-feature scaling; confirm
# that is what is wanted.
traininput = nn.functional.normalize(traininput)
testinput = nn.functional.normalize(testinput)
你只需要调用函数就可以实现,可以说非常方便。
下面我用以上函数实现后实现一下BP神经网络:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
def open_excel(filename):
    """
    Load a dataset from an Excel workbook and split it into features and labels.

    The last column of the sheet is taken as the label; every column before
    it is taken as a feature.

    :param filename: workbook path without the ``.xlsx`` extension, e.g. ``'iris'``
    :return: ``(data, target)``; ``data`` is a float64 array with one row per
        sheet row and one column per feature, ``target`` is the last column
    """
    # Interpolate the argument: the former f-string had no placeholder, so it
    # always opened the literal file "filename.xlsx" and ignored `filename`.
    readbook = pd.read_excel(f'{filename}.xlsx', engine='openpyxl')
    # Transpose so that each entry of nplist is one column of the sheet.
    nplist = readbook.T.to_numpy()
    data = np.float64(nplist[0:-1].T)
    target = nplist[-1]
    return data, target
def open_csv(filename):
    """
    Load a dataset from a CSV file and split it into features and labels.

    The last column is taken as the label; every column before it is taken
    as a feature.

    :param filename: file path without the ``.csv`` extension, e.g. ``'iris'``
    :return: ``(data, target)``; ``data`` is a float64 array with one row per
        record and one column per feature, ``target`` is the last column
    """
    # Interpolate the argument: the former f-string had no placeholder, so it
    # always opened the literal file "filename.csv" and ignored `filename`.
    readbook = pd.read_csv(f'{filename}.csv')
    # Transpose so that each entry of nplist is one column of the file.
    nplist = readbook.T.to_numpy()
    data = np.float64(nplist[0:-1].T)
    target = nplist[-1]
    return data, target
def random_number(data_size, key):
    """
    Return the indices ``0 .. data_size - 1`` as a list.

    The list is shuffled with ``random.shuffle`` only when ``key`` equals 1;
    otherwise it is returned in ascending order.
    """
    number_set = [*range(data_size)]
    if key != 1:
        return number_set
    random.shuffle(number_set)
    return number_set
def split_data_set(data_set, target_set, rate, ifsuf):
    """
    Split a dataset into training and test parts; ``rate`` is the test share.

    :param data_set: feature array; it is indexed with a list of row indices
    :param target_set: label array, indexed with the same row indices
    :param rate: fraction of the samples that goes to the test set
    :param ifsuf: forwarded to ``random_number`` as its ``key`` (1 shuffles)
    :return: training data, test data, training labels, test labels
    """
    sample_count = len(data_set)
    # Number of samples kept for training
    train_size = int((1 - rate) * sample_count)
    # Row order, shuffled or not depending on ifsuf
    data_index = random_number(sample_count, ifsuf)
    train_rows = data_index[:train_size]
    test_rows = data_index[train_size:]
    return (
        data_set[train_rows],
        data_set[test_rows],
        target_set[train_rows],
        target_set[test_rows],
    )
def inputtotensor(inputtensor, labeltensor):
    """
    Turn dataset inputs and labels into tensors.

    :param inputtensor: dataset inputs (anything ``np.array`` accepts)
    :param labeltensor: dataset labels (anything ``np.array`` accepts)
    :return: ``(inputs, labels)`` as ``torch.FloatTensor`` and ``torch.LongTensor``
    """
    input_array = np.array(inputtensor)
    # Labels are cast to float before being stored as integers.
    label_array = np.array(labeltensor).astype(float)
    return torch.FloatTensor(input_array), torch.LongTensor(label_array)
# BP (back-propagation) neural network definition
class BPNerualNetwork(torch.nn.Module):
    """
    Fully connected classifier: Linear -> ReLU -> Linear -> LogSoftmax(dim=1).

    The layer widths are read from the module-level names ``input_size``,
    ``hidden_size`` and ``output_size`` at construction time.
    """

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, output_size),
            nn.LogSoftmax(dim=1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Pass ``x`` through the stacked layers and return the result."""
        return self.model(x)
def addbatch(data_train, data_test, batchsize):
    """
    Pair two tensors and serve them in batches.

    :param data_train: input tensor
    :param data_test: label tensor paired with the inputs
    :param batchsize: number of samples per batch
    :return: DataLoader over the paired tensors, built with ``shuffle=False``
    """
    paired = TensorDataset(data_train, data_test)
    return DataLoader(paired, batch_size=batchsize, shuffle=False)
def train_test(traininput, trainlabel, testinput, testlabel, batchsize, epochs=1001):
    """
    Train the network and report its test accuracy.

    Works on the module-level ``net``, ``loss_func`` and ``optimizer``. After
    every epoch the whole test set is classified; the accuracy is printed on
    every 10th epoch. Nothing is returned.

    :param traininput: training inputs
    :param trainlabel: training labels
    :param testinput: test inputs
    :param testlabel: test labels
    :param batchsize: number of samples per training batch
    :param epochs: number of passes over the training data; the default keeps
        the previously hard-coded 1001
    """
    # Batches come in a fixed order: addbatch builds its loader with shuffle=False.
    traindata = addbatch(traininput, trainlabel, batchsize)

    for epoch in range(epochs):
        net.train()
        for inputs, labels in traindata:
            # Forward pass and loss
            out = net(inputs)
            loss = loss_func(out, labels)
            # Clear old gradients, back-propagate, update the parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Accuracy on the test set. no_grad() avoids building an autograd
        # graph, and comparing tensors directly (instead of converting to
        # NumPy) also works when the tensors live on a GPU.
        net.eval()
        with torch.no_grad():
            prediction = torch.max(net(testinput), 1)[1]
            correct = int((prediction == testlabel).sum())
        acc = correct / prediction.numel()

        if epoch % 10 == 0:
            print("训练次数为", epoch, "的准确率为:", acc)
if __name__ == "__main__":
    # Settings. BPNerualNetwork reads input_size / hidden_size / output_size
    # as globals, so they must exist before the network is built.
    split = 0.3  # share of the dataset held out as the test set
    ifshuffle = 1  # 1 shuffles before splitting, 0 keeps the file order
    Epoch = 1000  # not passed to train_test
    input_size, hidden_size, output_size = 4, 5, 3
    LR = 0.005
    batchsize = 30

    # Load the data, split it, and convert both splits to tensors
    feature, label = open_excel('iris')
    x_train, x_test, y_train, y_test = split_data_set(feature, label, split, ifshuffle)
    traininput, trainlabel = inputtotensor(x_train, y_train)
    testinput, testlabel = inputtotensor(x_test, y_test)

    # Normalize both feature tensors
    traininput = nn.functional.normalize(traininput)
    testinput = nn.functional.normalize(testinput)

    # Network, optimizer and loss; train_test reads these three as globals
    net = BPNerualNetwork()
    optimizer = torch.optim.Adam(net.parameters(), lr=LR)
    loss_func = torch.nn.CrossEntropyLoss()

    # Train and print the accuracy along the way
    train_test(traininput, trainlabel, testinput, testlabel, batchsize)
轻轻松松到达0.9777,这不是主要的,本次主要是进行简化一下Excel数据集操作。
小白学习PyTorch教程十基于大型电影评论数据集训练第一个LSTM模型
@Author:Runsen
本博客对原始IMDB数据集进行预处理,建立一个简单的深层神经网络模型,对给定数据进行情感分析。
import numpy as np
# Read the raw reviews and labels from text files.
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, so the same files could decode differently
# (or fail to decode) on another machine.
with open('reviews.txt', 'r', encoding='utf-8') as f:
    reviews = f.read()
with open('labels.txt', 'r', encoding='utf-8') as f:
    labels = f.read()
编码
在将数据输入深度学习模型之前,应该将其转换为数值。这种文本转换被称为编码,即把每个单词转换成一个整数。在进行编码之前,需要先清理数据。
有以下几个预处理步骤:
- 删除标点符号。
- 使用\n(换行符)作为分隔符拆分文本。
- 把所有的评论重新组合成一个大串。
from string import punctuation

# Lower-case the text, then drop every punctuation character
reviews = reviews.lower()
text = ''.join([c for c in reviews if c not in punctuation])
print(punctuation) # !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~

# Split into reviews on real newline characters. The escaped form '\\n' is a
# backslash followed by the letter n, which never occurs once punctuation has
# been removed, so it would leave the whole corpus as a single "review".
reviews_split = text.split('\n')
text = ' '.join(reviews_split)

# Flat list of every word in the corpus
words = text.split()
建立字典并对评论进行编码
创建一个字典,将词汇表中的单词映射为整数。然后通过这个字典,评论可以转换成整数,然后再传送到模型网络。
from collections import Counter

# Vocabulary ordered from the most frequent word to the least frequent one
word_counts = Counter(words)
vocab = [word for word, _ in word_counts.most_common()]
# Word -> integer index, starting at 1 (0 is used as the padding value later)
vocab2idx = dict(zip(vocab, range(1, len(vocab) + 1)))
print("Size of Vocabulary: ", len(vocab))
Size of Vocabulary: 74072
# Replace every word of every review by its vocabulary index
encoded_reviews = [
    [vocab2idx[word] for word in review.split()]
    for review in reviews_split
]
print("The number of reviews: ", len(encoded_reviews))
The number of reviews: 25001
对标签进行编码
Negative 和Positive应分别标记为0和1(整数)
# One label per line: split on real newline characters. The escaped form
# "\\n" is a backslash followed by the letter n and does not match line breaks.
splitted_labels = labels.split("\n")
# "positive" becomes 1; anything else (including an empty trailing entry) becomes 0
encoded_labels = np.array([
    1 if label == "positive" else 0 for label in splitted_labels
])
删除异常值
应删除长度为0评论,然后,将对剩余的数据进行填充,保证所有数据具有相同的长度。
# Histogram of review lengths: length -> number of reviews with that length
length_reviews = Counter(map(len, encoded_reviews))
print("Zero-length reviews: ", length_reviews[0])
# The keys of the histogram are the lengths, so the largest key is the longest review
print("Maximum review length: ", max(length_reviews.keys()))
Zero-length reviews: 1
Maximum review length: 2514
# Positions of the reviews that contain at least one token
non_zero_idx = [pos for pos, tokens in enumerate(encoded_reviews) if len(tokens) > 0]
# Keep only those reviews and the labels at the same positions
encoded_reviews = [encoded_reviews[pos] for pos in non_zero_idx]
encoded_labels = np.array([encoded_labels[pos] for pos in non_zero_idx])
填充序列
下面要处理很长和很短的评论:用0填充短评论,使其达到指定长度,并将长评论截断为seq_length个单词。这里设置seq_length=200。
def text_padding(encoded_reviews, seq_length):
    """
    Force every encoded review to exactly ``seq_length`` tokens.

    Reviews with ``seq_length`` tokens or more keep their first ``seq_length``
    tokens; shorter ones are padded with zeros on the left.

    :param encoded_reviews: iterable of token-index lists
    :param seq_length: target length of every review
    :return: the resized reviews as a NumPy array
    """
    def _fit(tokens):
        missing = seq_length - len(tokens)
        if missing <= 0:
            return tokens[:seq_length]
        return [0] * missing + tokens

    return np.array([_fit(tokens) for tokens in encoded_reviews])
# Resize every review to 200 tokens
seq_length = 200
padded_reviews = text_padding(encoded_reviews, seq_length)
# Show the first 12 columns of the first 12 rows
print(padded_reviews[:12, :12])
数据加载器
将数据按8:1:1的比例拆分为训练集、验证集和测试集,然后使用“TensorDataset”和“DataLoader”函数来处理评论和标签数据。
# The first 80% of the samples (in file order, no shuffling) form the training set
ratio = 0.8
train_length = int(len(padded_reviews) * ratio)
X_train = padded_reviews[:train_length]
y_train = encoded_labels[:train_length]
# The remaining samples are halved into validation and test sets
remaining_x = padded_reviews[train_length:]
remaining_y = encoded_labels[train_length:]
# Despite its name, test_length is the index where the validation part ends
test_length = int(len(remaining_x)*0.5)
X_val = remaining_x[: test_length]
y_val = remaining_y[: test_length]
X_test = remaining_x[test_length :]
y_test = remaining_y[test_length :]
print("Feature shape of train review set: ", X_train.shape)
print("Feature shape of val review set: ", X_val.shape)
print("Feature shape of test review set: ", X_test.shape)
import torch
from torch.utils.data import TensorDataset, DataLoader

batch_size = 50
device = "cuda" if torch.cuda.is_available() else "cpu"

# Each dataset pairs the padded reviews with their labels, both moved to `device`
train_dataset = TensorDataset(torch.from_numpy(X_train).to(device), torch.from_numpy(y_train).to(device))
valid_dataset = TensorDataset(torch.from_numpy(X_val).to(device), torch.from_numpy(y_val).to(device))
test_dataset = TensorDataset(torch.from_numpy(X_test).to(device), torch.from_numpy(y_test).to(device))

# All three loaders reshuffle their samples on every pass
train_loader = DataLoader(train_dataset, batch_size = batch_size, shuffle = True)
valid_loader = DataLoader(valid_dataset, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = batch_size, shuffle = True)

# Fetch one batch as a sample. Use the built-in next(): Python 3 iterators
# implement __next__, and a `.next()` method is not part of that protocol.
data_iter = iter(train_loader)
X_sample, y_sample = next(data_iter)
RNN模型的实现
到目前为止,包括标记化在内的预处理已经完成。现在建立一个神经网络模型来预测评论的情绪。
- 首先,嵌入层将单词标记转换为特定大小的向量。
- 第二,一个LSTM层,由hidden_size和num_layers定义。
- 第三,通过全连接层将LSTM层的输出映射为期望的输出大小。
- 最后,sigmoid激活层以0到1之间的概率形式返回输出。
import torch.nn as nn
from torch.autograd import Variable
class Model(nn.Module):
    """
    Sentiment classifier: embedding -> stacked LSTM -> dropout / linear / sigmoid.

    :param vocab_size: number of rows of the embedding table
    :param embedding_dim: size of each embedding vector
    :param hidden_dim: number of features in the LSTM hidden state
    :param output_dim: number of outputs of the final linear layer
    :param num_layers: number of stacked LSTM layers
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, num_layers):
        super(Model, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # embedding and LSTM
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size = embedding_dim,
                            hidden_size = hidden_dim,
                            num_layers = num_layers,
                            batch_first = True,
                            dropout = 0.5,
                            bidirectional = False)

        # fully connected head
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, output_dim),
            nn.Sigmoid()
        )

    def forward(self, token, hidden):
        """
        Score a batch of token sequences.

        :param token: batch of token indices, batch dimension first
        :param hidden: initial ``(h_0, c_0)`` pair for the LSTM
        :return: one value per sample, taken from the end of the sequence
        """
        batch_size = token.size(0)

        # embedding and lstm output
        out = self.embedding(token.long())
        out, hidden = self.lstm(out, hidden)

        # flatten all time steps so the head is applied to each of them
        out = out.contiguous().view(-1, self.hidden_dim)

        # fully connected layer
        out = self.fc(out)

        # reshape to be batch_size first
        out = out.view(batch_size, -1)

        # keep only the last column of each row, i.e. the final output of the
        # final time step (not "the last batch")
        out = out[:, -1]
        return out

    def init_hidden(self, batch_size):
        """
        Create an all-zero ``(h_0, c_0)`` pair for ``batch_size`` sequences.

        The tensors are created on the device that holds the model parameters,
        so the method does not depend on a module-level ``device`` variable.
        Plain tensors are returned; wrapping them in the deprecated
        ``Variable`` is unnecessary.
        """
        target_device = next(self.parameters()).device
        shape = (self.num_layers, batch_size, self.hidden_dim)
        return (torch.zeros(shape, device=target_device),
                torch.zeros(shape, device=target_device))
- vocab_size : 词汇量
- embedding_dim : 嵌入查找表中的列数
- hidden_dim : LSTM单元隐藏层中的单元数
- output_dim : 期望输出的大小
# Hyperparameters of the sentiment model
vocab_size = len(vocab)+1 # +1 for the 0 padding + our word tokens
embedding_dim = 400
hidden_dim = 256
output_dim = 1
num_layers = 2
# Build the model and move it to the selected device
model = Model(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers).to(device)
# Bare expression: in a notebook this displays the model structure
model
训练
对于损失函数,这里使用BCELoss(二分类交叉熵损失),它要求模型输出是介于0和1之间的概率。优化器使用Adam,学习率为0.001。
另外,torch.nn.utils.clip_grad_norm_(model.parameters(), 5)用于防止RNN中的梯度爆炸问题,其中5是梯度范数的裁剪上限。
# Loss function and Optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001)

# The loop below reads both names, so they must exist before it starts.
# NOTE(review): the epoch count is a placeholder; choose the value you need.
num_epochs = 4
# Loss of every training batch so far; its mean is what gets printed
train_losses = []

for epoch in range(num_epochs):
    model.train()
    # NOTE(review): the hidden state is sized for full batches; this assumes
    # every batch holds exactly batch_size samples. Confirm for your data.
    hidden = model.init_hidden(batch_size)

    for i, (review, label) in enumerate(train_loader):
        review, label = review.to(device), label.to(device)

        # Initialize Optimizer
        optimizer.zero_grad()

        # Detach the hidden state from the previous batch's graph
        hidden = tuple([h.data for h in hidden])

        # Feed Forward
        output = model(review, hidden)

        # Calculate the Loss
        loss = criterion(output.squeeze(), label.float())

        # Back Propagation
        loss.backward()

        # Prevent Exploding Gradient Problem
        nn.utils.clip_grad_norm_(model.parameters(), 5)

        # Update
        optimizer.step()
        train_losses.append(loss.item())

        # Print Statistics
        if (i+1) % 100 == 0:
            ### Evaluation ###
            # initialize hidden state
            val_h = model.init_hidden(batch_size)
            val_losses = []

            model.eval()
            # Validation needs no gradients
            with torch.no_grad():
                for review, label in valid_loader:
                    review, label = review.to(device), label.to(device)
                    val_h = tuple([h.data for h in val_h])
                    output = model(review, val_h)
                    val_loss = criterion(output.squeeze(), label.float())
                    val_losses.append(val_loss.item())

            print("Epoch: {}/{} | Step {}, Train Loss {:.4f}, Val Loss {:.4f}".
                  format(epoch+1, num_epochs, i+1, np.mean(train_losses), np.mean(val_losses)))

            # Switch back to training mode; otherwise dropout stays disabled
            # for the remaining batches of this epoch.
            model.train()
以上是关于pytorch神经网络对Excel数据集进行处理(读取,转为tensor格式,归一化),并且以鸢尾花(iris)数据集为例,实现BP神经网络的主要内容,如果未能解决你的问题,请参考以下文章
使用 pytorch 和 sklearn 对 MNIST 数据集进行交叉验证