Hand-Writing a Three-Layer Backpropagation Neural Network (Cross-Entropy Loss + Regularization + Backward Derivatives)

Posted by 囚生CY

This was a course assignment. It is not of much practical use on its own, but hand-writing a dense (fully-connected) network once really does deepen your understanding of neural networks, especially the backward-propagation derivatives.

The resources have been uploaded, but CSDN has been acting up lately and won't let me set the resource points, so I am leaving a Baidu Yun (BDY) link instead.

#-*- coding:UTF-8 -*-
import numpy as np
import pandas as pd
from scipy.io import loadmat
from scipy.linalg import norm
import matplotlib.pyplot as plt

""" 百度云链接:https://pan.baidu.com/s/1kdSoUcK9PFjUEfRiUI9pdw """
""" 密码:iycw """

"""
	Nesterov's方法是先根据历史信息走到一个点,
	再根据那个点的gradient来走一段更新;
	这恰好与动量方法的次序是相反的,
	动量方法是先根据当前点的gradient来走一段更新,
	然后再根据历史信息往前面搞一段距离;
"""
def train(wd,n_hidden,n_iters,learning_rate,momentum_mul,do_early_stopping=False,minibatch_size=10,isNestrov=False):
	"""
		· wd: 权重衰减
		· n_hidden: 隐层结点数量
		· n_iters: 随机梯度下降迭代次数
		· learning_rate: 学习速率
		· momentum_mul: 速率衰减系数(这个系数将附加在前一次的动量上,然后瞎搞)
		· do_early_stopping: 是否提早结束(如果是则简单的输出过去epoch中最优的那个)
		· minibatch_size: 随机梯度下降的小批尺寸
		· inNestrov: 是否使用Nestrov方法
		· return: 数据集上的分类损失
	"""
	data_file = loadmat("data.mat",squeeze_me=True,struct_as_record=False)
	data = data_file["data"]											 # load the dataset
	"""
		· data.training.inputs —— 256×1000
		· data.training.targets —— 10×1000
		· data.validation.inputs —— 256×1000
		· data.validation.targets —— 10×1000
		· data.test.inputs —— 256×9000
		· data.test.targets —— 10×9000
	"""
	data_train = {"X":data.training.inputs,"y":data.training.targets}
	data_valid = {"X":data.validation.inputs,"y":data.validation.targets}
	data_test = {"X":data.test.inputs,"y":data.test.targets}
	n_train = data_train["X"].shape[1]									 # number of training samples
	params = initial_model(n_hidden)									 # initialize the two weight matrices
	theta = model2theta(params)											 # flatten the two matrices and concatenate them into one vector
	test_gradient(params,data_train,wd,n_hidden)						 # check that the analytic gradient is correct
	v = 0																 # initialize the velocity
	loss_train = []														 # training-loss history
	loss_valid = []														 # validation-loss history
	best = {}															 # best parameters found so far
	if do_early_stopping:												 # early stopping
		best["theta"] = 0
		best["loss_valid"] = np.inf
		best["iter"] = -1

	for t in range(n_iters+1):											 # stochastic gradient descent loop
		batch_start = (t*minibatch_size) % n_train						 # slide a fixed-size window over the training set
		data_batch = {
			"X": data_train["X"][:,batch_start:batch_start+minibatch_size],
			"y": data_train["y"][:,batch_start:batch_start+minibatch_size],
		}
		if isNestrov:													 # Nesterov's method
			temp = theta + momentum_mul*v								 # first take the look-ahead step along the previous velocity
			loss,grad = eval_obj_grad(theta2model(temp),data_batch,wd,n_hidden)
			grad_vec = model2theta(grad)								 # gradient as a vector
			v = momentum_mul*v - grad_vec								 # the actual descent direction for this step
			theta += learning_rate*v									 # update the parameters ("+=" so the gradient is descended, consistent with the momentum branch)

		else:															 # classical momentum
			loss,grad = eval_obj_grad(theta2model(theta),data_batch,wd,n_hidden)
			grad_vec = model2theta(grad)								 # gradient as a vector
			v = momentum_mul*v - grad_vec								 # momentum-adjusted descent direction
			theta += learning_rate*v									 # update the parameters

		params = theta2model(theta)										 # convert the updated theta back into params form (the two weight matrices)
		loss = eval_obj(params,data_train,wd)							 # training loss
		loss_train.append(loss)											 # record training loss
		loss = eval_obj(params,data_valid,wd)							 # validation loss
		loss_valid.append(loss)											 # record validation loss
		if do_early_stopping and loss_valid[-1]<best["loss_valid"]:		 # update best if the validation loss improved
			best["theta"] = theta.copy()
			best["loss_valid"] = loss_valid[-1]
			best["iter"] = t
		if t%(max(1,n_iters//10))==0:									 # print progress roughly ten times over the run
			print("After %d iterations - ||theta|| is %.4e - training loss is %.4e - and validation loss is %.4e\n"%(t,norm(theta),loss_train[-1],loss_valid[-1]))
	test_gradient(params,data_train,wd,n_hidden)
	""" 实验绘图 """
	plt.close()
	plt.figure()
	plt.plot(loss_train,label="training loss")
	plt.plot(loss_valid,label="validation loss")
	plt.legend(loc="best")
	plt.show()
	if do_early_stopping:
		print("Early stopping: validation loss: %.3e, was lowest after %d iterations" % (best["loss_valid"],best["iter"]))
		theta = best["theta"]
	params = theta2model(theta)
	""" 检查效果 """
	datasets = [data_train,data_valid,data_test]
	acc = [accuracy(params,x) for x in datasets]
	classification_loss = [eval_obj(params,x,0) for x in datasets]
	print("Accuracy: training %.4f,validation %.4f,testing %.4f" % (acc[0],acc[1],acc[2]))
	info = {
		"loss_train": classification_loss[0],
		"loss_valid": classification_loss[1],
		"loss_test": classification_loss[2],
	}
	print(info)

def eval_obj(params,data,wd):											 # compute the loss function
	W_hid,W_out = params["W_hid"],params["W_out"]						 # 100×256 10×100
	X,y = data["X"],data["y"]											 # 256×1000 10×1000
	""" forward pass """
	hidden_in = np.dot(W_hid,X)
	hidden_out = sigmoid(hidden_in)
	y_hat = np.dot(W_out,hidden_out)
	exps = np.exp(y_hat)
	y_hat = exps/(np.sum(exps,axis=0))									 # softmax output
	assert y_hat.shape==y.shape											 # sanity check
	loss = 0
	for i in range(y_hat.shape[1]):
		loss -= np.dot(y[:,i].T,np.log(y_hat[:,i]))
	loss /= y_hat.shape[1]
	loss += wd/2*(np.linalg.norm(W_hid,ord="fro")**2+np.linalg.norm(W_out,ord="fro")**2)
	return loss
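# Note: np.exp(y_hat) above can overflow when the logits get large. The function below is only
# a sketch of an equivalent but numerically safer, fully vectorised version of the same loss;
# it is not part of the original assignment code.
def eval_obj_stable(params,data,wd):
	W_hid,W_out = params["W_hid"],params["W_out"]
	X,y = data["X"],data["y"]
	logits = np.dot(W_out,sigmoid(np.dot(W_hid,X)))						 # 10×m logits
	logits = logits - logits.max(axis=0,keepdims=True)					 # shift columns for numerical stability
	log_probs = logits - np.log(np.sum(np.exp(logits),axis=0,keepdims=True))
	loss = -np.sum(y*log_probs)/y.shape[1]								 # vectorised cross entropy
	loss += wd/2*(np.linalg.norm(W_hid,ord="fro")**2+np.linalg.norm(W_out,ord="fro")**2)
	return loss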

def eval_obj_grad(params,data,wd,n_hidden):								 # compute the loss and its gradient
	W_hid,W_out = params["W_hid"],params["W_out"]						 # 100×256 10×100
	X,y = data["X"],data["y"]											 # 256×1000 10×1000
	m = y.shape[1]														 # number of samples
	""" forward pass """
	hidden_in = np.dot(W_hid,X)											 # hidden-layer input 100×1000
	hidden_out = sigmoid(hidden_in)										 # hidden-layer output 100×1000
	y_hat = np.dot(W_out,hidden_out)									 # output-layer input 10×1000
	exps = np.exp(y_hat)
	y_hat = exps/(np.sum(exps,axis=0))									 # softmax output 10×1000
	assert y_hat.shape==y.shape											 # sanity check
	loss = 0
	for i in range(y_hat.shape[1]):
		loss -= np.dot(y[:,i].T,np.log(y_hat[:,i]))
	loss /= y_hat.shape[1]
	loss += wd/2*(np.linalg.norm(W_hid,ord="fro")**2+np.linalg.norm(W_out,ord="fro")**2)
	""" backward pass """
	dz_2 = y_hat - y													 # error between prediction and target 10×1000
	grad_W_out = 1/m * np.dot(dz_2,hidden_out.T)						 # the transpose is the key step 10×100
	dz_1 = np.dot(W_out.T,dz_2) * hidden_out*(1-hidden_out)			 # derivative of the sigmoid activation 100×1000
	grad_W_hid = 1/m * np.dot(dz_1,X.T)									 # 100×256
	grad = {															 # return the gradient
		"W_out": grad_W_out + wd*W_out,									 # regularization gradient (I only remembered it after SGD blew up)
		"W_hid": grad_W_hid + wd*W_hid,									 # regularization gradient (I only remembered it after SGD blew up)
	}
	return loss,grad
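# Why dz_2 = y_hat - y: with a softmax output and cross-entropy loss, for one sample with
# logits z_2, prediction y_hat = softmax(z_2) and one-hot target y, we have
#   dL/dz_2 = y_hat - y,
# so averaging over the m samples of the batch gives exactly the expressions above:
#   grad_W_out = (1/m) * (y_hat - y) @ hidden_out.T                               (+ wd*W_out from the L2 penalty)
#   dz_1       = W_out.T @ dz_2 * sigmoid'(hidden_in) = W_out.T @ dz_2 * hidden_out*(1-hidden_out)
#   grad_W_hid = (1/m) * dz_1 @ X.T                                               (+ wd*W_hid from the L2 penalty)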

def initial_model(n_hid):
	n_params = (256+10)*n_hid											 # total number of entries in the input->hidden and hidden->output weight matrices
	as_row_vector = np.cos(np.arange(n_params))							 # cosines of 0,1,...,n_params-1: a deterministic stand-in for random initialization
	params = {}
	params["W_hid"] = as_row_vector[:256*n_hid].reshape((n_hid,256))*0.1
	params["W_out"] = as_row_vector[256*n_hid:].reshape((10,n_hid))*0.1
	return params
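# For comparison only (not used by the assignment, which relies on the reproducible cosine
# initialization above): a more conventional random initialization might look like this.
def initial_model_random(n_hid,seed=0):
	rng = np.random.RandomState(seed)									 # fixed seed keeps runs reproducible
	params = {}
	params["W_hid"] = rng.normal(scale=0.1,size=(n_hid,256))
	params["W_out"] = rng.normal(scale=0.1,size=(10,n_hid))
	return params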

def test_gradient(params,data,wd,n_hidden):								 # check the analytic gradient against finite differences
	loss,analytic_grad = eval_obj_grad(params,data,wd,n_hidden)			 # loss and analytic gradient at the current parameters
	num_checks = 100
	theta = model2theta(params)
	grad_ana = model2theta(analytic_grad)
	delta = 1e-4
	threshold = 1e-5
	for i in range(num_checks):
		ind = (i*1299283) % theta.size									 # pseudo-randomly pick a coordinate to check
		grad_ind_ana = grad_ana[ind]
		theta1 = theta.copy()
		theta1[ind] += delta
		l1 = eval_obj(theta2model(theta1),data,wd)
		theta2 = theta.copy()
		theta2[ind] -= delta
		l2 = eval_obj(theta2model(theta2),data,wd)
		grad_ind_fin = (l1-l2)/(2*delta)								 # central difference: (f(theta+delta)-f(theta-delta))/(2*delta)
		diff = abs(grad_ind_ana - grad_ind_fin)
		if diff<threshold: continue										 # accept if the absolute error is small
		if diff/(abs(grad_ind_ana)+abs(grad_ind_fin))<threshold: continue	 # otherwise accept if the relative error is small
		raise AssertionError("%d-th: l %.3e\nl1 %.3e\nl2 %.3e\nanalytic %.3e\nfd %.3e\ndiff %.3e\n" % (i,loss,l1,l2,grad_ind_ana,grad_ind_fin,diff))
	print("Gradient test passed")

def model2theta(params):												 # flatten the two weight matrices and concatenate them into one vector
	theta = np.concatenate((params["W_out"].flatten(),params["W_hid"].flatten()))
	return theta

def theta2model(theta):													 # unpack the flat vector back into the two weight matrices
	n_hid = theta.size // (256+10)										 # theta.size is the length of theta; a//b is integer division, i.e. int(a/b)
	params = {}
	params["W_out"] = np.reshape(theta[:n_hid*10],(10,n_hid))
	params["W_hid"] = np.reshape(theta[n_hid*10:],(n_hid,256))
	return params
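# Quick sanity check (a usage sketch, not part of the assignment): packing and unpacking
# should round-trip exactly, because theta2model slices W_out first and W_hid second,
# matching the concatenation order used in model2theta:
#   p = initial_model(100)
#   q = theta2model(model2theta(p))
#   assert np.allclose(p["W_hid"],q["W_hid"]) and np.allclose(p["W_out"],q["W_out"])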

def accuracy(params,data):												 # compute classification accuracy
	W_hid,W_out = params["W_hid"],params["W_out"]
	index_transpose = np.nonzero(data["y"].T)							 # targets are one-hot, so the column index of each nonzero entry is the true class
	true_label = index_transpose[1]
	a_hidden = W_hid.dot(data["X"])
	h_hidden = sigmoid(a_hidden)
	a_out = W_out.dot(h_hidden)
	pred = a_out.argmax(axis=0)											 # softmax is monotone, so the argmax of the logits suffices
	return np.mean(pred==true_label)

def log_sum_exp(X):														 # natural log of the column-wise sum of exponentials of X (defined here but not used above)
	return np.log(np.sum(np.exp(X),axis=0))
	
def sigmoid(X):															 # activation function
	return 1/(1+np.exp(-X))

def grad_sigmoid(X):													 # derivative of the activation function (the backward pass above inlines this expression)
	return sigmoid(X)*(1-sigmoid(X))
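# Worked identity behind grad_sigmoid (and the hidden_out*(1-hidden_out) factor in eval_obj_grad):
#   sigmoid(x)  = 1/(1+e^(-x))
#   sigmoid'(x) = e^(-x)/(1+e^(-x))^2 = sigmoid(x)*(1-sigmoid(x))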

def P2Q1():
	print("Part2 Question1...")
	params = [															 # parameter combinations to test
		[0,10,70,0.005,0,False,4],								
		[0,10,70,0.01,0,False,4],									
		[0,10,70,0.05,0,False,4],										
		[0,10,70,0.2,0,False,4],										
		[0,10,70,1.0,0,False,4],
		[0,10,70,5.0,0,False,4],
	]
	for param in params:
		print("正在测试参数:wd is  - n_hidden is  - n_iters is  - learning_rate is  - momentum_mul is  - do_early_stopping is  - minibatch_size is ...".format(param[0],param[1],param[2],param[3],param[4],param[5],param[6]))
		train(param[0],param[1],param[2],param[3],param[4],param[5],param[6])

def P2Q2():
	print("Part2 Question2...")
	param = [0,10,100,None,None,False,4]
	learning_rates = [0.01,0.05,0.2,1.0,5.0]
	momentums_muls = [0,0.5,0.9]	
	for learning_rate in learning_rates:
		for momentum_mul in momentums_muls:
			tempParam = param[:]
			tempParam[3] = learning_rate
			tempParam[4] = momentum_mul
			print("正在测试参数:wd is  - n_hidden is  - n_iters is  - learning_rate is  - momentum_mul is  - do_early_stopping is  - minibatch_size is ...".format(tempParam[0],tempParam[1],tempParam[2],tempParam[3],tempParam[4],tempParam[5],tempParam[6]))
			train(tempParam[0],tempParam[1],tempParam[2],tempParam[3],tempParam[4],tempParam[5],tempParam[6])

def P2Q3():
	""" 与P2Q1比较 """
	print("Part2 Question3...")
	params = [															 # parameter combinations to test
		[0,10,70,0.005,0,False,4],								
		[0,10,70,0.01,0,False,4],									
		[0,10,70,0.05,0,False,4],										
		[0,10,70,0.2,0,False,4],										
		[0,10,70,1.0,0,False,4],
		[0,10,70,5.0,0,False,4],
	]
	for param in params:
		print("正在测试参数:wd is  - n_hidden is  - n_iters is  - learning_rate is  - momentum_mul is  - do_early_stopping is  - minibatch_size is ...".format(param[0],param[1],param[2],param[3],param[4],param[5],param[6]))
		train(param[0],param[1],param[2],param[3],param[4],param[5],param[6],True)
	print("###########################################################")
	print("########################  分界线  ##########################")
	print("###########################################################")
	""" 与P2Q2比较 """
	param = [0,10,100,None,None,False,4]
	learning_rates = [0.01,0.05,0.2,1.0,5.0]
	momentums_muls = [0,0.5,0.9]	
	for learning_rate in learning_rates:
		for momentum_mul in momentums_muls:
			tempParam = param[:]
			tempParam[3] = learning_rate
			tempParam[4] = momentum_mul
			print("正在测试参数:wd is  - n_hidden is  - n_iters is  - learning_rate is  - momentum_mul is  - do_early_stopping is  - minibatch_size is ...".format(tempParam[0],tempParam[1],tempParam[2],tempParam[3],tempParam[4],tempParam[5],tempParam[6]))
			train(tempParam[0],tempParam[1],tempParam[2],tempParam[3],tempParam[4],tempParam[5],tempParam[6],True)

def P3Q1():
	print("Part3 Question1...")
	params = [															 # parameter combinations to test
		[0,200,1000,0.2,0.9,False,4],								
		[1e-4,200,1000,0.2,0.9,False,4],								
		[1e-3,200,1000,0.2,0.9,False,4],								
		[1e-2,200,1000,0.2,0.9,False,4],								
		[1e-1,200,1000,0.2,0.9,False,4],								
		[1,200,1000,0.2,0.9,False,4],								
		[10,200,1000,0.2,0.9,False,4],								
	]
	for param in params:
		print("正在测试参数:wd is  - n_hidden is  - n_iters is  - learning_rate is  - momentum_mul is  - do_early_stopping is  - minibatch_size is ...".format(param[0],param[1],param[2],param[3],param[4],param[5],param[6]))
		train(param[0],param[1],param[2],param[3],param[4],param[5],param[6])

def P3Q2():
	print("Part3 Question2...")
	param = [0,None,1000,0.2,0.9,None,4]
	n_hiddens = [10,50,100,200,300]
	for n_hidden in n_hiddens:
		tempParam = param[:]
		tempParam[1] = n_hidden
		tempParam[5] = True
		print("正在测试参数:wd is  - n_hidden is  - n_iters is  - learning_rate is  - momentum_mul is  - do_early_stopping is  - minibatch_size is ...".format(tempParam[0],tempParam[1],tempParam[2],tempParam[3],tempParam[4],tempParam[5],tempParam[6]))
		train(tempParam[0],tempParam[1],tempParam[2],tempParam[3],tempParam[4],tempParam[5],tempParam[6],True)

		tempParam = param[:]
		tempParam[1] = n_hidden
		tempParam[5] = False
		print("正在测试参数:wd is  - n_hidden is  - n_iters is  - learning_rate is  - momentum_mul is  - do_early_stopping is  - minibatch_size is ...".format(tempParam[0],tempParam[1],tempParam[2],tempParam[3],tempParam[4],tempParam[5],tempParam[6]))
		train(tempParam[0],tempParam[1],tempParam[2],tempParam[3],tempParam[4],tempParam[5],tempParam[6],True)

def train_1(wd,n_hidden,n_iters,learning_rate,momentum_mul,do_early_stopping=False,minibatch_size=10,isNestrov=False):
	"""
		· wd: 权重衰减
		· n_hidden: 隐层结点数量
		· n_iters: 随机梯度下降迭代次数
		· learning_rate: 学习速率
		· momentum_mul: 速率衰减系数(这个系数将附加在前一次的动量上,然后瞎搞)
		· do_early_stopping: 是否提早结束(如果是则简单的输出过去epoch中最优的那个)
		· minibatch_size: 随机梯度下降的小批尺寸
		· inNestrov: 是否使用Nestrov方法
		· return: 数据集上的分类损失
	"""
	data_file = loadmat("data.mat",squeeze_me=True,struct_as_record=False)
	data = data_file["data"]											 # load the dataset
	"""
		· data.training.inputs —— 256×1000
		· data.training.targets —— 10×1000
		· data.validation.inputs —— 256×1000
		· data.validation.targets —— 10×1000
		· data.test.inputs —— 256×9000
		· data.test.targets —— 10×9000
	"""
	data_train = {"X":data.training.inputs,"y":data.training.targets}
	data_valid = {"X":data.validation.inputs,"y":data.validation.targets}
	data_test = {"X":data.test.inputs,"y":data.test.targets}
	n_train = data_train["X"].shape[1]									 # number of training samples
	params = initial_model(n_hidden)									 # initialize the two weight matrices
	theta = model2theta(params)											 # flatten the two matrices and concatenate them into one vector
	test_gradient(params,data_train,wd,n_hidden)						 # check that the analytic gradient is correct
	v = 0																 # initialize the velocity
	loss_train = []														 # training-loss history
	loss_valid = []														 # validation-loss history
	best = {}															 # best parameters found so far
	if do_early_stopping:												 # early stopping
		best["theta"] = 0
		best["loss_valid"] = np.inf
		best["iter"] = -1

	for t in range(n_iters+1):											 # stochastic gradient descent loop
		batch_start = (t*minibatch_size) % n_train						 # slide a fixed-size window over the training set
		data_batch = {
			"X": data_train["X"][:,batch_start:batch_start+minibatch_size],
			"y": data_train["y"][:,batch_start:batch_start+minibatch_size],
		}
		if isNestrov:													 # Nesterov's method
			temp = theta + momentum_mul*v								 # first take the look-ahead step along the previous velocity
			loss,grad = eval_obj_grad(theta2model(temp),data_batch,wd,n_hidden)
			grad_vec = model2theta(grad)								 # gradient as a vector
			v = momentum_mul*v - grad_vec								 # the actual descent direction for this step
			theta += learning_rate*v									 # update the parameters ("+=" so the gradient is descended, consistent with the momentum branch)

		else:															 # classical momentum
			loss,grad = eval_obj_grad(theta2model(theta),data_batch,wd,n_hidden)
			grad_vec = model2theta(grad)								 # gradient as a vector
			v = momentum_mul*v - grad_vec								 # momentum-adjusted descent direction
			theta += learning_rate*v									 # update the parameters

		params = theta2model(theta)										 # convert the updated theta back into params form (the two weight matrices)
		loss = eval_obj(params,data_train,wd)							 # training loss
		loss_train.append(loss)											 # record training loss
		loss = eval_obj(params,data_valid,wd)							 # validation loss
		loss_valid.append(loss)											 # record validation loss
		if do_early_stopping and loss_valid[-1]<best["loss_valid"]:		 # update best if the validation loss improved
			best["theta"] = theta.copy()
			best["loss_valid"] = loss_valid[-1]
			best["iter"] = t
		if t%(max(1,n_iters//10))==0:									 # print progress roughly ten times over the run
			print("After %d iterations - ||theta|| is %.4e - training loss is %.4e - and validation loss is %.4e\n"%(t,norm(theta),loss_train[-1],loss_valid[-1]))
	test_gradient(params,data_train,wd,n_hidden)
	""" 实验绘图 """
	plt.close()
	plt.figure()
	plt.plot(loss_train,label="training loss")
	plt.plot(loss_valid,label="validation loss")
	plt.legend(loc="best")
	plt.show()
	if do_early_stopping:
		print("Early stopping: validation loss: %.3e, was lowest after %d iterations" % (best["loss_valid"],best["iter"]))
		theta = best["theta"]
	params = theta2model(theta)
	""" 检查效果 """
	datasets = [data_train,data_valid,data_test]
	acc = [accuracy(params,x) for x in datasets]
	classification_loss = [eval_obj(params,x,0) for x in datasets]
	print("Accuracy: training %.4f,validation %.4f,testing %.4f" % (acc[0],acc[1],acc[2]))
	return acc[1]

def train_2(wd,n_hidden,n_iters,learning_rate,momentum_mul,do_early_stopping=False,minibatch_size=10,isNestrov=False):
	"""
		· wd: 权重衰减
		· n_hidden: 隐层结点数量
		· n_iters: 随机梯度下降迭代次数
		· learning_rate: 学习速率
		· momentum_mul: 速率衰减系数(这个系数将附加在前一次的动量上,然后瞎搞)
		· do_early_stopping: 是否提早结束(如果是则简单的输出过去epoch中最优的那个)
		· minibatch_size: 随机梯度下降的小批尺寸
		· inNestrov: 是否使用Nestrov方法
		· return: 数据集上的分类损失
	"""
	data_file = loadmat("data.mat",squeeze_me=True,struct_as_record=False)
	data = data_file["data"]											 # load the dataset
	"""
		· data.training.inputs —— 256×1000
		· data.training.targets —— 10×1000
		· data.validation.inputs —— 256×1000
		· data.validation.targets —— 10×1000
		· data.test.inputs —— 256×9000
		· data.test.targets —— 10×9000
	"""
	data_valid = {"X":data.training.inputs,"y":data.training.targets}	 # note: training and validation are deliberately swapped here (second fold of the 2-fold scheme in P3Q3)
	data_train = {"X":data.validation.inputs,"y":data.validation.targets}
	data_test = {"X":data.test.inputs,"y":data.test.targets}
	n_train = data_train["X"].shape[1]									 # number of training samples
	params = initial_model(n_hidden)									 # initialize the two weight matrices
	theta = model2theta(params)											 # flatten the two matrices and concatenate them into one vector
	test_gradient(params,data_train,wd,n_hidden)						 # check that the analytic gradient is correct
	v = 0																 # initialize the velocity
	loss_train = []														 # training-loss history
	loss_valid = []														 # validation-loss history
	best = {}															 # best parameters found so far
	if do_early_stopping:												 # early stopping
		best["theta"] = 0
		best["loss_valid"] = np.inf
		best["iter"] = -1

	for t in range(n_iters+1):											 # stochastic gradient descent loop
		batch_start = (t*minibatch_size) % n_train						 # slide a fixed-size window over the training set
		data_batch = {
			"X": data_train["X"][:,batch_start:batch_start+minibatch_size],
			"y": data_train["y"][:,batch_start:batch_start+minibatch_size],
		}
		if isNestrov:													 # Nesterov's method
			temp = theta + momentum_mul*v								 # first take the look-ahead step along the previous velocity
			loss,grad = eval_obj_grad(theta2model(temp),data_batch,wd,n_hidden)
			grad_vec = model2theta(grad)								 # gradient as a vector
			v = momentum_mul*v - grad_vec								 # the actual descent direction for this step
			theta += learning_rate*v									 # update the parameters ("+=" so the gradient is descended, consistent with the momentum branch)

		else:															 # classical momentum
			loss,grad = eval_obj_grad(theta2model(theta),data_batch,wd,n_hidden)
			grad_vec = model2theta(grad)								 # gradient as a vector
			v = momentum_mul*v - grad_vec								 # momentum-adjusted descent direction
			theta += learning_rate*v									 # update the parameters

		params = theta2model(theta)										 # convert the updated theta back into params form (the two weight matrices)
		loss = eval_obj(params,data_train,wd)							 # training loss
		loss_train.append(loss)											 # record training loss
		loss = eval_obj(params,data_valid,wd)							 # validation loss
		loss_valid.append(loss)											 # record validation loss
		if do_early_stopping and loss_valid[-1]<best["loss_valid"]:		 # update best if the validation loss improved
			best["theta"] = theta.copy()
			best["loss_valid"] = loss_valid[-1]
			best["iter"] = t
		if t%(max(1,n_iters//10))==0:									 # print progress roughly ten times over the run
			print("After %d iterations - ||theta|| is %.4e - training loss is %.4e - and validation loss is %.4e\n"%(t,norm(theta),loss_train[-1],loss_valid[-1]))
	test_gradient(params,data_train,wd,n_hidden)
	""" 实验绘图 """
	plt.close()
	plt.figure()
	plt.plot(loss_train,label="training loss")
	plt.plot(loss_valid,label="validation loss")
	plt.legend(loc="best")
	plt.show()
	if do_early_stopping:
		print("Early stopping: validation loss: %.3e, was lowest after %d iterations" % (best["loss_valid"],best["iter"]))
		theta = best["theta"]
	params = theta2model(theta)
	""" 检查效果 """
	datasets = [data_train,data_valid,data_test]
	acc = [accuracy(params,x) for x in datasets]
	classification_loss = [eval_obj(params,x,0) for x in datasets]
	print("Accuracy: training %.4f,validation %.4f,testing %.4f" % (acc[0],acc[1],acc[2]))
	return acc[1]

def P3Q3():
	wds = [0,1e-4,1e-3,1e-2,1e-1,1,10]
	n_hiddens = [10,50,100,200,300]
	n_iters = [1000]
	learning_rates = [0.2]
	momentum_muls = [0.9]
	do_early_stoppings = [True,False]
	minibatch_sizes = [4]
	isNestrovs = [False]
	results = {}														 # store all results
	count = 0															 # counter
	for wd in wds:
		for n_hidden in n_hiddens:
			for n_iter in n_iters:
				for learning_rate in learning_rates:
					for momentum_mul in momentum_muls:
						for do_early_stopping in do_early_stoppings:
							for minibatch_size in minibatch_sizes:
								for isNestrov in isNestrovs:
									count += 1
									print("###################################################")
									print("################### 第个组合 #####################".format(count))
									print("###################################################")
									print("正在测试参数 - 1折:wd is  - n_hidden is  - n_iters is  - learning_rate is  - momentum_mul is  - do_early_stopping is  - minibatch_size is  - isNestrov is ...".format(wd,n_hidden,n_iter,learning_rate,momentum_mul,do_early_stopping,minibatch_size,isNestrov))
									acc1 = train_1(wd,n_hidden,n_iter,learning_rate,momentum_mul,do_early_stopping,minibatch_size,isNestrov)
									print("正在测试参数 - 2折:wd is  - n_hidden is  - n_iters is  - learning_rate is  - momentum_mul is  - do_early_stopping is  - minibatch_size is  - isNestrov is ...".format(wd,n_hidden,n_iter,learning_rate,momentum_mul,do_early_stopping,minibatch_size,isNestrov))									
									acc2 = train_2(wd,n_hidden,n_iter,learning_rate,momentum_mul,do_early_stopping,minibatch_size,isNestrov)
									results["--".format(wd,n_hidden,do_early_stopping)] = (acc1+acc2)/2
	for key,value in results.items():
		print("\\t平均精度 ".format(key,value))

if __name__ == "__main__":
	P3Q3()
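	# Usage note (a sketch, not from the original post): the guard above runs only P3Q3();
	# to reproduce the other experiments, call the corresponding driver instead, e.g.
	#   P2Q1(); P2Q2(); P2Q3(); P3Q1(); P3Q2()
	# or call train directly, e.g.
	#   train(wd=1e-3, n_hidden=100, n_iters=1000, learning_rate=0.2, momentum_mul=0.9,
	#         do_early_stopping=True, minibatch_size=4, isNestrov=False)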

 
