机器学习：线性回归（最小二乘法/梯度下降法）、多项式回归、logistic回归、softmax回归

Posted 2021-06-26 nefu_ljw

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了机器学习线性回归（最小二乘法/梯度下降法）多项式回归logistic回归softmax回归相关的知识，希望对你有一定的参考价值。

本文部分代码参考github：Machine-Learning-for-Beginner-by-Python3

本文所有代码和数据集文件可在此下载：https://download.csdn.net/download/ljw_study_in_CSDN/19546447

文章目录

（一）线性回归和多项式回归

根据给定数据集，利用线性回归和多项式回归模型训练和测试一个数据预测模型，并对模型的性能和预测能力进行分析；

1. 线性回归（最小二乘法/梯度下降法）

实验代码：

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing as spp  # 引入数据预处理的库
# Append a constant column of ones as the last column of the sample matrix
def Trans(xdata):
    """Return ``xdata`` with one extra trailing column filled with 1.

    The extra column lets the intercept be stored as the last entry of the
    weight vector.  ``xdata`` must be two-dimensional (one row per sample),
    because the ones are joined along axis 1.
    """
    bias_column = np.ones((len(xdata), 1))
    return np.concatenate((xdata, bias_column), axis=1)
# Closed-form ordinary least squares, i.e. W = (X^T X)^-1 X^T Y
def ljw_leastsq(xdata, ydata):
    """Solve the normal equations for the least-squares weights.

    Args:
        xdata: 2-D sample matrix, one row per sample.  The bias column is
            appended here via ``Trans``.
        ydata: targets, one row per sample.

    Returns:
        The weight array (bias term last), or ``None`` after printing a
        message when ``X^T X`` is singular.
    """
    xdata = Trans(xdata)
    xTx = np.dot(xdata.T, xdata)
    # A floating-point determinant is rarely exactly 0 even for a singular
    # matrix, so singularity is detected through the numerical rank instead.
    if np.linalg.matrix_rank(xTx) < xTx.shape[0]:
        print("the Matrix cannot do inverse!")
        return None
    # Solving the linear system avoids forming the explicit inverse.
    return np.linalg.solve(xTx, np.dot(xdata.T, ydata))
# Batch gradient descent on the mean-squared-error cost
def Gradient(xdata1, ydata, learn_rate=0.1, iter_times=100000, error=1e-8):
    """Fit linear-regression weights by gradient descent.

    Args:
        xdata1: 2-D sample matrix, one row per sample.  The bias column is
            appended here via ``Trans``.
        ydata: targets; a 1-D sequence is treated as a single column.
        learn_rate: step size applied to the gradient.
        iter_times: maximum number of iterations.
        error: stop once the cost decreases by less than this amount
            between two consecutive iterations.

    Returns:
        The weight array with the bias term in the last row.
    """
    xdata = Trans(xdata1)
    n_samples = len(xdata)
    # A 1-D target would broadcast against the (n, 1) predictions into an
    # (n, n) residual matrix, so it is reshaped into a column first.
    ydata = np.asarray(ydata)
    if ydata.ndim == 1:
        ydata = ydata.reshape(-1, 1)
    # Weights (w and b) start at zero.
    weights = np.zeros((xdata.shape[1], 1))
    previous_cost = None
    for _ in range(iter_times):
        residual = np.dot(xdata, weights) - ydata
        # Mean squared error of the current weights.
        cost = np.sum(residual ** 2) / n_samples
        # Gradient of the MSE with respect to the weights, then one step.
        gradient = 2 * np.dot(xdata.T, residual) / n_samples
        weights = weights - learn_rate * gradient
        # Early stop: the cost no longer decreases by at least ``error``.
        # ``<=`` also stops when the cost is exactly unchanged, instead of
        # spinning until ``iter_times`` is exhausted.
        if previous_cost is not None and 0 <= previous_cost - cost < error:
            break
        previous_cost = cost
    return weights
def predict(xdata, ws):
    """Evaluate the linear model: append the bias column to ``xdata`` and multiply by ``ws``."""
    design_matrix = Trans(xdata)
    return design_matrix.dot(ws)
# Load a text file and min-max normalise its x column
def preprocess_data(filename):
    """Read a headerless table and return ``(x, y)`` as column arrays.

    Column 0 is taken as x and column 1 as y; both are returned with
    shape (n, 1).  Only x is rescaled, using ``MinMaxScaler``.
    """
    table = pd.read_table(filename, header=None).values
    # Indexing with a one-element list keeps the result two-dimensional.
    x_column = table[:, [0]]
    y_column = table[:, [1]]
    scaled_x = spp.MinMaxScaler().fit_transform(x_column)
    return scaled_x, y_column
def draw(pic_num, fun, title):
    """Fit the training data with ``fun`` and plot the line over the samples.

    Reads the module-level ``x_train`` / ``y_train``.  ``fun`` is called as
    ``fun(x_train, y_train)`` and must return weights indexable as
    ``w[0][0]`` (slope) and ``w[1][0]`` (intercept).
    """
    plt.figure(pic_num)
    w = fun(x_train, y_train)  # least-squares or gradient-descent solver
    fitted_values = predict(x_train, w)
    plt.plot(x_train, fitted_values, "r", label="回归直线 $y=ax+b$")  # fitted line, in red
    plt.scatter(x_train, y_train, label="原数据")  # raw samples
    plt.legend()
    slope = round(w[0][0], 3)
    intercept = round(w[1][0], 3)
    equation = "y=%s*x+%s" % (slope, intercept)  # text form of the fitted line
    plt.text(0.05, 18, equation, fontsize=20)
    plt.title(title)
    plt.show()
if __name__ == "__main__":
    # draw() reads x_train and y_train as module-level names, so they are
    # bound here rather than passed as arguments.
    x_train, y_train = preprocess_data("train.txt")
    plt.rcParams['font.sans-serif'] = 'SimSun'  # font used for the Chinese labels
    plots = (
        (1, ljw_leastsq, "线性回归-最小二乘法拟合"),
        (2, Gradient, "线性回归-梯度下降法拟合"),
    )
    for figure_number, solver, caption in plots:
        draw(figure_number, solver, caption)

实验结果：
在这里插入图片描述

从上图可以发现，当设置梯度下降法的超参数error=1e-8时，与最小二乘法计算得到的系数w相差较小。通过不断调小参数error，梯度下降法算出的系数将更接近最小二乘法计算出的结果。

2. 多项式回归

实验代码：

import numpy as np
import pandas as pd
from scipy.optimize import leastsq
import matplotlib.pyplot as plt
# Polynomial model; w holds the polynomial coefficients
def fit_func(w, x):
    """Return the value at ``x`` of the polynomial with coefficients ``w``.

    ``np.poly1d`` reads coefficients from the highest power down, so
    ``w = [a, b, c, d]`` means ``a*x**3 + b*x**2 + c*x + d``.
    """
    return np.poly1d(w)(x)
# Residual function
def err_func(w, x, y):
    """Return ``fit_func(w, x) - y``, i.e. prediction minus target."""
    prediction = fit_func(w, x)
    return prediction - y
# Fit a polynomial with n terms (highest power n-1), i.e. n coefficients
def n_poly(n, x, y):
    """Return the least-squares coefficients of an ``n``-term polynomial.

    The model is linear in its coefficients, so the least-squares problem
    is solved directly with ``np.polyfit`` rather than by an iterative
    solver started from random values; the result is therefore the same on
    every call.

    Args:
        n: number of coefficients (polynomial degree is ``n - 1``).
        x: 1-D sequence of sample positions.
        y: 1-D sequence of sample values, same length as ``x``.

    Returns:
        Array of ``n`` coefficients ordered from the highest power to the
        constant term, the order expected by ``np.poly1d``.
    """
    x_values = np.asarray(x, dtype=float)
    y_values = np.asarray(y, dtype=float)
    return np.polyfit(x_values, y_values, n - 1)
def read_data(filename):
    """Load a headerless table and return its first two columns as 1-D arrays."""
    table = pd.read_table(filename, header=None).values
    first_column = table[:, 0]
    second_column = table[:, 1]
    return first_column, second_column
if __name__ == '__main__':
    x_train, y_train = read_data("train.txt")  # load the samples
    x_temp = np.linspace(0, 25, 10000)  # dense grid on which each fitted curve is drawn
    row, col = 3, 4
    plt.rcParams["font.sans-serif"] = "SimSun"  # font used for the Chinese labels
    # A row x col grid of panels, one per polynomial degree m = 3 .. 14,
    # filled row by row.
    fig, ax = plt.subplots(row, col, figsize=(15, 10))
    degrees = range(3, 3 + row * col)
    for panel, degree in zip(ax.flat, degrees):
        coefficients = n_poly(degree + 1, x_train, y_train)
        panel.plot(x_temp, fit_func(coefficients, x_temp), 'r')  # fitted curve, in red
        panel.scatter(x_train, y_train)  # raw samples
        panel.set_title("m=" + str(degree))
        panel.legend(labels=["回归曲线", "原数据"])
    plt.show()

实验结果：
在这里插入图片描述
分别令最高幂次m=3,4,5,6,7,8,9,10,11,12,13,14。
从上图可以发现m=13时，多项式模型拟合效果最好。

（二）利用线性回归模型进行波士顿房价预测

利用马萨诸塞州波士顿郊区的房屋信息数据，利用线性回归模型训练和测试一个房价预测模型，并对模型的性能和预测能力进行测试分析；

实验代码：

import numpy as np
import pandas as pd
from sklearn import preprocessing as spp  # 引入数据预处理的库
import matplotlib.pyplot as plt  # 绘图
from pylab import mpl
# Linear regression fitted by gradient descent or by the closed-form solution
class LinearRegression:
    """Least-squares linear regression with the bias stored as the last weight.

    ``Gradient`` and ``Formula`` both set ``self.weights`` (one row per
    feature plus a final bias row); ``predict`` then uses those weights.
    """

    def __init__(self, learn_rate=0.2, iter_times=200000, error=1e-9):
        """Store the gradient-descent settings.

        Args:
            learn_rate: step size applied to the gradient.
            iter_times: maximum number of gradient-descent iterations.
            error: stop once the cost decreases by less than this amount
                between two consecutive iterations.
        """
        self.learn_rate = learn_rate
        self.iter_times = iter_times
        self.error = error

    def Trans(self, xdata):
        """Return ``xdata`` (2-D, one row per sample) with a trailing column of ones."""
        ones_column = np.ones((len(xdata), 1))
        return np.append(xdata, ones_column, axis=1)

    @staticmethod
    def _as_column(ydata):
        """Return ``ydata`` as an array, turning a 1-D input into an (n, 1) column."""
        ydata = np.asarray(ydata)
        if ydata.ndim == 1:
            # A 1-D target would broadcast against (n, 1) predictions
            # into an (n, n) matrix.
            ydata = ydata.reshape(-1, 1)
        return ydata

    # Gradient descent
    def Gradient(self, xdata, ydata):
        """Fit the weights by batch gradient descent on the MSE.

        Returns:
            ``(weights, cost_function)`` where ``cost_function`` is the list
            of MSE values, one per iteration performed.
        """
        xdata = self.Trans(xdata)
        ydata = self._as_column(ydata)
        n_samples = len(xdata)
        # Weights (w and b) start at zero.
        self.weights = np.zeros((xdata.shape[1], 1))
        # MSE recorded at every iteration.
        cost_function = []
        for _ in range(self.iter_times):
            residual = np.dot(xdata, self.weights) - ydata
            cost_function.append(np.sum(residual ** 2) / n_samples)
            # Gradient of the MSE with respect to the weights, then one step.
            dJ_dw = 2 * np.dot(xdata.T, residual) / n_samples
            self.weights = self.weights - self.learn_rate * dJ_dw
            # Early stop: the cost no longer decreases by at least ``error``.
            # ``<=`` also stops when the cost is exactly unchanged.
            if len(cost_function) > 1:
                if 0 <= cost_function[-2] - cost_function[-1] < self.error:
                    break
        return self.weights, cost_function

    # Closed-form solution
    def Formula(self, xdata, ydata):
        """Fit the weights directly as the least-squares solution.

        ``np.linalg.lstsq`` is used instead of explicitly inverting
        ``X^T X`` so that collinear features do not raise or produce
        unstable weights.

        Returns:
            ``(weights, cost)`` where ``cost`` holds two values: the MSE when
            predicting the mean of ``ydata``, and the MSE of the fitted model.
        """
        xdata = self.Trans(xdata)
        ydata = self._as_column(ydata)
        self.weights = np.linalg.lstsq(xdata, ydata, rcond=None)[0]
        y_predict = np.dot(xdata, self.weights)
        # Baseline: cost when the mean of y is used as the prediction.
        cost = [np.sum((ydata - np.mean(ydata)) ** 2) / len(xdata)]
        # The fit is obtained in one step, so only one further value is added.
        cost += [np.sum((y_predict - ydata) ** 2) / len(xdata)]
        return self.weights, cost

    # Prediction
    def predict(self, xdata):
        """Return the model output for ``xdata`` using the fitted ``self.weights``."""
        return np.dot(self.Trans(xdata), self.weights)
def figure(title, *datalist):
    """Plot every ``[values, label]`` entry as a line plus point markers, then show the figure."""
    for entry in datalist:
        values, name = entry[0], entry[1]
        plt.plot(values, '-', label=name, linewidth=2)
        plt.plot(values, 'o')
    plt.grid()
    plt.title(title)
    plt.legend()
    plt.show()
# Coefficient of determination (R^2)
def getR(ydata_tr, ydata_pre):
    """Return ``1 - SS_res / SS_tot`` for true values ``ydata_tr`` and predictions ``ydata_pre``."""
    residual_ss = np.square(ydata_tr - ydata_pre).sum()
    total_ss = np.square(ydata_tr - np.mean(ydata_tr)).sum()
    return 1 - residual_ss / total_ss
# Load the data, scale the features and split into training / test sets
def preprocess_data(percent=0.1):
    """Read ``Boston.csv`` and return a random train/test split.

    Every column except ``MEDV`` is min-max scaled and used as a feature;
    ``MEDV`` is the target.  ``percent`` is the fraction of rows drawn at
    random (without replacement) for the test set.

    Returns:
        ``(x_train, y_train, x_test, y_test)``; the y arrays have shape (n, 1)
        and rows keep their original file order inside each subset.
    """
    frame = pd.read_csv(r'Boston.csv')
    features = spp.MinMaxScaler().fit_transform(frame.drop('MEDV', axis=1).values)
    target = frame['MEDV'].values
    total = len(features)
    # Row numbers chosen for the test set.
    chosen = np.random.choice(np.arange(total), int(total * percent), replace=False)
    # A boolean mask selects rows in ascending order, which matches sorting
    # the chosen row numbers and listing the remainder in order.
    is_test = np.zeros(total, dtype=bool)
    is_test[chosen] = True
    is_train = ~is_test
    # Test data
    x_predict_data = features[is_test]
    y_predict_data = target[is_test].reshape(-1, 1)
    # Training data
    x_train_data = features[is_train]
    y_train_data = target[is_train].reshape(-1, 1)
    return x_train_data, y_train_data, x_predict_data, y_predict_data
if __name__ == "__main__":
    regressor = LinearRegression()
    lrdata = preprocess_data()  # (x_train, y_train, x_test, y_test)
    train_error = regressor.Gradient(lrdata[0], lrdata[1])  # (weights, MSE per iteration)
    predict_result = regressor.predict(lrdata[2])  # predictions on the test set
    train_pre_result = regressor.predict(lrdata[0])  # predictions on the training set
    mpl.rcParams['font.sans-serif'] = ['SimHei']  # font used for the Chinese labels
    mpl.rcParams['axes.unicode_minus'] = False
    # Training-error curve.
    figure('误差图 最终的MSE = %.4f' % (train_error[1][-1]), [train_error[1], 'error'])
    # Predicted against true values on the test set; the R^2 in the title is
    # computed on the training set.
    figure('预测值与真实值图 模型的' + r'$R^2=%.4f$' % (getR(lrdata[1], train_pre_result)), [predict_result, '预测值'],
           [lrdata[3], '真实值'])
    plt.show()
    # The escapes were doubled ('\\n'), which printed a literal backslash-n
    # instead of starting a new line.
    print('线性回归的系数为:\n w = %s, \nb= %s' % (train_error[0][:-1], train_error[0][-1]))

实验结果：
在这里插入图片描述

（三）利用logistic回归模型进行心脏病预测

心脏病是人类健康的头号杀手。全世界1/3的人口死亡是因心脏病引起的，而我国，每年有几十万人死于心脏病。所以，如果可以通过提取人体相关的体测指标，通过数据挖掘的方式来分析不同特征对于心脏病的影响，对于预测和预防心脏病将起到至关重要的作用。本文将会通过真实的数据，通过Python搭建心脏病预测案例。

实验代码：

import numpy as np
import pandas as pd
from prettytable import PrettyTable  # 用于计算混淆矩阵
import matplotlib.pyplot as plt
from pylab import mpl
def confusion(realy, outy):
    """Build a confusion matrix as a ``PrettyTable``.

    ``realy`` and ``outy`` are read as column arrays: the label of sample
    ``k`` is ``realy[k][0]`` / ``outy[k][0]``.  Classes are the distinct
    values of ``realy``, listed in descending order; rows are true classes
    and columns are predicted classes.
    """
    table = PrettyTable()
    classes = sorted(set(realy.T[0]), reverse=True)
    table.field_names = [' '] + ['预测:%d类' % label for label in classes]
    # Counts per true class, one entry per predicted class.
    counts = {}
    sample_count = len(realy)
    for actual in classes:
        counts[actual] = [
            sum(
                1
                for row in range(sample_count)
                if realy[row][0] == actual and outy[row][0] == guessed
            )
            for guessed in classes
        ]
    # One table row per true class.
    for actual in classes:
        table.add_row(['真实:%d类' % actual] + counts[actual])
    return table
# 返回混淆矩阵用到的数据TP，TN，FP，FN
def getmatrix(realy, outy, possclass=1):  # 默认类1 为正类
    TP = len(['0' for jj in range(以上是关于机器学习线性回归（最小二乘法/梯度下降法）多项式回归logistic回归softmax回归的主要内容，如果未能解决你的问题，请参考以下文章 
 机器学习模型和算法
 线性回归
 《白话机器学习的数学》公式整理
 第二篇[机器学习] 学习机器学习，从最简单的线性回归开始
 python实现线性回归原理
 机器学习入门之单变量线性回归（上）——梯度下降法