模式识别实验三:K均值算法和模糊C均值算法

Posted zstar-_

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了模式识别实验三:K均值算法和模糊C均值算法相关的知识,希望对你有一定的参考价值。

本文采用了sonar和Iris数据集。完整的程序代码、实验报告pdf和数据集可以通过下面的链接下载。
Link:https://download.csdn.net/download/qq1198768105/71411278

实验报告图片版

程序代码

以Iris数据集为例:

k-means

import numpy as np
import matplotlib.pyplot as plt
import random


# 正常导入数据
def load_dataset():
    """Read ./iris.txt and return ``(features, labels)``.

    The file is parsed as comma-separated text. Columns 0-3 become the
    feature matrix; column 4 holds the species name, which is turned into
    a numeric code: 1 for 'setosa', 2 for 'versicolor', 3 for 'virginica'
    and 0 for any other name.
    """
    path = './iris.txt'
    # The species column is read as text, the feature columns as floats.
    species = np.genfromtxt(path, delimiter=',', usecols=(4), dtype=str)
    features = np.genfromtxt(path, delimiter=',', usecols=(0, 1, 2, 3))

    codes = np.zeros(len(species))
    for code, name in enumerate(('setosa', 'versicolor', 'virginica'), start=1):
        codes[species == name] = code
    return features, codes


# 随机初始化k个聚类中心,从样本中随机选取
def randChosenCent(data, k):
    """Pick ``k`` distinct rows of ``data`` at random as initial centroids.

    Returns a list of ``k`` rows of ``data``.
    """
    sample_count = data.shape[0]
    # k different row indices drawn from [0, sample_count).
    picked = random.sample(range(0, sample_count), k)
    return [data[idx] for idx in picked]


def osdistance(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``."""
    squared_diff = np.power(vecA - vecB, 2)
    # The built-in sum adds the squared components one after another.
    total = sum(squared_diff)
    return np.sqrt(total)


def kMeans(data, k):
    """Cluster ``data`` into ``k`` groups with the k-means algorithm.

    The centroids start as ``k`` distinct samples drawn with
    ``random.sample``. Each pass assigns every sample to its nearest
    centroid (Euclidean distance, ties going to the lowest centroid
    index) and then moves each centroid to the mean of its members. The
    loop stops once a pass leaves every centroid unchanged.

    Args:
        data: 2-D array-like of shape (samples, features); any number of
            features is accepted. Values are converted to float.
        k: number of clusters, ``1 <= k <= len(data)``.

    Returns:
        A tuple ``(cluster, iterTime, centroids)``:
        ``cluster`` is a (samples, 2) float array holding the assigned
        centroid index in column 0 and the distance to it in column 1,
        ``iterTime`` is the number of assignment passes performed, and
        ``centroids`` is a (k, features) float array.

    Raises:
        ValueError: if ``k`` is smaller than 1 or larger than the number
            of samples.
    """
    # Float conversion keeps integer input from truncating the means.
    data = np.asarray(data, dtype=float)
    m = len(data)
    if not 1 <= k <= m:
        raise ValueError("k must be between 1 and the number of samples")

    # Column 0: assigned centroid index, column 1: distance to it.
    cluster = np.zeros((m, 2))
    # Fancy indexing copies the chosen rows, so updating the centroids
    # never writes back into ``data``.
    centroids = data[random.sample(range(m), k)]
    iterTime = 0

    while True:
        # dists[i, j] is the distance from sample i to centroid j.
        dists = np.linalg.norm(
            data[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2
        )
        nearest = dists.argmin(axis=1)
        cluster[:, 0] = nearest
        cluster[:, 1] = dists[np.arange(m), nearest]
        iterTime += 1

        centroids_pre = centroids.copy()
        for cent in range(k):
            members = data[nearest == cent]
            # A cluster can end up empty (e.g. two identical starting
            # centroids). Averaging zero rows would yield NaN, and since
            # NaN never compares equal the loop would never terminate,
            # so an empty cluster simply keeps its previous centroid.
            if len(members):
                centroids[cent] = members.mean(axis=0)

        if np.array_equal(centroids_pre, centroids):
            break
    return cluster, iterTime, centroids


# 计算分类纯度
def cal_accuracy(k, cluster=None, t=None):
    """Return the purity of a clustering against the true labels.

    For every cluster index in ``range(k)`` the most frequent true label
    among its members is taken as the cluster's label; purity is the
    fraction of all samples that carry the label of their cluster.

    Args:
        k: number of clusters.
        cluster: 2-D array whose column 0 holds each sample's cluster
            index. When omitted, the module-level ``cluster`` is used,
            which keeps the one-argument call ``cal_accuracy(k)`` working.
        t: true label of each sample. When omitted, the module-level
            ``t`` is used.

    Returns:
        The purity as a float in [0, 1]; 0.0 when there are no samples.
    """
    if cluster is None:
        cluster = globals()["cluster"]
    if t is None:
        t = globals()["t"]

    assigned = np.asarray(cluster)[:, 0]
    labels = np.asarray(t)
    if len(assigned) == 0:
        return 0.0

    matched = 0
    for i in range(k):
        members = labels[assigned == i]
        # An empty cluster contributes nothing; taking the maximum of an
        # empty label list would raise ValueError.
        if members.size == 0:
            continue
        _, counts = np.unique(members, return_counts=True)
        # Samples carrying the majority label count as correctly grouped.
        matched += int(counts.max())
    return matched / len(assigned)


def draw(data, t):
    """Scatter-plot columns 0 and 1 of ``data`` coloured by true label.

    Samples labelled 1, 2 and 3 are drawn in red, green and blue on
    figure 1. The figure is saved as './iris_kmeans(yuanshi)' and shown.
    """
    # NOTE(review): the axis labels read "petal length" / "petal width"
    # while columns 0 and 1 are plotted - confirm the column order of
    # the data file.
    groups = (
        (1, 'r', '类别一'),
        (2, 'g', '类别二'),
        (3, 'blue', '类别三'),
    )
    subsets = [data[t == code] for code, _, _ in groups]

    plt.figure(1)
    for points, (_, colour, name) in zip(subsets, groups):
        plt.scatter(points[:, 0], points[:, 1], c=colour, marker='o', label=name)
    plt.xlabel('花瓣长度')
    plt.ylabel('花瓣宽度')
    plt.title('花瓣长度和花瓣宽度特征之间的散点图(真实数据)')
    # loc=2 places the legend in the upper-left corner.
    plt.legend(loc=2)
    # SimHei is selected so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.savefig('./iris_kmeans(yuanshi)')
    plt.show()


def draw_pre(cluster, data, centroids):
    """Scatter-plot columns 0 and 1 of ``data`` coloured by predicted cluster.

    Clusters 0, 1 and 2 are drawn in red, green and blue on figure 2 and
    the centroids are marked with black crosses. The figure is saved as
    './iris_kmeans(yuce)' and shown.

    Args:
        cluster: 2-D array whose column 0 holds each sample's cluster index.
        data: 2-D array of samples, one row per sample.
        centroids: 2-D array of cluster centres, one row per cluster.
    """
    assigned = np.asarray(cluster)[:, 0]
    data = np.asarray(data)

    # SimHei is selected up front so the Chinese labels can be rendered.
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.figure(2)
    for index, colour, name in ((0, 'r', '类别一'),
                                (1, 'g', '类别二'),
                                (2, 'b', '类别三')):
        # Boolean-mask selection stays 2-D even when a cluster has no
        # members, so the column slices below cannot raise IndexError
        # the way slicing an empty 1-D array would.
        points = data[assigned == index]
        plt.scatter(points[:, 0], points[:, 1], c=colour, marker='o', label=name)
    plt.scatter(centroids[:, 0], centroids[:, 1], c='black', marker='x')
    plt.xlabel('花瓣长度')
    plt.ylabel('花瓣宽度')
    plt.title('花瓣长度和花瓣宽度特征之间的散点图(预测数据)')
    # loc=2 places the legend in the upper-left corner.
    plt.legend(loc=2)
    plt.savefig('./iris_kmeans(yuce)')
    plt.show()


if __name__ == '__main__':
    data, t = load_dataset()
    k = 3
    cluster, iterTime, centroids = kMeans(data, k)
    # Plot the true labels next to the clustering of this first run.
    draw(data, t)
    draw_pre(cluster, data, centroids)

    # k-means depends on its random start, so the statistics are averaged
    # over several independent runs.
    runs = 10
    sum_iterTime = 0
    sum_accuracy = 0
    for i in range(runs):
        # cal_accuracy(k) reads the module-level ``cluster``, so each
        # run's result has to be bound to that name.
        cluster, iterTime, centroids = kMeans(data, k)
        accuracy = cal_accuracy(k)
        sum_iterTime += iterTime
        sum_accuracy += accuracy
    # The format strings need their "{}" placeholders; without them
    # str.format returns the template unchanged and no number is printed.
    print("平均迭代次数为:", "{}".format(sum_iterTime / runs))
    print("平均分类纯度为:", "{:.2%}".format(sum_accuracy / runs))

FCM

import numpy as np

# 正常导入数据
def load_dataset():
    """Read ./iris.txt and return ``(features, labels)``.

    The file is parsed as comma-separated text. Columns 0-3 become the
    feature matrix; column 4 holds the species name, which is turned into
    a numeric code: 1 for 'setosa', 2 for 'versicolor', 3 for 'virginica'
    and 0 for any other name.
    """
    path = './iris.txt'
    # The species column is read as text, the feature columns as floats.
    species = np.genfromtxt(path, delimiter=',', usecols=(4), dtype=str)
    features = np.genfromtxt(path, delimiter=',', usecols=(0, 1, 2, 3))

    codes = np.zeros(len(species))
    for code, name in enumerate(('setosa', 'versicolor', 'virginica'), start=1):
        codes[species == name] = code
    return features, codes


def osdistance(vecA, vecB):
    """Return the Euclidean distance between ``vecA`` and ``vecB``."""
    squared_diff = np.power(vecA - vecB, 2)
    # The built-in sum adds the squared components one after another.
    total = sum(squared_diff)
    return np.sqrt(total)

# 初始化U矩阵
def initmatU(m, c):
    """Build a random ``m``-by-``c`` membership matrix whose rows sum to 1.

    Entries are drawn with ``np.random.uniform(0, 1, ...)`` and every row
    is then divided by its own total, so each sample's memberships over
    the ``c`` clusters add up to one.
    """
    mat_u = np.random.uniform(0, 1, (m, c))
    for row in mat_u:
        # ``row`` is a view into mat_u, so the in-place division below
        # normalises the matrix itself.
        total = 0
        for value in row:
            total += value
        row /= total
    return mat_u


def FCMtrain(data, c, alpha, theta):
    """Cluster ``data`` with the fuzzy C-means algorithm.

    Starting from a random membership matrix whose rows sum to 1, every
    pass recomputes the cluster centres as membership-weighted means,
    evaluates the cost ``sum_ij u_ij**alpha * d_ij**2`` and, unless the
    cost changed by less than ``theta`` since the previous pass, updates
    the memberships with
    ``u_ij = 1 / sum_k (d_ij / d_ik) ** (2 / (alpha - 1))``.

    Args:
        data: 2-D array-like of shape (samples, features).
        c: number of clusters.
        alpha: fuzzifier exponent, must be greater than 1.
        theta: convergence threshold on the absolute change of the cost.

    Returns:
        A tuple ``(c_list, pred, iterTime)``: the (c, features) array of
        cluster centres, a list with the index of the highest-membership
        cluster for every sample, and the number of membership updates
        performed.

    Raises:
        ValueError: if ``alpha`` is not greater than 1.
    """
    if alpha <= 1:
        raise ValueError("alpha must be greater than 1")
    data = np.asarray(data, dtype=float)
    m = len(data)
    exponent = 2 / (alpha - 1)
    # Distances are floored at machine epsilon in the membership update
    # so a sample lying exactly on a centre does not produce 0/0 = NaN,
    # which would poison the cost and keep the loop from ever stopping.
    tiny = np.finfo(float).eps

    # Random initial memberships, each row normalised to sum to 1.
    mat_u = np.random.uniform(0, 1, (m, c))
    mat_u /= mat_u.sum(axis=1, keepdims=True)

    iterTime = 0
    last_cost = 0
    while True:
        # Centres: means of the samples weighted by u_ij ** alpha.
        weights = mat_u ** alpha
        c_list = (weights.T @ data) / weights.sum(axis=0)[:, np.newaxis]

        # dist[i, j] is the distance from sample i to centre j.
        dist = np.linalg.norm(
            data[:, np.newaxis, :] - c_list[np.newaxis, :, :], axis=2
        )
        cost = float(np.sum(weights * dist ** 2))
        if abs(last_cost - cost) < theta:
            break
        last_cost = cost

        safe = np.maximum(dist, tiny)
        # ratio[i, j, k] = (d_ij / d_ik) ** exponent. An overflow to inf
        # only drives the matching membership to 0, which is the limit.
        with np.errstate(over="ignore"):
            ratio = (safe[:, :, np.newaxis] / safe[:, np.newaxis, :]) ** exponent
        mat_u = 1.0 / ratio.sum(axis=2)
        # Rows already sum to 1 up to rounding; renormalise to remove it.
        mat_u /= mat_u.sum(axis=1, keepdims=True)
        iterTime += 1

    # Each sample is assigned to the cluster with the largest membership.
    pred = list(np.argmax(mat_u, axis=1))
    return c_list, pred, iterTime


# 计算分类纯度
def cal_accuracy(c, pred, t=None):
    """Return the purity of the predicted clustering against the true labels.

    For every cluster index in ``range(c)`` the most frequent true label
    among its members is taken as the cluster's label; purity is the
    fraction of all samples that carry the label of their cluster.

    Args:
        c: number of clusters.
        pred: predicted cluster index of each sample.
        t: true label of each sample. When omitted, the module-level
            ``t`` is used, which keeps the two-argument call
            ``cal_accuracy(c, pred)`` working.

    Returns:
        The purity as a float in [0, 1]; 0.0 when there are no samples.
    """
    if t is None:
        t = globals()["t"]

    assigned = np.asarray(pred)
    labels = np.asarray(t)
    if len(assigned) == 0:
        return 0.0

    matched = 0
    for i in range(c):
        members = labels[assigned == i]
        # An empty cluster contributes nothing; taking the maximum of an
        # empty label list would raise ValueError.
        if members.size == 0:
            continue
        _, counts = np.unique(members, return_counts=True)
        # Samples carrying the majority label count as correctly grouped.
        matched += int(counts.max())
    return matched / len(assigned)


if __name__ == '__main__':
    data, t = load_dataset()
    c = 3
    alpha = 6
    theta = 0.001
    # FCM starts from a random membership matrix, so the statistics are
    # averaged over several independent runs.
    runs = 10
    sum_iterTime = 0
    sum_accuracy = 0
    # Stand-alone run kept from the original script; its result is only
    # consumed by the optional debug print below.
    c_list, pred, iterTime = FCMtrain(data, c, alpha, theta)
    # print(c_list)
    for i in range(runs):
        c_list, pred, iterTime = FCMtrain(data, c, alpha, theta)
        accuracy = cal_accuracy(c, pred)
        sum_iterTime += iterTime
        sum_accuracy += accuracy
    # The format strings need their "{}" placeholders; without them
    # str.format returns the template unchanged and no number is printed.
    print("平均迭代次数为:", "{}".format(sum_iterTime / runs))
    print("平均分类纯度为:", "{:.2%}".format(sum_accuracy / runs))

以上是关于模式识别实验三:K均值算法和模糊C均值算法的主要内容,如果未能解决你的问题,请参考以下文章

“k 均值”和“模糊 c 均值”目标函数有啥区别?

第三节3:类K-Means算法之模糊K-均值算法(FCM算法)

模糊 K 均值聚类算法

K均值聚类算法

模糊c–均值聚类算法的原理解释及推导

模糊c–均值聚类算法的原理解释及推导