模式识别实验三:K均值算法和模糊C均值算法
Posted zstar-_
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了模式识别实验三:K均值算法和模糊C均值算法相关的知识,希望对你有一定的参考价值。
本文采用了sonar和Iris数据集,完整的程序代码、实验报告pdf和数据集可以戳下面的链接下载。
Link:https://download.csdn.net/download/qq1198768105/71411278
实验报告图片版
程序代码
以Iris数据集为例:
k-means
import numpy as np
import matplotlib.pyplot as plt
import random
# 正常导入数据
def load_dataset(path='./iris.txt'):
    """Load a comma-separated Iris file and encode species names as numbers.

    Args:
        path: Location of the data file. Defaults to ``'./iris.txt'``, the
            path that used to be hard-coded, so calls without arguments
            behave exactly as before.

    Returns:
        A ``(data, t)`` tuple. ``data`` holds columns 0-3 parsed as floats.
        ``t`` is a float array holding 1 for ``'setosa'``, 2 for
        ``'versicolor'``, 3 for ``'virginica'`` and 0 for any other value
        found in column 4.
    """
    data = np.genfromtxt(path, delimiter=',', usecols=(0, 1, 2, 3))
    target = np.genfromtxt(path, delimiter=',', usecols=(4), dtype=str)
    t = np.zeros(len(target))
    species_order = ('setosa', 'versicolor', 'virginica')
    for code, species in enumerate(species_order, start=1):
        t[target == species] = code
    return data, t
# 随机初始化k个聚类中心,从样本中随机选取
def randChosenCent(data, k):
    """Pick k distinct samples at random to act as initial cluster centres.

    Row indices are drawn without replacement by ``random.sample`` from
    ``range(0, data.shape[0])``; the rows at those indices are returned as a
    list, in the order they were drawn.
    """
    sample_count = data.shape[0]
    picked_rows = random.sample(range(0, sample_count), k)
    return [data[row] for row in picked_rows]
def osdistance(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Both inputs are converted to float arrays first, so plain sequences are
    accepted and unsigned-integer inputs cannot wrap around when subtracted.
    The squared differences are summed along axis 0, which matches what the
    builtin ``sum`` did for the array inputs this function previously
    required.
    """
    diff = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
    return np.sqrt(np.sum(diff ** 2, axis=0))
def kMeans(data, k):
    """Cluster the rows of ``data`` into ``k`` groups with k-means.

    Initial centres come from ``randChosenCent``. Each pass assigns every
    sample to its nearest centre (via ``osdistance``) and then moves each
    centre to the mean of its members. Iteration stops once a pass leaves
    every centre unchanged.

    Args:
        data: 2-D array with one sample per row and any number of feature
            columns.
        k: Number of clusters.

    Returns:
        ``(cluster, iterTime, centroids)`` where ``cluster`` is an
        ``(m, 2)`` array of ``[cluster index, distance to that centre]`` per
        sample, ``iterTime`` is the number of passes performed and
        ``centroids`` is the ``(k, n_features)`` float array of final
        centres. A sample keeps index -1 if no centre is strictly closer
        than infinity (e.g. its distances are all NaN).
    """
    data = np.asarray(data)
    m = len(data)
    cluster = np.zeros((m, 2))
    # Force a float array so centre updates are not truncated for integer data.
    centroids = np.array(randChosenCent(data, k), dtype=float)
    iterTime = 0
    clusterChanged = True
    while clusterChanged:
        # Assignment step: nearest centre wins, first one on ties.
        for i in range(m):
            minDist = float('inf')
            minIndex = -1
            for j in range(k):
                distJI = osdistance(centroids[j], data[i])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            cluster[i, 0] = minIndex
            cluster[i, 1] = minDist
        iterTime += 1

        # Update step: move each centre to the mean of its members.
        centroids_pre = centroids.copy()
        for cent in range(k):
            members = data[cluster[:, 0] == cent]
            # An empty cluster keeps its previous centre. Dividing by a zero
            # member count would produce NaN, and since NaN != NaN the
            # convergence test below could then never succeed.
            if len(members) > 0:
                centroids[cent, :] = members.mean(axis=0)
        clusterChanged = not (centroids_pre == centroids).all()
    return cluster, iterTime, centroids
# 计算分类纯度
def cal_accuracy(k):
    """Return the clustering purity of the current k-means result.

    Reads the module-level names ``cluster`` (per-sample assignments, cluster
    index in column 0), ``t`` (true labels) and ``data`` (the samples). For
    each cluster index in ``range(k)`` the size of the most frequent true
    label among its members is added up; the total is divided by
    ``len(data)``.

    Clusters with no members contribute nothing instead of raising
    ``ValueError``.
    """
    assigned = np.asarray(cluster)[:, 0]
    labels = np.asarray(t)[:len(assigned)]
    matched = 0
    for i in range(k):
        members = labels[assigned == i]
        if members.size == 0:
            continue
        # Only the size of the majority label matters for purity.
        matched += int(np.unique(members, return_counts=True)[1].max())
    return matched / len(data)
def draw(data, t):
    """Scatter-plot the samples coloured by their true label and save the figure.

    Rows of ``data`` whose label in ``t`` equals 1, 2 or 3 are drawn as three
    series, plotting column 0 against column 1. The figure is written to
    './iris_kmeans(yuanshi)' and then shown.
    """
    # NOTE(review): the axis labels read "petal length/width"; confirm that
    # columns 0 and 1 of the data file really hold those features.
    series_style = (
        (1, 'r', '类别一'),
        (2, 'g', '类别二'),
        (3, 'blue', '类别三'),
    )
    groups = [data[t == code] for code, _, _ in series_style]
    plt.figure(1)
    for (_, colour, legend), pts in zip(series_style, groups):
        plt.scatter(pts[:, 0], pts[:, 1], c=colour, marker='o', label=legend)
    plt.xlabel('花瓣长度')
    plt.ylabel('花瓣宽度')
    plt.title('花瓣长度和花瓣宽度特征之间的散点图(真实数据)')
    plt.legend(loc=2)  # upper-left corner
    # Select SimHei for the Chinese text (assumes the font is installed).
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.savefig('./iris_kmeans(yuanshi)')
    plt.show()
def draw_pre(cluster, data, centroids):
    """Scatter-plot the samples coloured by predicted cluster and save the figure.

    Args:
        cluster: Per-sample assignments; column 0 holds the cluster index.
            Only indices 0, 1 and 2 are drawn.
        data: 2-D sample array; column 0 is plotted against column 1.
        centroids: 2-D array of cluster centres, drawn as black crosses.

    The figure is written to './iris_kmeans(yuce)' and then shown. Members
    are selected with boolean masks, so a cluster with no members yields an
    empty ``(0, n_features)`` slice and is simply drawn as an empty series
    rather than raising ``IndexError``.
    """
    assigned = np.asarray(cluster)[:, 0]
    data = np.asarray(data)
    series_style = (
        (0, 'r', '类别一'),
        (1, 'g', '类别二'),
        (2, 'b', '类别三'),
    )
    plt.figure(2)
    for index, colour, legend in series_style:
        pts = data[assigned == index]
        plt.scatter(pts[:, 0], pts[:, 1], c=colour, marker='o', label=legend)
    plt.scatter(centroids[:, 0], centroids[:, 1], c='black', marker='x')
    plt.xlabel('花瓣长度')
    plt.ylabel('花瓣宽度')
    plt.title('花瓣长度和花瓣宽度特征之间的散点图(预测数据)')
    plt.legend(loc=2)  # upper-left corner
    # Select SimHei for the Chinese text (assumes the font is installed).
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.savefig('./iris_kmeans(yuce)')
    plt.show()
if __name__ == '__main__':
    # cal_accuracy() reads the module-level names `data`, `t` and `cluster`,
    # so they must stay global and keep exactly these names.
    data, t = load_dataset()
    k = 3
    cluster, iterTime, centroids = kMeans(data, k)
    # Plot the true labels and one clustering result for comparison.
    draw(data, t)
    draw_pre(cluster, data, centroids)

    # Average iteration count and purity over several random restarts.
    runs = 10
    sum_iterTime = 0
    sum_accuracy = 0
    for _ in range(runs):
        cluster, iterTime, centroids = kMeans(data, k)
        accuracy = cal_accuracy(k)
        sum_iterTime += iterTime
        sum_accuracy += accuracy
    # The format strings need "{}" placeholders; without them str.format
    # ignores its argument and the results are never printed.
    print("平均迭代次数为:", "{}".format(sum_iterTime / runs))
    print("平均分类纯度为:", "{:.2%}".format(sum_accuracy / runs))
FCM
import numpy as np
# 正常导入数据
def load_dataset(path='./iris.txt'):
    """Load a comma-separated Iris file and encode species names as numbers.

    Args:
        path: Location of the data file. Defaults to ``'./iris.txt'``, the
            path that used to be hard-coded, so calls without arguments
            behave exactly as before.

    Returns:
        A ``(data, t)`` tuple. ``data`` holds columns 0-3 parsed as floats.
        ``t`` is a float array holding 1 for ``'setosa'``, 2 for
        ``'versicolor'``, 3 for ``'virginica'`` and 0 for any other value
        found in column 4.
    """
    data = np.genfromtxt(path, delimiter=',', usecols=(0, 1, 2, 3))
    target = np.genfromtxt(path, delimiter=',', usecols=(4), dtype=str)
    t = np.zeros(len(target))
    species_order = ('setosa', 'versicolor', 'virginica')
    for code, species in enumerate(species_order, start=1):
        t[target == species] = code
    return data, t
def osdistance(vecA, vecB):
    """Return the Euclidean distance between two vectors.

    Both inputs are converted to float arrays first, so plain sequences are
    accepted and unsigned-integer inputs cannot wrap around when subtracted.
    The squared differences are summed along axis 0, which matches what the
    builtin ``sum`` did for the array inputs this function previously
    required.
    """
    diff = np.asarray(vecA, dtype=float) - np.asarray(vecB, dtype=float)
    return np.sqrt(np.sum(diff ** 2, axis=0))
# 初始化U矩阵
def initmatU(m, c):
    """Create a random (m, c) membership matrix whose rows each sum to 1.

    Entries are drawn uniformly from [0, 1) and every row is then divided by
    its own total, so each sample's memberships over the c clusters add up
    to one.
    """
    memberships = np.random.uniform(0, 1, (m, c))
    for row in memberships:
        # `row` is a view into `memberships`, so this normalises in place.
        row /= sum(row)
    return memberships
def FCMtrain(data, c, alpha, theta):
    """Cluster the rows of ``data`` with fuzzy C-means.

    Starting from a random membership matrix (``initmatU``), each pass
    recomputes the centres as membership-weighted means, evaluates the cost
    ``sum(u_ij ** alpha * d_ij ** 2)`` and, unless the cost changed by less
    than ``theta`` since the previous pass, recomputes the memberships from
    the sample-to-centre distances.

    Args:
        data: 2-D array with one sample per row.
        c: Number of clusters.
        alpha: Fuzziness exponent; ``alpha == 1`` divides by zero in the
            membership update.
        theta: Stop once the absolute change in cost falls below this value.

    Returns:
        ``(c_list, pred, iterTime)``: the ``(c, n_features)`` array of
        centres, a list with the index of the highest membership for each
        sample, and the number of membership updates performed.
    """
    data = np.asarray(data, dtype=float)
    m = len(data)
    mat_u = initmatU(m, c)
    exponent = 2 / (alpha - 1)
    # Lower bound for distances. A sample lying exactly on a centre would
    # otherwise give 0/0 = NaN memberships, a NaN cost, and a stop test that
    # can never pass.
    min_dist = np.finfo(float).eps
    iterTime = 0
    last_cost = 0
    while True:
        # Centres: membership-weighted mean of the samples.
        weights = mat_u ** alpha  # shape (m, c)
        c_list = np.dot(weights.T, data) / weights.sum(axis=0)[:, np.newaxis]

        # dist[i, j] is the distance from sample i to centre j.
        offsets = data[:, np.newaxis, :] - c_list[np.newaxis, :, :]
        dist = np.sqrt((offsets ** 2).sum(axis=2))

        cost = np.sum(weights * dist ** 2)
        if abs(last_cost - cost) < theta:
            break
        last_cost = cost

        # Memberships: u_ij = 1 / sum_k (d_ij / d_ik) ** (2 / (alpha - 1)).
        dist = np.maximum(dist, min_dist)
        ratios = (dist[:, :, np.newaxis] / dist[:, np.newaxis, :]) ** exponent
        mat_u = 1 / ratios.sum(axis=2)
        # Renormalise so each sample's memberships sum to 1.
        mat_u /= mat_u.sum(axis=1, keepdims=True)
        iterTime += 1

    # The predicted cluster of a sample is the one with the largest membership.
    pred = list(np.argmax(mat_u, axis=1))
    return c_list, pred, iterTime
# 计算分类纯度
def cal_accuracy(c, pred):
    """Return the clustering purity of a set of predicted cluster indices.

    Reads the module-level names ``t`` (true labels) and ``data`` (the
    samples). For each cluster index in ``range(c)`` the size of the most
    frequent true label among the samples predicted into it is added up; the
    total is divided by ``len(data)``.

    Args:
        c: Number of clusters.
        pred: Predicted cluster index per sample, in sample order.

    Clusters with no members contribute nothing instead of raising
    ``ValueError``.
    """
    predicted = np.asarray(pred)
    labels = np.asarray(t)[:len(predicted)]
    matched = 0
    for i in range(c):
        members = labels[predicted == i]
        if members.size == 0:
            continue
        # Only the size of the majority label matters for purity.
        matched += int(np.unique(members, return_counts=True)[1].max())
    return matched / len(data)
if __name__ == '__main__':
    # cal_accuracy() reads the module-level names `data` and `t`, so they
    # must stay global and keep exactly these names.
    data, t = load_dataset()
    c = 3
    alpha = 6
    theta = 0.001

    # Average iteration count and purity over several random restarts. The
    # extra training run that used to precede this loop was dropped because
    # its result was never used.
    runs = 10
    sum_iterTime = 0
    sum_accuracy = 0
    for _ in range(runs):
        c_list, pred, iterTime = FCMtrain(data, c, alpha, theta)
        accuracy = cal_accuracy(c, pred)
        sum_iterTime += iterTime
        sum_accuracy += accuracy
    # The format strings need "{}" placeholders; without them str.format
    # ignores its argument and the results are never printed.
    print("平均迭代次数为:", "{}".format(sum_iterTime / runs))
    print("平均分类纯度为:", "{:.2%}".format(sum_accuracy / runs))
以上是关于模式识别实验三:K均值算法和模糊C均值算法的主要内容,如果未能解决你的问题,请参考以下文章