How to graph kmeans?

【Question】(posted 2020-07-19 01:04:03)

I am working with a dataset and trying to learn K-means clustering. I am using the following code:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Create Points to cluster
Points = pd.DataFrame()
Points.loc[:,0] = [243,179,152,255,166,162,233,227,204,341,283,202,217,197,191,114,
      153,215,196,187,127,85,182,172,184,252,193,191,187,193,197,200,
      186,188,155,-99,22,68,167,-75,30,49,63,45,58,52,164,51,49,68,52,43,68,
      72,-51,59,56,-127,33,68,143,-26,-85,84,11,105,62,47,-75,2,67,-41,-33,
      10,28,23,34,19,13,6,-73,155,30]
Points.loc[:,1] = [2.1,4,2.6,2.1,2.5,0.4,0.3,4.9,1.1,1,-1.5,3.3,2.2,1.9,2.4,2.2,0.9,
      1.8,1.7,3.2,2.4,4.4,1.4,4.4,2.6,0.6,2.9,3.8,2.6,8.5,8.8,7.5,8.3,8.5,
      3.5,6.3,-1.4,-0.4,3,-5.2,-2.7,-3.2,-0.8,-3.9,-0.6,0.9,-5.1,-2.2,
      -0.3,-1.2,0.1,-2.1,-2.1,3.7,11.8,0,0,-6.6,-1,10.1,11.9,-3,-22,-18.2,-13.3,
      -8.4,-21.7,-16.7,-13.8,-13.9,-13.2,-14.9,-21.6,-16.4,-14.4,-15.8,
      -15.3,-15.3,-2.7,-13.2,-8.9,-3.3,-12.9]

# Create initial cluster centroids
ClusterCentroidGuesses = pd.DataFrame()
ClusterCentroidGuesses.loc[:,0] = [100, 200, 0]
ClusterCentroidGuesses.loc[:,1] = [2, -2, 0]

def Plot2DKMeans(Points, Labels, ClusterCentroids, Title):
    for LabelNumber in range(max(Labels)+1):
        LabelFlag = Labels == LabelNumber
        color =  ['c', 'm', 'y', 'b', 'g', 'r', 'c', 'm', 'y', 
                  'b', 'g', 'r', 'c', 'm', 'y'][LabelNumber]
        marker = ['s', 'o', 'v', '^', '<', '>', '8', 'p', '*', 
                  'h', 'H', 'D', 'd', 'P', 'X'][LabelNumber]
        plt.scatter(Points.loc[LabelFlag,0], Points.loc[LabelFlag,1],
                    s= 100, c=color, edgecolors="black", alpha=0.3, marker=marker)
        plt.scatter(ClusterCentroids.loc[LabelNumber,0],
                    ClusterCentroids.loc[LabelNumber,1], 
                    s=200, c="black", marker=marker)
    plt.title(Title)
    plt.show()

def KMeansNorm(Points, ClusterCentroidGuesses, NormD1, NormD2):
    PointsNorm = Points.copy()
    ClusterCentroids = ClusterCentroidGuesses.copy()
    if NormD1:
        # Determine mean of 1st dimension
        mean1 = np.mean(PointsNorm[:,0])
        # Determine standard deviation of 1st dimension
        std1 = np.std(PointsNorm[:,0])
        # Normalize 1st dimension of Points
        PointsNorm[:,0] = ((PointsNorm[:,0] - mean1)/std1) 
        # Normalize 1st dimension of ClusterCentroids
        Cmean1 = np.mean(ClusterCentroids[:,0])
        Cstd1 = np.std(ClusterCentroids[:,0])
        ClusterCentroids[:,0] = ((ClusterCentroids[:,0] - Cmean1)/Cstd1)
    if NormD2:
        # Determine mean of 2nd dimension
        mean2 = np.mean(PointsNorm[:,1])
        # Determine standard deviation of 2nd dimension
        std2 = np.std(PointsNorm[:,1])
        # Normalize 2nd dimension of Points
        PointsNorm[:,1] = ((PointsNorm[:,1] - mean2)/std2) 
        # Normalize 2nd dimension of ClusterCentroids
        Cmean2 = np.mean(ClusterCentroids[:,1])
        Cstd2 = np.std(ClusterCentroids[:,1])
        ClusterCentroids[:,1] = ((ClusterCentroids[:,1] - Cmean2)/Cstd2)
    # Do actual clustering
    kmeans = KMeans(n_clusters=3, init=ClusterCentroidGuesses, n_init=1).fit(PointsNorm)
    Labels = kmeans.labels_
    ClusterCentroids = pd.DataFrame(kmeans.cluster_centers_)
    if NormD1:
        # Denormalize 1st dimension
        PointsNorm[:,0] = PointsNorm[:,0]*std1+mean1
        ClusterCentroids[:,0] = ClusterCentroids[:0]*Cstd1+Cmean1
    if NormD2:
        # Denormalize 2nd dimension
        PointsNorm[:,1] = PointsNorm[:,1]*std2+mean2
        ClusterCentroids[:,1] = ClusterCentroids[:1]*Cstd2+Cmean2
    return Labels, ClusterCentroids

# Compare distributions of the two dimensions
plt.rcParams["figure.figsize"] = [6.0, 4.0] # Standard
plt.hist(Points.loc[:,0], bins = 20, color=[0, 0, 1, 0.5])
plt.hist(Points.loc[:,1], bins = 20, color=[1, 1, 0, 0.5])
plt.title("Compare Distributions")
plt.show()

# Change the plot dimensions
plt.rcParams["figure.figsize"] = [8, 8] # Square
# plt.rcParams["figure.figsize"] = [8, 0.5] # Wide
# plt.rcParams["figure.figsize"] = [0.5, 8] # Tall

# Cluster without normalization
# Are the points separated into clusters along one or both dimensions?
# Which dimension separates the points into clusters?
# Set Normalizations
NormD1=False
NormD2=False
Labels, ClusterCentroids = KMeansNorm(Points, ClusterCentroidGuesses, NormD1, NormD2)
Title = 'No Normalization'
Plot2DKMeans(Points, Labels, ClusterCentroids, Title)

# Set Normalizations
NormD1=True
NormD2=False
Labels, ClusterCentroids = KMeansNorm(Points, ClusterCentroidGuesses, NormD1, NormD2)
Title = 'Normalize Dimension 1'
Plot2DKMeans(Points, Labels, ClusterCentroids, Title)

When I try to plot with NormD1=True, I get this error:

TypeError: '(slice(None, None, None), 0)' is an invalid key

Can someone help me understand where I went wrong?

【Comments】

You should replace PointsNorm[:,0] with PointsNorm[0], because PointsNorm is a DataFrame, not a numpy array. The same goes for ClusterCentroids. Also, ClusterCentroids[:0] doesn't make sense; it produces an empty DataFrame. Did you perhaps mean ClusterCentroids[0]?
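To see the difference the comment is pointing at, here is a minimal sketch (toy data and variable names are mine, not from the assignment) contrasting numpy-style indexing with DataFrame indexing:

import numpy as np
import pandas as pd

arr = np.array([[243, 2.1], [179, 4.0], [152, 2.6]])   # toy 2-column data
df = pd.DataFrame(arr)                                  # columns are labeled 0 and 1

col_np = arr[:, 0]      # numpy: all rows, column 0 -- works on an ndarray
# df[:, 0]              # on a DataFrame this raises a TypeError like the one in the question
col_lbl = df[0]         # select column 0 by its label
col_loc = df.loc[:, 0]  # equivalent label-based selection
rows = df[:0]           # note: df[:0] is a row slice -> an empty DataFrame, not column 0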

【Answer 1】

It looks like you may have over-engineered this! Or perhaps you are deliberately working through the mechanics of KMeans. Let's simplify it and get it working; you can then extrapolate from the simple version to something more complex. Here is a basic example to get you started.

# K-MEANS CLUSTERING
# Importing Modules
from sklearn import datasets
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D

# Loading dataset
iris_df = datasets.load_iris()

# Declaring Model
model = KMeans(n_clusters=3)

# Fitting Model
model.fit(iris_df.data)

# Predicting a single input
predicted_label = model.predict([[7.2, 3.5, 0.8, 1.6]])

# Prediction on the entire data
all_predictions = model.predict(iris_df.data)

# Printing Predictions
print(predicted_label)
print(all_predictions)


# Import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :3]  # we only take the first three features
y = iris.target

# 3D scatter of the first three features, colored by the KMeans predictions
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter(X[:, 0], X[:, 1], X[:, 2],
           c=all_predictions, edgecolor='red', s=40, alpha=0.5)
ax.set_title("First three iris features")
ax.set_xlabel(iris.feature_names[0])
ax.set_ylabel(iris.feature_names[1])
ax.set_zlabel(iris.feature_names[2])
plt.show()

Personally, I think 3D charts are better for presenting KMeans data points. 2D charts sometimes work well, but they can lack detail and therefore misrepresent what the dataset really looks like. Finally, the dataset should partition reasonably well to begin with, otherwise you may get some very strange results!
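If a 2D chart is enough, here is a minimal sketch along the same lines (reusing the iris data; the names X2, km2, labels2d are mine) that colors the points by cluster and overlays the fitted centroids:

from sklearn import datasets
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

iris = datasets.load_iris()
X2 = iris.data[:, :2]                     # first two iris features only

km2 = KMeans(n_clusters=3, n_init=10).fit(X2)
labels2d = km2.labels_
centers = km2.cluster_centers_

plt.figure(figsize=(8, 8))
plt.scatter(X2[:, 0], X2[:, 1], c=labels2d, s=40, alpha=0.5, edgecolor='k')
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200, marker='X')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
plt.title("KMeans clusters and centroids (2D)")
plt.show()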

【Discussion】

Thanks, the code I posted is over-engineered; it's for an ungraded assignment. There is a lot going on and I'm trying to work through the problems. For the code you posted, how many clusters does it use? From the points it looks like there are 5 distinct clusters; how would you input the centroids?

Actually, now that I look at it, I don't see a specific number of clusters defined. It works anyway. Take a look at this link, I think it's a nice clustering experiment: pythonforfinance.net/2018/02/08/…
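For the follow-up question about the number of clusters and the starting centroids: scikit-learn's KMeans takes the cluster count through n_clusters and, if you want to start from your own guesses, an array of initial centroids through init (with n_init=1 so only that initialization is used). A minimal sketch, with 5 clusters and made-up centroids purely for illustration:

import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans

iris = datasets.load_iris()
X = iris.data[:, :2]   # two features, so each centroid guess has two coordinates

# Hypothetical starting centroids: one row per cluster, one column per feature
initial_centroids = np.array([[4.5, 2.5],
                              [5.0, 3.5],
                              [5.8, 2.7],
                              [6.5, 3.0],
                              [7.5, 3.5]])

model = KMeans(n_clusters=5, init=initial_centroids, n_init=1).fit(X)
print(model.labels_[:10])       # cluster assignment of the first 10 points
print(model.cluster_centers_)   # fitted centroids, one row per cluster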
