聚类之k-means附代码
Posted hrnn
tags:
篇首语：本文由小常识网（cha138.com）小编为大家整理，主要介绍了“聚类之k-means附代码”相关的知识，希望对你有一定的参考价值。
import os
import sys as sys
#reload(sys)
#sys.setdefaultencoding(‘utf-8‘)
from sklearn.cluster import KMeans
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
import numpy as np
def tfidf_vector(corpus_path):
    """Build a TF-IDF matrix from a labelled corpus file.

    Every usable line of the file holds exactly two fields separated by a
    single space: a category label followed by the document text.  Lines
    that do not split into exactly two fields are skipped.

    Args:
        corpus_path: Path of the corpus file to read (decoded as UTF-8).

    Returns:
        A ``(tfidf_train, word_dict)`` tuple.  ``tfidf_train`` is the
        TF-IDF weighted matrix computed from the ``CountVectorizer`` term
        counts; ``word_dict`` maps each feature column index to its
        vocabulary term.
    """
    corpus_train = []
    # The context manager closes the file even if reading raises.
    with open(corpus_path, encoding="utf-8") as corpus_file:
        for line in corpus_file:
            # NOTE(review): the separator is a single space in this source;
            # if the corpus is actually tab-separated this must be '\t'.
            fields = line.strip().split(' ')
            if len(fields) == 2:
                # fields[0] is the category label; it is not needed here.
                corpus_train.append(fields[1])
    print("build train-corpus done!!")

    count_v1 = CountVectorizer(max_df=0.4, min_df=0.01)
    counts_train = count_v1.fit_transform(corpus_train)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on releases that predate it.
    get_names = getattr(count_v1, "get_feature_names_out", None)
    if get_names is None:
        get_names = count_v1.get_feature_names
    word_dict = dict(enumerate(get_names()))

    print("the shape of train is ")
    print(repr(counts_train.shape))

    tfidf_train = TfidfTransformer().fit_transform(counts_train)
    return tfidf_train, word_dict
def best_kmeans(tfidf_matrix, word_dict):
    """Plot an elbow curve to help choose the number of k-means clusters.

    For each k from 1 to 9 a ``KMeans`` model is fitted on ``tfidf_matrix``
    and the mean Euclidean distance from every sample to its nearest
    cluster centre is recorded.  The values are then plotted against k and
    the figure is shown.

    Args:
        tfidf_matrix: Feature matrix to cluster; it must provide
            ``toarray()`` and ``shape`` (both are used below).
        word_dict: Unused; kept so existing callers keep working.

    Returns:
        None.  The function only prints progress and displays a plot.
    """
    K = range(1, 10)
    # The dense copy and the sample count do not depend on k, so compute
    # them once instead of on every iteration.
    dense_matrix = tfidf_matrix.toarray()
    n_samples = tfidf_matrix.shape[0]

    meandistortions = []
    for k in K:
        # Progress banner; a single print call so both parts are shown.
        print(k, '****' * 5)
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(tfidf_matrix)
        # Distance of each sample to the closest of the k centres.
        nearest = np.min(
            cdist(dense_matrix, kmeans.cluster_centers_, 'euclidean'),
            axis=1,
        )
        meandistortions.append(nearest.sum() / n_samples)

    plt.plot(K, meandistortions, 'bx-')
    plt.grid(True)
    plt.xlabel('Number of clusters')
    # NOTE(review): the plotted value is a mean distance, not a sum of
    # squares; the label text is left as it was.
    plt.ylabel('Average within-cluster sum of squares')
    plt.title('Elbow for Kmeans clustering')
    plt.show()
# Script driver: vectorise the corpus, show the elbow plot, then cluster.
# Input corpus path handed to tfidf_vector().
corpus_train = "corpus_train.txt"
# File names passed to cluster_kmeans() below.
# NOTE(review): presumably output paths for per-document and per-keyword
# results - confirm against cluster_kmeans.
cluster_docs = "cluster_result_document.txt"
cluster_keywords = "cluster_result_keyword.txt"
# Cluster count passed to cluster_kmeans().
num_clusters = 7
tfidf_train,word_dict=tfidf_vector(corpus_train)
best_kmeans(tfidf_train,word_dict)
# NOTE(review): cluster_kmeans is not defined in the visible part of this
# file; confirm it exists elsewhere, otherwise this line raises NameError.
cluster_kmeans(tfidf_train,word_dict,cluster_docs,cluster_keywords,num_clusters)
以上是关于“聚类之k-means附代码”的主要内容，如果未能解决你的问题，请参考以下文章：
机器学习算法精讲20篇-k-means聚类算法应用案例(附示例代码)
数学建模MATLAB应用实战系列(106)-机器学习算法:K-means聚类(附MATLAB代码)
目标检测K-means和K-means++计算anchors结果比较(附完整代码,全网最详细的手把手教程)