k-Nearest Neighbors (kNN) Study Notes
Posted by 周虽旧邦其命维新
1. Demonstrating kNN with simulated data
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from collections import Counter
# kNN idea: if most of a sample's k nearest neighbors in feature space belong to one class, the sample is assigned to that class as well
# raw_data_x: raw feature set; raw_data_y: label set
raw_data_x = [[3.393533211, 2.331273381],
[3.110073483, 1.781539638],
[1.343808831, 3.368360954],
[3.582294042, 4.679779110],
[2.280362439, 2.866990263],
[7.423436942, 4.696522875],
[5.745015997, 3.533989803],
[9.172168622, 2.511101045],
[7.792783487, 3.424088941],
[7.9939820917, 0.791637231]
]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
x_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
x = np.array([8.093607318, 3.365731514])
plt.scatter(x_train[y_train==0, 0], x_train[y_train==0, 1], color='r')
plt.scatter(x_train[y_train==1, 0], x_train[y_train==1, 1], color='b')
plt.scatter(x[0], x[1], color='y')
# plt.show()
# distances = []
# for data in x_train:
#     # distance from each training point to the new point x
#     distance = sqrt(np.sum((data - x) ** 2))
#     distances.append(distance)
# the same computation as a list comprehension: distances from every training point to x
distances = [sqrt(np.sum((data - x) ** 2)) for data in x_train]
print(distances)
# np.argsort returns the indices that would sort the array; here it orders the training points by their distance to x
nearest = np.argsort(distances)
print(nearest)
k = 6
# labels of the k training points nearest to the new point x (nearest[:k] holds their indices)
top_K = [y_train[i] for i in nearest[:k]]
print(top_K)
# Counter counts occurrences: each element becomes a dict key, its count the value
print(Counter(top_K))
votes = Counter(top_K)
print(votes.most_common(1))
# Counter's most_common(n) returns the n most frequent elements
# as a list of (element, count) tuples
result = votes.most_common(1)[0][0]
# result is the class kNN predicts for the new point x
print(result)
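The loop and list comprehension above can also be vectorized. A minimal sketch, reusing the x_train, y_train, x, and k defined above; np.linalg.norm computes all row-wise Euclidean distances in one call:
distances = np.linalg.norm(x_train - x, axis=1)
# vote among the labels of the k nearest training points
result = Counter(y_train[np.argsort(distances)[:k]]).most_common(1)[0][0]
print(result)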
2. Using the kNN implementation in scikit-learn
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# raw_data_x: raw feature set; raw_data_y: label set
raw_data_x = [[3.393533211, 2.331273381],
[3.110073483, 1.781539638],
[1.343808831, 3.368360954],
[3.582294042, 4.679779110],
[2.280362439, 2.866990263],
[7.423436942, 4.696522875],
[5.745015997, 3.533989803],
[9.172168622, 2.511101045],
[7.792783487, 3.424088941],
[7.9939820917, 0.791637231]
]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
x = np.array([8.093607318, 3.365731514])
# construct the kNN classifier object
knnClassfier = KNeighborsClassifier(n_neighbors=6)
# fit the kNN model
knnClassfier.fit(X_train, y_train)
# predict expects a 2D array of samples, hence the [x]
print(knnClassfier.predict([x]))
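To inspect which training samples actually voted, KNeighborsClassifier also provides a kneighbors method. A quick check using the classifier fitted above:
distances, indices = knnClassfier.kneighbors([x])
print(distances)            # distances from x to its 6 nearest training points
print(indices)              # their row indices in X_train
print(y_train[indices[0]])  # the labels that take part in the vote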
3. Implementing kNN by hand
Write KNN.py to mimic the prediction workflow of scikit-learn's kNN:
import numpy as np
from math import sqrt
from collections import Counter
# Note: Python is indentation-sensitive; a def with no indentation would sit outside the class
class MyKNeighborsClassfier:
    def __init__(self, n_neighbors):
        assert n_neighbors >= 1, "n_neighbors must be valid"
        self.k = n_neighbors
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        assert X_train.shape[0] == y_train.shape[0], "the size of X_train must equal the size of y_train"
        self.X_train = X_train
        self.y_train = y_train
        return self

    def predict(self, X_predict):
        # X_predict is a 2D array of samples; predict each row separately
        return np.array([self.__predict(x) for x in X_predict])

    def __predict(self, x):
        # distance from the single sample x to every training point
        distances = [sqrt(np.sum((x_train - x) ** 2)) for x_train in self.X_train]
        nearest = np.argsort(distances)
        # labels of the k nearest training points
        top_K = [self.y_train[i] for i in nearest[:self.k]]
        votes = Counter(top_K)
        y_predict = votes.most_common(1)[0][0]
        return y_predict
Use the custom classifier to predict the class of a new point (same simulated data as above):
import numpy as np
from KNN import MyKNeighborsClassfier
# raw_data_x: raw feature set; raw_data_y: label set
raw_data_x = [[3.393533211, 2.331273381],
[3.110073483, 1.781539638],
[1.343808831, 3.368360954],
[3.582294042, 4.679779110],
[2.280362439, 2.866990263],
[7.423436942, 4.696522875],
[5.745015997, 3.533989803],
[9.172168622, 2.511101045],
[7.792783487, 3.424088941],
[7.9939820917, 0.791637231]
]
raw_data_y = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
X_train = np.array(raw_data_x)
y_train = np.array(raw_data_y)
x = np.array([8.093607318, 3.365731514])
kNeighborsClassfier = MyKNeighborsClassfier(n_neighbors=6)
kNeighborsClassfier.fit(X_train, y_train)
print(kNeighborsClassfier.predict(np.array([x])))  # predict expects a 2D array of samples
4. Simulating train/test splitting
Write model_selection.py to mimic scikit-learn's dataset splitting:
import numpy as np
class TrainTestSplit:
    @staticmethod
    def train_test_split(x, y, ratio=0.2, seed=None):
        if seed:
            np.random.seed(seed)
        # shuffled indices of the whole data set
        shuffle_index = np.random.permutation(len(x))
        test_size = int(len(x) * ratio)
        # the first test_size shuffled indices form the test set; the rest form the training set
        test_indexs = shuffle_index[:test_size]
        train_indexs = shuffle_index[test_size:]
        x_train = x[train_indexs]
        y_train = y[train_indexs]
        x_test = x[test_indexs]
        y_test = y[test_indexs]
        return x_train, y_train, x_test, y_test
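scikit-learn ships the same functionality as sklearn.model_selection.train_test_split, used in later sections; note that it takes test_size instead of ratio and returns the splits in the order X_train, X_test, y_train, y_test.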
Test:
import numpy as np
from model_selection import TrainTestSplit
from sklearn import datasets
from KNN import MyKNeighborsClassfier
iris = datasets.load_iris()
x_train,y_train,x_test,y_test = TrainTestSplit.train_test_split(x=iris.data, y=iris.target)
# print(x_train.shape, y_train.shape)
# print(x_test.shape, y_test.shape)
# print(x_test)
# print(y_test)
# print(x_train)
# print(y_train)
knnClassfier = MyKNeighborsClassfier(n_neighbors=6)
knnClassfier.fit(X_train=x_train,y_train=y_train)
result = knnClassfier.predict(x_test)
print(sum(result == y_test))
5. Wrapping an accuracy function
Write metrics.py:
import numpy as np
def accuracy_score(y_true, y_predict):
    return sum(y_true == y_predict) / len(y_true)
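A quick sanity check (the element-wise == comparison requires numpy arrays):
import numpy as np
from metrics import accuracy_score
y_true = np.array([0, 1, 1, 0])
y_pred = np.array([0, 1, 0, 0])
print(accuracy_score(y_true, y_pred))  # 0.75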
Add a score method to the MyKNeighborsClassfier class in KNN.py that calls accuracy_score from metrics.py:
from metrics import accuracy_score

    # inside the MyKNeighborsClassfier class
    def score(self, x_test, y_test):
        y_predict = self.predict(X_predict=x_test)
        return accuracy_score(y_test, y_predict)
Test the score method on the handwritten-digits dataset:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from model_selection import TrainTestSplit
from KNN import MyKNeighborsClassfier
# handwritten-digits dataset
digits = datasets.load_digits()
print(digits.keys())
X = digits.data
print(X.shape)
y = digits.target
print(y.shape)
x_train,y_train,x_test,y_test = TrainTestSplit.train_test_split(x = X, y = y, ratio=0.2)
my_knn_cli = MyKNeighborsClassfier(n_neighbors=3)
my_knn_cli.fit(X_train=x_train,y_train=y_train)
# result = my_knn_cli.predict(x_test)
# print(sum(result == y_test)/len(y_test))
print(my_knn_cli.score(x_test,y_test))
6. Hyperparameters
6.1 The concept of hyperparameters
Hyperparameters: parameters that must be chosen before the algorithm runs.
Model parameters: parameters learned from the data during training.
kNN has no model parameters.
k in kNN is a typical hyperparameter.
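In scikit-learn these all appear as constructor arguments of KNeighborsClassifier: k is n_neighbors, and weights and p (explored below) are further hyperparameters. None of them is learned from the data during fit.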
6.2 Searching for the best k, p, and weights
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# handwritten-digits dataset
digits = datasets.load_digits()
x = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)
# knnclf = KNeighborsClassifier(n_neighbors=3)
# knnclf.fit(X=X_train,y=y_train)
# print(knnclf.score(X=X_test,y=y_test))
# search for the best k, weights, and p
# weights: 'uniform' ignores distance; 'distance' weights each vote by inverse distance
best_method = ""
# p: 1 = Manhattan distance, 2 = Euclidean distance, general p = Minkowski distance
best_p = 0
best_score = 0.0
best_k = -1
for method in ["uniform", "distance"]:
    for k in range(1, 11):
        if method == "distance":
            for p in range(1, 6):
                knnclf = KNeighborsClassifier(n_neighbors=k, weights=method, p=p)
                knnclf.fit(X=X_train, y=y_train)
                score = knnclf.score(X=X_test, y=y_test)
                if score > best_score:
                    best_score = score
                    best_k = k
                    best_method = method
                    best_p = p
        else:
            knnclf = KNeighborsClassifier(n_neighbors=k, weights=method)
            knnclf.fit(X=X_train, y=y_train)
            score = knnclf.score(X=X_test, y=y_test)
            if score > best_score:
                best_score = score
                best_k = k
                best_method = method
                best_p = 0
print(best_method)
print(best_p)
print(best_k)
print(best_score)
6.3 Grid search and more hyperparameters
Grid search with GridSearchCV:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
import time
# handwritten-digits dataset
digits = datasets.load_digits()
x = digits.data
y = digits.target
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
param_grid = [
    {
        'weights': ['uniform'],
        'n_neighbors': [i for i in range(1, 11)]
    },
    {
        'weights': ['distance'],
        'n_neighbors': [i for i in range(1, 11)],
        'p': [i for i in range(1, 6)]
    }
]
print(param_grid)
knnclf = KNeighborsClassifier()
# n_jobs: number of CPU cores to use (-1 means all cores); verbose: log verbosity during the search (larger means more detail)
grid_search = GridSearchCV(knnclf, param_grid, n_jobs=-1, verbose=2)
time_start = time.time()
grid_search.fit(X=X_train,y=y_train)
time_end = time.time()
# the time difference is the program's execution time, in seconds
print(time_end-time_start)
print(grid_search.best_estimator_)
print(grid_search.best_params_)
print(grid_search.best_score_)
knnclf = grid_search.best_estimator_
print(knnclf.score(X=X_test,y=y_test))
# best_estimator_: the KNeighborsClassifier fitted with the best hyperparameters found
# best_params_: the winning hyperparameter combination from param_grid
# best_score_: the best mean cross-validation score
More distance definitions can be selected through the metric parameter of KNeighborsClassifier; see the DistanceMetric documentation for the list of available metrics.
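For example, a minimal sketch reusing the X_train/X_test split from the grid-search code above, with the Manhattan distance selected explicitly:
knnclf = KNeighborsClassifier(n_neighbors=3, metric='manhattan')
knnclf.fit(X_train, y_train)
print(knnclf.score(X_test, y_test))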
7. Feature scaling (normalization)
7.1 Min-max normalization and mean-variance normalization
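Min-max normalization maps every feature into [0, 1]: x_scaled = (x - x_min) / (x_max - x_min). Mean-variance normalization (standardization) gives every feature zero mean and unit standard deviation: x_scaled = (x - mean) / std. Min-max scaling is sensitive to outliers; standardization is usually preferred when a feature has no obvious bounds.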
import numpy as np
import matplotlib.pyplot as plt
# min-max normalization
# x = np.random.randint(0,100,size=100)
# print(x)
# result = (x - np.min(x))/(np.max(x)-np.min(x))
# print(result)
x = np.random.randint(0, 100, (50, 2))
x = np.array(x,dtype=float)
print(x)
x[:,0] = (x[:,0] - np.min(x[:,0]))/(np.max(x[:,0]) - np.min(x[:,0]))
x[:,1] = (x[:,1] - np.min(x[:,1]))/(np.max(x[:,1]) - np.min(x[:,1]))
print(x)
# standard deviation of each column
print(np.std(x[:,0]))
print(np.std(x[:,1]))
# plt.scatter(x[:,0], x[:,1])
# plt.show()
# mean-variance normalization (standardization)
x = np.random.randint(0, 100, (50, 2))
x = np.array(x,dtype=float)
x[:,0] = (x[:,0] - np.mean(x[:,0]))/np.std(x[:,0])
x[:,1] = (x[:,1] - np.mean(x[:,1]))/np.std(x[:,1])
print(x)
# standard deviation of each column (should be 1 after standardization)
print(np.std(x[:,0]))
print(np.std(x[:,1]))
plt.scatter(x[:,0], x[:,1])
plt.show()
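The same idea applied to a real train/test workflow: fit the scaler on the training set only, then use the learned mean_ and scale_ to transform both the training and the test set, so that no information from the test data leaks into preprocessing.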
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
# StandardScaler from sklearn.preprocessing performs mean-variance scaling
standardScaler = StandardScaler()
standardScaler.fit(X_train)
print(standardScaler.mean_)
print(standardScaler.scale_)
X_train = standardScaler.transform(X_train)
print(X_train)
X_test = standardScaler.transform(X_test)
print(X_test)
knnclf = KNeighborsClassifier(n_neighbors=3)
knnclf.fit(X_train, y_train)
print(knnclf.score(X_test, y_test))
7.2 Implementing StandardScaler by hand
Write preprocessing.py:
import numpy as np
class StanderScaler:
    def __init__(self):
        self.mean_ = None
        self.scale_ = None

    def fit(self, X):
        assert X.ndim == 2, "X must be two-dimensional"
        # per-column mean and standard deviation (note the column slice X[:, i], not the row X[i])
        self.mean_ = np.array([np.mean(X[:, i]) for i in range(X.shape[1])])
        self.scale_ = np.array([np.std(X[:, i]) for i in range(X.shape[1])])
        return self

    def transform(self, X):
        assert len(self.scale_) == X.shape[1], "the feature count of X must equal len(scale_)"
        resX = np.empty(X.shape, dtype=float)
        for col in range(X.shape[1]):
            resX[:, col] = (X[:, col] - self.mean_[col]) / self.scale_[col]
        return resX
Test:
from preprocessing import StanderScaler
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
iris = datasets.load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y)
standardScaler = StanderScaler()
standardScaler.fit(X_train)
print(standardScaler.mean_)
print(standardScaler.scale_)
X_train = standardScaler.transform(X_train)
print(X_train)
X_test = standardScaler.transform(X_test)
print(X_test)
knnclf = KNeighborsClassifier(n_neighbors=3)
knnclf.fit(X_train, y_train)
print(knnclf.score(X_test, y_test))
8. Summary
These notes covered the core idea of kNN on simulated data, a hand-written classifier mirroring the scikit-learn API, train/test splitting, an accuracy metric, hyperparameter search (both a manual loop and GridSearchCV), and feature scaling with StandardScaler.