机器学习实战-第二章代码+注释-KNN
Posted neu-2015
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了机器学习实战-第二章代码+注释-KNN相关的知识,希望对你有一定的参考价值。
# -*- coding: utf-8 -*-
# https://blog.csdn.net/fenfenmiao/article/details/52165472
"""k-nearest-neighbours (kNN) demos: a dating data set and handwritten digits."""

# NOTE: the star import is kept (and kept first) so that every NumPy name this
# module has always exposed stays available and the explicit imports below
# take precedence over anything NumPy exports.
from numpy import *  # noqa: F401,F403  scientific computing package
import operator  # itemgetter is used to rank the votes
import os
from os import listdir  # lists the file names of a directory

try:
    # Plotting is only needed by the optional scatter-plot demo at the bottom.
    import matplotlib
    import matplotlib.pyplot as plt
except ImportError:
    matplotlib = None
    plt = None

try:
    _readLine = raw_input  # Python 2
except NameError:
    _readLine = input  # Python 3

# NOTE(review): this path looks like it lost its backslashes somewhere
# (probably 'F:\jxq\Desktop\datingTestSet.txt') -- confirm before relying on it.
_DATING_DATA_FILE = 'F:jxqDesktopdatingTestSet.txt'

# Raw strings: in a normal literal the '\t' of '\trainingDigits' and
# '\testDigits' is a TAB escape, which silently corrupts the path.
_DIGITS_DIR = (r'F:\jxq\Documents\Tencent Files\834810071\FileRecv'
               r'\machinelearninginaction-master\machinelearninginaction-master'
               r'\Ch02\digits')
_TRAINING_DIGITS_DIR = _DIGITS_DIR + r'\trainingDigits'
_TEST_DIGITS_DIR = _DIGITS_DIR + r'\testDigits'

# Textual class names found in the last column of the dating data file.
_DATING_LABELS = {'didntLike': 1, 'smallDoses': 2, 'largeDoses': 3}


def createDataSet():
    """Return a tiny 4-sample, 2-feature data set and its labels."""
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest samples.

    Args:
        inX: feature vector to classify (sequence or array that broadcasts
            against one row of ``dataSet``).
        dataSet: 2-D array with one training sample per row.
        labels: sequence holding the label of each row of ``dataSet``.
        k: number of nearest neighbours that vote. If ``k`` exceeds the
            number of samples, every sample votes.

    Returns:
        The label with the most votes. On a tie the label that was met first
        (i.e. belongs to the closest neighbour) wins on Python 3.7+.
    """
    # Euclidean distance from inX to every row (broadcasting replaces tile()).
    diffMat = asarray(inX) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    sortedDistIndicies = distances.argsort()  # indices, nearest first

    classCount = {}
    for neighbour in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbour]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Rank by vote count (item 1 of each pair), not by the label itself.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Parse a dating data file into a feature matrix and a label list.

    Each non-blank line holds three numeric features followed by a class,
    separated by whitespace (tabs or spaces). The class is either one of
    'didntLike' / 'smallDoses' / 'largeDoses' (mapped to 1 / 2 / 3) or an
    integer.

    Args:
        filename: path of the text file to read.

    Returns:
        ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array and a
        list of ``n`` integer labels.

    Raises:
        ValueError: if a feature is not numeric or a label is neither a known
            name nor an integer.
    """
    with open(filename) as fr:
        # Blank lines (e.g. a trailing newline) carry no sample; drop them.
        rows = [line.split() for line in fr if line.strip()]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, listFromLine in enumerate(rows):
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        label = listFromLine[-1]
        if label in _DATING_LABELS:
            classLabelVector.append(_DATING_LABELS[label])
        else:
            # Never skip a row silently: labels must stay aligned with rows.
            classLabelVector.append(int(label))
    return returnMat, classLabelVector


def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1].

    Args:
        dataSet: 2-D array with one sample per row.

    Returns:
        ``(normDataSet, ranges, minVals)`` where ``ranges`` is the per-column
        ``max - min`` and ``minVals`` the per-column minimum, so that
        ``normDataSet == (dataSet - minVals) / ranges``. A constant column has
        a zero range and therefore divides by zero.
    """
    minVals = dataSet.min(0)  # per-column minimum
    maxVals = dataSet.max(0)  # per-column maximum
    ranges = maxVals - minVals
    normDataSet = (dataSet - minVals) / ranges
    return normDataSet, ranges, minVals


def datingClassTest(filename=_DATING_DATA_FILE):
    """Hold out the first 10% of the dating data and report the error rate.

    Args:
        filename: dating data file to load.

    Returns:
        The fraction of held-out samples that were misclassified.
    """
    hoRatio = 0.10
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # training samples start at this row
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with : %d, the real answer is : %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    errorRate = errorCount / float(numTestVecs)
    print("the total error rate is : %f" % errorRate)
    return errorRate


def classifyPerson(filename=_DATING_DATA_FILE):
    """Ask for three features on stdin and print the predicted preference.

    Args:
        filename: dating data file used as the training set.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']
    percentTats = float(_readLine("percentage of time spent playing video games?"))
    ffMiles = float(_readLine("frequent flier miles earned per year?"))
    iceCream = float(_readLine("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix(filename)
    normMat, ranges, minVals = autoNorm(datingDataMat)
    # presumably this matches the column order of the data file -- confirm
    inArr = array([ffMiles, percentTats, iceCream])
    classifierResult = classify0((inArr - minVals) / ranges, normMat,
                                 datingLabels, 3)
    # Labels are 1-based while resultList is 0-based.
    print("You will probably like this person: %s"
          % resultList[classifierResult - 1])


def img2vector(filename):
    """Read a 32x32 text image of digit characters into a 1x1024 row vector.

    Args:
        filename: path of a text file whose first 32 lines each start with
            32 digit characters.

    Returns:
        A ``(1, 1024)`` float array; element ``32 * i + j`` is the digit at
        line ``i``, column ``j``.
    """
    returnVect = zeros((1, 1024))
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect


def _digitLabel(fileName):
    """Return the digit encoded in a file name such as '3_17.txt' (-> 3)."""
    return int(fileName.split('.')[0].split('_')[0])


def handwritingClassTest(trainingDir=_TRAINING_DIGITS_DIR,
                         testDir=_TEST_DIGITS_DIR):
    """Classify every image in ``testDir`` with 3-NN and report the errors.

    Args:
        trainingDir: directory of training images named '<digit>_<n>.txt'.
        testDir: directory of test images using the same naming scheme.

    Returns:
        The fraction of test images that were misclassified.
    """
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    hwLabels = []
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        hwLabels.append(_digitLabel(fileNameStr))
        trainingMat[i, :] = img2vector(os.path.join(trainingDir, fileNameStr))

    testFileList = listdir(testDir)
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        classNumStr = _digitLabel(fileNameStr)
        vectorUnderTest = img2vector(os.path.join(testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    errorRate = errorCount / float(mTest)
    print(" the total number of error is: %d" % errorCount)
    print(" the total error rate is: %f" % errorRate)
    return errorRate


if __name__ == '__main__':
    # Optional demos, disabled by default:
    #
    #   datingDataMat, datingLabels = file2matrix(_DATING_DATA_FILE)
    #   print(datingDataMat)
    #   print(datingLabels)
    #   fig = plt.figure()
    #   ax = fig.add_subplot(111)
    #   # the last two arguments set marker size and colour from the labels
    #   ax.scatter(datingDataMat[:, 1], datingDataMat[:, 2],
    #              15.0 * array(datingLabels), 15.0 * array(datingLabels))
    #   plt.show()
    #
    #   normMat, ranges, minVals = autoNorm(datingDataMat)
    #   print(normMat)
    #   print(ranges)
    #   print(minVals)
    #
    #   datingClassTest()
    #   classifyPerson()
    #
    #   testVector = img2vector(os.path.join(_TEST_DIGITS_DIR, '0_13.txt'))
    #   print(testVector[0, 0:31])
    #   print(testVector[0, 32:63])
    handwritingClassTest()
以上是关于机器学习实战-第二章代码+注释-KNN的主要内容,如果未能解决你的问题,请参考以下文章