Python k近邻算法的示例代码。参考:《机器学习实战》(Machine Learning in Action)第2章。

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Python k近邻算法的示例代码(参考《机器学习实战》第2章)及相关的知识,希望对你有一定的参考价值。

#!/usr/bin/env python

from numpy import array, tile, zeros, shape
import operator
import urllib2
import matplotlib
import matplotlib.pyplot as plt

def createDataSet():
    '''
    Download the dating data set and parse it into features and labels.

    Every non-blank line is split on tabs: the first three fields fill one
    row of dataSet and the last field is parsed as the integer label.

    dataSet is like:
    [[  4.09200000e+04   8.32697600e+00   9.53952000e-01]
     [  1.44880000e+04   7.15346900e+00   1.67390400e+00]
     [  2.60520000e+04   1.44187100e+00   8.05124000e-01]
     ...,
     [  2.65750000e+04   1.06501020e+01   8.66627000e-01]
     [  4.81110000e+04   9.13452800e+00   7.28045000e-01]
     [  4.37570000e+04   7.88260100e+00   1.33244600e+00]]

    labels is like:
    [3, 2, 1, ..., 2, 3, 1]

    Returns:
        (dataSet, labels): an (n, 3) numpy array and a list of n ints,
        where n is the number of non-blank lines downloaded.
    '''
    from contextlib import closing

    url = 'https://raw.githubusercontent.com/pbharrin/machinelearninginaction/master/Ch02/datingTestSet2.txt'
    # closing() guarantees the connection is released even if reading fails;
    # it works whether or not the response object is itself a context manager.
    with closing(urllib2.urlopen(url)) as response:
        lines = response.readlines()
    # Skip blank lines (e.g. a trailing newline at the end of the file) so
    # they neither crash the parser nor leave all-zero rows in the matrix.
    rows = [line.strip().split('\t') for line in lines if line.strip()]
    dataSet = zeros((len(rows), 3))
    labels = []
    for index, fields in enumerate(rows):
        dataSet[index, :] = fields[0:3]
        labels.append(int(fields[-1]))
    return dataSet, labels

def plotDataSet(dataSet, labels):
    '''
    Show a scatter plot of column 1 against column 2 of dataSet.

    Marker size and marker color are both taken from 15.0 * labels.
    '''
    scaledLabels = 15.0 * array(labels)
    # a single subplot on a 1*1 grid
    axes = plt.figure().add_subplot(1, 1, 1)
    axes.scatter(dataSet[:, 1], dataSet[:, 2], s=scaledLabels, c=scaledLabels)
    plt.show()

def autoNorm(dataSet):
    '''
    Normalize each column of a 2-D dataSet to the range [0, 1]:
        normVlu = (vlu - min) / (max - min)

    A constant column (max == min) is mapped to all zeros instead of
    dividing by zero. The input is not modified.

    Returns:
        (normDataSet, ranges, minVals): the normalized copy, the per-column
        max - min (0 for constant columns) and the per-column minimum.
    '''
    # Work on a float copy so integer input is not truncated by integer
    # division (the default for "/" on int arrays under Python 2).
    dataSet = array(dataSet, dtype=float)
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide constant columns by 1 instead of 0; their numerator is already 0.
    divisors = ranges.copy()
    divisors[divisors == 0] = 1.0
    # Broadcasting applies the per-column vectors to every row.
    normDataSet = (dataSet - minVals) / divisors
    return normDataSet, ranges, minVals

def classify0(inX, dataSet, labels, k):
    '''
    Classify inX using kNN.

    The k rows of dataSet closest to inX (Euclidean distance) vote with
    their labels and the most frequent label wins. A tie is resolved in
    favor of the tied label whose sample is closest to inX. If k exceeds
    the number of rows, every row votes.

    Args:
        inX: feature vector to classify (sequence or 1-D array).
        dataSet: 2-D numpy array with one training sample per row.
        labels: sequence of labels, labels[i] belonging to dataSet[i].
        k: number of neighbors to consult; must be >= 1.

    Returns:
        The winning label.

    Raises:
        ValueError: if dataSet has no rows or k < 1.
    '''
    dataSetSize = dataSet.shape[0]
    if dataSetSize == 0:
        raise ValueError('dataSet must contain at least one sample')
    if k < 1:
        raise ValueError('k must be at least 1, got %r' % (k,))
    # calculate Euclidean distance; inX broadcasts against every row
    diffMat = array(inX) - dataSet
    distances = ((diffMat ** 2).sum(axis=1)) ** 0.5
    # labels of the k nearest samples, nearest first (slice clamps large k)
    nearest = distances.argsort()[:k]
    neighborLabels = [labels[i] for i in nearest]
    # count label frequency among the k nearest samples
    classCount = {}
    for label in neighborLabels:
        classCount[label] = classCount.get(label, 0) + 1
    # select the label with highest frequency; walking the neighbors in
    # distance order makes the tie-break independent of dict ordering
    bestCount = max(classCount.values())
    for label in neighborLabels:
        if classCount[label] == bestCount:
            return label

def runClassification(dataSet, labels, testRatio, k=3):
    '''
    Evaluate the kNN classifier with a hold-out split and print the results.

    The first int(n * testRatio) samples are used as test points and the
    remaining samples as training data. Each prediction is printed next to
    the real label, followed by the overall error rate.

    Args:
        dataSet: 2-D numpy array with one sample per row.
        labels: sequence of integer labels, one per row of dataSet.
        testRatio: fraction of the samples (taken from the front) to test.
        k: number of neighbors passed to classify0 (default 3).

    Raises:
        ValueError: if the split leaves no test samples or no training
            samples.
    '''
    # normalization (computed over the whole data set, test rows included)
    normDataSet = autoNorm(dataSet)[0]
    dataSetSize = normDataSet.shape[0]
    testSetSize = int(dataSetSize*testRatio)
    # An empty test set would divide by zero in the error rate, and an empty
    # training set leaves classify0 with nothing to compare against.
    if not 0 < testSetSize < dataSetSize:
        raise ValueError(
            'testRatio %r gives %d test samples out of %d; need at least one '
            'test sample and one training sample'
            % (testRatio, testSetSize, dataSetSize))
    trainDataSet = normDataSet[testSetSize:, :]
    trainLabels = labels[testSetSize:]
    # run classification
    errorNum = 0
    for i in range(testSetSize):
        testLabel = classify0(normDataSet[i, :], trainDataSet, trainLabels, k)
        realLabel = labels[i]
        print("The classifier came back with: %d, the real answer is: %d" % (testLabel, realLabel))
        if testLabel != realLabel:
            errorNum += 1
    print("The totle error rate is: %f" % (float(errorNum)/float(testSetSize)))


if __name__ == "__main__":
    # Step 1: fetch the samples and their labels.
    dataSet, labels = createDataSet()
    # Step 2: visualize the raw samples.
    plotDataSet(dataSet=dataSet, labels=labels)
    # Step 3: classify the first 10% of the samples against the rest.
    runClassification(dataSet=dataSet, labels=labels, testRatio=0.1)

以上是关于Python k近邻算法的示例代码(参考《机器学习实战》第2章)的主要内容。如果未能解决你的问题,请参考以下文章:

机器学习——k近邻算法原理分析与python代码实现

机器学习——k-近邻(K-Nearest Neighbor)

机器学习实战笔记(Python实现)-01-K近邻算法(KNN)

机器学习实战☛k-近邻算法(K-Nearest Neighbor, KNN)

机器学习实战python3 K近邻(KNN)算法实现

k-近邻算法