Example Python code for building an ID3 decision tree (reference: Machine Learning in Action, Chapter 3)

This post walks through example Python code that builds an ID3 decision tree, following Chapter 3 of Machine Learning in Action.

#!/usr/bin/env python3

from math import log
import operator
import urllib.request

def createDataSet():
    '''
    prepare data: the last column is the label
    dataSet is like:
    [['young', 'myope', 'no', 'reduced', 'no lenses'],
     ['young', 'myope', 'no', 'normal', 'soft'],
     ['young', 'myope', 'yes', 'reduced', 'no lenses'],
     ...
     ['young', 'myope', 'yes', 'normal', 'hard'],
     ['young', 'hyper', 'no', 'reduced', 'no lenses']]
    '''
    url = 'https://raw.githubusercontent.com/pbharrin/machinelearninginaction/master/Ch03/lenses.txt'
    with urllib.request.urlopen(url) as resp:
        lines = resp.read().decode('utf-8').splitlines()
    dataSet = [line.strip().split('\t') for line in lines]
    featureNames = ['age', 'prescript', 'astigmatic', 'tearRate']
    return dataSet, featureNames

def calcShannonEnt(dataSet):
    '''
    Calculate the entropy of the label column (i.e. the last column of the data set)
    '''
    # count the frequency of each label
    labelCounts = {}
    for rec in dataSet:
        label = rec[-1]
        labelCounts[label] = labelCounts.get(label, 0) + 1
    # calculate the entropy
    totalCounts = len(dataSet)
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / totalCounts
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
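
# Worked example (labels made up for illustration): for six records whose
# labels are ['yes', 'yes', 'no', 'no', 'no', 'no'], p('yes') = 2/6 and
# p('no') = 4/6, so calcShannonEnt returns
#   -(2/6)*log2(2/6) - (4/6)*log2(4/6) ≈ 0.918 bits.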

def sliceDataSet(dataSet, axis, value):
    '''
    return the sub data set where the axis column's value == value, excluding
    the axis column.
    '''
    subDataSet = []
    for rec in dataSet:
        if (rec[axis] == value):
            subDataSet.append(rec[:axis] + rec[axis+1:])
    return subDataSet
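
# Example (made-up records for illustration):
#   sliceDataSet([['young', 'myope', 'soft'], ['pre', 'myope', 'hard']], 0, 'young')
# returns [['myope', 'soft']]: the matching row, with column 0 removed.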

def chooseBestFeatureToSplit(dataSet):
    recNum = len(dataSet)
    featureNum = len(dataSet[0]) - 1 # the last column is the label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    # calculate entropy for each feature
    for i in range(featureNum):
        featureList = [rec[i] for rec in dataSet]
        uniqFeatureValues = set(featureList)
        newEntropy = 0.0
        for value in uniqFeatureValues:
            # select the sub data set where this feature's value == value
            subDataSet = sliceDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(recNum)
            newEntropy += prob * calcShannonEnt(subDataSet)
        # select the best feature: which feature makes the entropy decrease the most
        infoGain = baseEntropy - newEntropy
        if (infoGain > bestInfoGain):
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
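
# The quantity maximized above is the ID3 information gain
#   gain(S, A) = H(S) - sum_v (|S_v| / |S|) * H(S_v)
# where S_v is the subset of records with value v for feature A: sliceDataSet
# builds each S_v and calcShannonEnt computes the entropies H.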

def majorityCnt(classList):
    '''
    return the class with highest frequency in the class list
    '''
    classCounts = {}
    for vote in classList:
        classCounts[vote] = classCounts.get(vote, 0) + 1
    sortedClassCounts = sorted(classCounts.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCounts[0][0]
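
# Example: majorityCnt(['soft', 'hard', 'soft']) returns 'soft'. With tied
# counts, sorted() is stable, so the label counted first wins.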

def createTree(dataSet, featureNames):
    '''
    create the decision tree using ID3 algorithm
    '''
    labelList = [rec[-1] for rec in dataSet]
    if (len(set(labelList)) == 1):
        # if the labels are all the same, return the label's value
        return labelList[0]
    elif (len(dataSet[0]) == 1):
        # if only the label column is left, return the most frequent label
        return majorityCnt(labelList)
    else:
        # select best feature
        bestFeatureIndex = chooseBestFeatureToSplit(dataSet)
        bestFeatureName = featureNames[bestFeatureIndex]
        # init tree
        myTree = {bestFeatureName: {}}
        # remove the chosen feature name; note this mutates the caller's list
        del featureNames[bestFeatureIndex]
        # get the uniq values of the best feature
        bestFeatureValues = [rec[bestFeatureIndex] for rec in dataSet]
        uniqBestFeatureValues = set(bestFeatureValues)
        # for each uniq value, generate the tree recursively
        for value in uniqBestFeatureValues:
            subFeatureNames = featureNames[:] # create a copy of featureNames
            myTree[bestFeatureName][value] = createTree(sliceDataSet(dataSet, bestFeatureIndex, value), subFeatureNames)
        return myTree
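
# The tree returned by createTree is a nested dict keyed first by feature name,
# then by feature value; leaves are plain class-label strings. A sketch of the
# shape (values here are illustrative, not guaranteed output):
#   {'tearRate': {'reduced': 'no lenses', 'normal': {'astigmatic': {...}}}}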

def classify(tree, featureNames, testRec):
    '''
    Walk the tree from the root, following the branch that matches the test
    record's value for each node's feature, until a leaf (class label) is hit.
    '''
    rootFeatureName = next(iter(tree))  # each node dict has exactly one key
    rootFeatureDict = tree[rootFeatureName]
    rootFeatureIndex = featureNames.index(rootFeatureName)
    classLabel = None  # stays None if the test value matches no branch
    for rootFeatureValue in rootFeatureDict:
        if testRec[rootFeatureIndex] == rootFeatureValue:
            nextBranch = rootFeatureDict[rootFeatureValue]
            if isinstance(nextBranch, dict):
                # an inner node: classify recursively down this subtree
                classLabel = classify(nextBranch, featureNames, testRec)
            else:
                # a leaf: the stored value is the class label
                classLabel = nextBranch
    return classLabel
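
# Example: with a tree shaped like {'tearRate': {'reduced': 'no lenses', ...}},
# classify(tree, featureNames, ['young', 'myope', 'no', 'reduced']) follows the
# 'reduced' branch straight to the 'no lenses' leaf. A test value that matches
# no branch (possible with unseen data) makes classify return None.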


if __name__ == '__main__':
    dataSet, featureNames = createDataSet()
    tree = createTree(dataSet, featureNames[:])  # pass a copy: createTree mutates the list
    print(tree)
    predictedLabel = classify(tree, featureNames, ['young', 'hyper', 'yes', 'normal'])
    print(predictedLabel)
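Running the script downloads the lenses data set, prints the learned tree as a nested dict, and prints the predicted class for the sample record. On this data the first split is typically on tearRate, since it yields the largest information gain. Note that createDataSet needs network access; to experiment offline, you could download lenses.txt once and read it with open() instead.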
