代码还好懂,但是后面选择更好的划分数据集的方法,有点不知道为什么那样选。
还要好好理解推导。
from math import log


def calcShannonEnt(dataSet):
    """Compute the Shannon entropy of a data set.

    Each sample in dataSet is a list whose LAST element is the class
    label. Returns -sum(p * log2(p)) over the label distribution.
    """
    numEntries = len(dataSet)
    labelCount = {}
    for featVector in dataSet:
        currentlabel = featVector[-1]
        labelCount[currentlabel] = labelCount.get(currentlabel, 0) + 1
    shannonEnt = 0.0
    for key in labelCount:
        prob = float(labelCount[key]) / numEntries
        # Entropy accumulates -p*log2(p); log(prob, 2) is log base 2.
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt


def createDataSet():
    """Return a tiny toy training set and its feature names.

    Features: [no surfacing, flippers]; label: 'yes'/'no' (is a fish).
    NOTE: the original used curly quotes (`‘yes‘`) — a copy/paste
    artifact that is a SyntaxError; restored to plain string literals.
    """
    dataSet = [[1, 1, 'yes'],
               [1, 1, 'yes'],
               [1, 0, 'no'],
               [0, 1, 'no'],
               [0, 1, 'no']]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels


def splitDataSet(dataSet, axis, value):
    """Return the samples whose feature at column `axis` equals `value`,
    with that feature column removed from each returned sample.
    """
    retDataSet = []
    for featVec in dataSet:
        # BUG FIX: the original tested featVec[0] == value, silently
        # ignoring `axis` — it only ever split on the first feature.
        if featVec[axis] == value:
            # Build a copy of the sample minus the axis-th column
            # (slice before + extend with slice after).
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis + 1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet


def main():
    dataSet, labels = createDataSet()
    # shannonEnt = calcShannonEnt(dataSet)  # Shannon entropy
    # print(shannonEnt)
    print(splitDataSet(dataSet, 0, 1))
    print(splitDataSet(dataSet, 0, 0))


# Guard the entry point so importing this module has no side effects.
if __name__ == '__main__':
    main()
append和extend区别:
# Demonstrate the difference between list.append and list.extend.
a = [1, 2, 3]
c = [1, 2, 3]
b = [4, 5, 6]
a.append(b)  # append nests b as ONE element -> [1, 2, 3, [4, 5, 6]]
c.extend(b)  # extend splices b's items in   -> [1, 2, 3, 4, 5, 6]
print(a)
print(c)
[1, 2, 3, [4, 5, 6]]
[1, 2, 3, 4, 5, 6]