Machine Learning in Action: Chapter 9, Tree Regression

Posted by 黑凤梨


The book's source code contains two errors; the fixes below were found by searching online.

 

from numpy import *
import matplotlib.pyplot as plt

def loadDataSet(fileName):
    dataSet = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')   #fields are tab-delimited
        fltLine = map(float, curLine)        #convert every field to float
        dataSet.append(list(fltLine))        #list() is needed under Python 3, where map is lazy
    return dataSet
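A quick smoke test (mine, not from the original post): write two tab-separated rows to a throwaway file and read them back. The filename 'tiny.txt' is made up for illustration.

# Hypothetical smoke test, not part of the original post.
with open('tiny.txt', 'w') as f:
    f.write('0.5\t1.2\n0.9\t2.7\n')
print(loadDataSet('tiny.txt'))   # [[0.5, 1.2], [0.9, 2.7]]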


def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[nonzero(dataSet[:,feature] > value)[0],:]     ## first error in the book, fixed
    mat1 = dataSet[nonzero(dataSet[:,feature] <= value)[0],:]
    return mat0,mat1
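To sanity-check the split, here is the book's own identity-matrix test (reproduced from memory, not from this post): splitting on feature 1 at 0.5 should isolate the one row whose column 1 holds a 1.0.

# Sanity check in the style of the book's own example.
testMat = mat(eye(4))                     # 4x4 identity matrix
m0, m1 = binSplitDataSet(testMat, 1, 0.5)
print(m0)   # one row: the row with 1.0 in column 1 (> 0.5)
print(m1)   # the other three rows: column 1 is 0.0 (<= 0.5)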


def regLeaf(dataSet):#returns the value used for each leaf
    return mean(dataSet[:, -1])


def regErr(dataSet):
    #total squared error of the target column: var (mean squared deviation) times m
    return var(dataSet[:,-1]) * shape(dataSet)[0]
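Since NumPy's var is the mean squared deviation, multiplying by the row count gives sum((y - mean)^2). A quick worked check (my numbers, not the post's):

# Hypothetical check: targets [1, 2, 3] have var = 2/3, so total error = (2/3)*3 = 2.0
demo = mat([[0.0, 1.0], [0.0, 2.0], [0.0, 3.0]])
print(regErr(demo))   # 2.0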

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    tolS = ops[0]
    tolN = ops[1]
    #if all the target variables are the same value: quit and return value
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1: #exit cond 1
        return None , leafType(dataSet)
    m,n = shape(dataSet)
    #the choice of the best feature is driven by Reduction in RSS error from mean
    S = errType(dataSet)
    bestS = inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n-1):
        #for splitVal in set(dataSet[:,featIndex]):   ## second error in the book, fixed below
        for splitVal in set((dataSet[:, featIndex].T.tolist())[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    #if the decrease (S-bestS) is less than a threshold don't do the split
    if (S - bestS) < tolS:
        return None, leafType(dataSet) #exit cond 2
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    if (shape(mat0)[0] < tolN) or (shape(mat1)[0] < tolN): #exit cond 3
        return None, leafType(dataSet)
    #returns the best feature to split on and the value used for that split
    return bestIndex, bestValue


def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    #assume dataSet is a NumPy matrix so we can use array filtering
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops) #choose the best split
    if feat == None: return val #splitting hit a stop condition: return the leaf value
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree


###########################################
# example: construct a simple regression tree
myDat1 = loadDataSet('rtrain.txt')
myMat = mat(myDat1)
mytree = createTree(myMat, ops=(1, 4))   # ops=(1,4) is also the default
print(mytree)

x = []; y = []
for a in myDat1:
    x.append(a[-2])   # feature column
    y.append(a[-1])   # target column
plt.scatter(x, y)
plt.show()


#############################
# post-pruning: cut some branches
def isTree(obj):
    return (type(obj).__name__ == 'dict')


def getMean(tree):
    #collapse a subtree into the mean of its leaves
    if isTree(tree['right']): tree['right'] = getMean(tree['right'])
    if isTree(tree['left']): tree['left'] = getMean(tree['left'])
    return (tree['left'] + tree['right']) / 2.0


def prune(tree, testData):
    if shape(testData)[0] == 0: return getMean(tree) #no test data: collapse the tree
    if (isTree(tree['right']) or isTree(tree['left'])): #if either branch is a tree, try to prune it
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet)
    #if they are now both leaves, see if we can merge them
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        errorNoMerge = sum(power(lSet[:, -1] - tree['left'], 2)) + \
                       sum(power(rSet[:, -1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right']) / 2.0
        errorMerge = sum(power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            print("merging")
            return treeMean
        else:
            return tree
    else:
        return tree


mydatTest = loadDataSet('test.txt')
mymatTest = mat(mydatTest)
cut_tree = prune(mytree, mymatTest)
print(cut_tree)


################################
# model tree: leaves hold linear models instead of constants
def linearSolve(dataSet): #helper function used in two places
    m, n = shape(dataSet)
    X = mat(ones((m, n))); Y = mat(ones((m, 1))) #copy of data with 1 in the 0th position
    X[:, 1:n] = dataSet[:, 0:n-1]; Y = dataSet[:, -1] #and strip out Y
    xTx = X.T * X
    if linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n'
                        'try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y


def modelLeaf(dataSet):
    ws, X, Y = linearSolve(dataSet)
    return ws


def modelErr(dataSet):
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(power(Y - yHat, 2))


def regTreeEval(model, inDat):
    #a regression-tree leaf is just a constant
    return float(model)


def modelTreeEval(model, inDat):
    #a model-tree leaf is a weight vector: prepend the bias 1, then take the dot product
    n = shape(inDat)[1]
    X = mat(ones((1, n+1)))
    X[:, 1:n+1] = inDat
    return float(X * model)
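The two eval functions only differ in how a leaf is turned into a number. A tiny check (my values, not the post's):

# Hypothetical check of the two leaf evaluators, not in the original post.
print(regTreeEval(3.5, None))            # 3.5: a constant leaf ignores the input
ws = mat([[1.0], [2.0]])                 # intercept 1.0, slope 2.0
print(modelTreeEval(ws, mat([[4.0]])))   # 1.0 + 2.0*4.0 = 9.0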
def treeForeCast(tree, inData, modelEval=regTreeEval):
    if not isTree(tree): return modelEval(tree, inData)
    if inData[tree['spInd']] > tree['spVal']:
        if isTree(tree['left']): return treeForeCast(tree['left'], inData, modelEval)
        else: return modelEval(tree['left'], inData)
    else:
        if isTree(tree['right']): return treeForeCast(tree['right'], inData, modelEval)
        else: return modelEval(tree['right'], inData)


def createForeCast(tree, testData, modelEval=regTreeEval):
    m = len(testData)
    yHat = mat(zeros((m, 1)))
    for i in range(m):
        yHat[i, 0] = treeForeCast(tree, mat(testData[i]), modelEval)
    return yHat


trainmat = mat(loadDataSet('train.txt'))
testdat = loadDataSet('test.txt')
testmat = mat(testdat)
print(testmat[:, 1])
mytree = createTree(trainmat, ops=(1, 4))
yHat = createForeCast(mytree, testmat[:, 0])
#print(yHat)
co1 = corrcoef(yHat, testmat[:, 1], rowvar=0)[0, 1] #correlation between forecast and truth
print(co1)
#mytree = createTree(trainmat, modelLeaf, modelErr, (0, 20))
#yHat = createForeCast(mytree, testmat[:, 0], modelTreeEval)
#co2 = corrcoef(yHat, testmat[:, 1], rowvar=0)[0, 1]
#print(co2)
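The commented-out lines above run the same forecast with a model tree. A sketch of that run, assuming the same train.txt/test.txt files; the book reports a slightly higher correlation for the model tree on its bike data, but the result here depends on your files:

# Sketch of the model-tree variant, per the commented-out lines above.
mytree2 = createTree(trainmat, modelLeaf, modelErr, (0, 20))
yHat2 = createForeCast(mytree2, testmat[:, 0], modelTreeEval)
co2 = corrcoef(yHat2, testmat[:, 1], rowvar=0)[0, 1]
print(co2)   # compare with co1: the closer to 1.0, the better the fit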

 
