Sklearn - 发现样本数量不一致的输入变量:[16512, 4128]

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了“Sklearn - 发现样本数量不一致的输入变量:[16512, 4128]”相关的知识,希望对你有一定的参考价值。

我正在学习《Hands-On Machine Learning with Scikit-Learn and TensorFlow》一书的第二章,运行时遇到了标题中的错误。错误发生在我尝试执行下面这一行时:

linReg.fit(housingPrepared, housing_labels)

在网上查了一下,这个错误似乎与特征和标签的维度不匹配有关。打印 housingPrepared(X)和 housing_labels(Y)的形状,得到了以下结果:

(16512, 16) (4128,)

过去一个小时里,我逐行对照检查,想看看自己是否漏掉了这一章中的某一行代码,但什么也没发现。不知道这里是否有人能凭直觉看出这个问题可能出在哪里,以及可能的解决办法。

先在此表示感谢。直到出问题的那一行为止,我的全部代码贴在下面:

import os
import tarfile
from six.moves import urllib
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import Imputer, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from CategoricalEncoder import CategoricalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.utils.validation import check_array
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Base URL under which the dataset files are published.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml/master/"
# Full URL of the housing archive, built from the base URL.
HOUSING_URL = "".join([DOWNLOAD_ROOT, "datasets/housing/housing.tgz"])
# Local directory (relative to the working directory) that holds the dataset.
HOUSING_PATH = os.path.join(*("datasets", "housing"))

def fetchHousingData(housingUrl=HOUSING_URL, housingPath=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housingPath``.

    Args:
        housingUrl: URL of the ``.tgz`` archive to download.
        housingPath: Directory that receives ``housing.tgz`` and the
            extracted files; it is created when it does not exist yet.

    Returns:
        None. The function only has side effects on the file system.
    """
    if not os.path.isdir(housingPath):
        os.makedirs(housingPath)
    tgzPath = os.path.join(housingPath, "housing.tgz")
    urllib.request.urlretrieve(housingUrl, tgzPath)
    # The context manager closes the archive even when extraction raises,
    # so the file handle is not leaked on errors.
    # NOTE(review): extractall() trusts the member paths stored inside the
    # archive; only point this function at an archive source you trust.
    with tarfile.open(tgzPath) as housingTgz:
        housingTgz.extractall(path=housingPath)

def loadHousingData(housingPath=HOUSING_PATH):
    """Load the housing dataset into a pandas DataFrame.

    Args:
        housingPath: Directory that may contain a local ``housing.csv``.
            When that file exists it is read; previously this argument
            was accepted but never used.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    csvPath = os.path.join(housingPath, "housing.csv")
    if os.path.isfile(csvPath):
        return pd.read_csv(csvPath)
    # No local copy available: fall back to the remote CSV, which is what
    # the function always read before.
    return pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv")

# Load the dataset into the module-level `housing` DataFrame used by the steps below.
housing = loadHousingData()
#plt.hist(housing['longitude'],bins=50)
#plt.show()

def splitTrainTesT(data, testRatio):
    """Randomly partition ``data`` into a train part and a test part.

    The rows are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * testRatio)`` shuffled positions form the test part
    and the remaining positions form the train part.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` indexing.
        testRatio: Fraction of the rows to place in the test part.

    Returns:
        A ``(train, test)`` tuple selected from ``data`` via ``.iloc``.
    """
    rowCount = len(data)
    order = np.random.permutation(rowCount)
    cut = int(rowCount * testRatio)
    testRows, trainRows = order[:cut], order[cut:]
    return data.iloc[trainRows], data.iloc[testRows]

def testSetCheck(identifier, testRatio):
    return crc32(np.int64(identifier)) & 0xffffffff < testRatio * 2 ** 32

def splitTrainTestByID(data, testRatio, idColumn):
    """Split ``data`` into train and test parts based on an identifier column.

    Each value of ``data[idColumn]`` is passed to ``testSetCheck`` together
    with ``testRatio``; rows for which it returns a true value form the
    test part, all other rows form the train part.

    Args:
        data: Frame indexable by column name and by boolean mask via ``.loc``.
        testRatio: Ratio forwarded to ``testSetCheck``.
        idColumn: Name of the column holding the row identifiers.

    Returns:
        A ``(train, test)`` tuple.
    """
    def belongsToTest(rowId):
        return testSetCheck(rowId, testRatio)

    testMask = data[idColumn].apply(belongsToTest)
    trainPart = data.loc[~testMask]
    testPart = data.loc[testMask]
    return trainPart, testPart


#housingWithID = housing.reset_index()
#trainSet, testSet = splitTrainTestByID(housingWithID,0.2,"index")

# Plain random split with a fixed seed; 20% of the rows go to the test set.
trainSet, testSet = train_test_split(housing, test_size=0.2, random_state=42)

# Income category: ceil(median_income / 1.5), with every value that is not
# below 5 replaced by 5.0.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Assign the result back instead of calling where(..., inplace=True) on the
# column selection: an in-place change made through a selection is not
# guaranteed to reach the parent frame.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

#plt.hist(housing["income_cat"])
#plt.show()

# Single stratified split on the income category, same test size and seed.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for trainIndex, testIndex in split.split(housing, housing["income_cat"]):
    stratTrainSet = housing.loc[trainIndex]
    stratTestSet = housing.loc[testIndex]

# Remove the helper column again. Re-binding the names avoids an in-place
# drop on frames obtained through .loc and no longer shadows the builtin
# `set` with a loop variable.
stratTrainSet = stratTrainSet.drop("income_cat", axis=1)
stratTestSet = stratTestSet.drop("income_cat", axis=1)

# Work on a copy so that exploration does not touch the training set.
housing = stratTrainSet.copy()
#print(housing)

#plt.scatter(x=housing["latitude"],y=housing["longitude"], alpha=0.4)
#plt.show()

# Pairwise correlations between the columns of the current `housing` frame.
corr_matrix = housing.corr()
#print(corr_matrix["median_house_value"].sort_values(ascending=False))

#attribues = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
#scatter_matrix(housing[attribues], figsize=(12,8))
#plt.show()

""" PREPARING DATA FOR MACHINE LEARNING ALGORITHMS"""
# Features: the stratified training set without the target column.
housing = stratTrainSet.drop("median_house_value", axis=1)
# NOTE(review): the labels are read from stratTestSet while the features
# above come from stratTrainSet, so the two objects hold different numbers
# of rows.
housing_labels = stratTestSet["median_house_value"].copy()


# NOTE(review): the frame returned by dropna() is not assigned, so this
# statement leaves `housing` unchanged.
housing.dropna(subset=["total_bedrooms"])
# Median imputer fitted on every column except ocean_proximity.
imputer = Imputer(strategy="median")
housingNum = housing.drop("ocean_proximity", axis=1)
imputer.fit(housingNum)

# Apply the fitted imputer and wrap the result in a DataFrame that reuses
# the column names of housingNum.
X = imputer.transform(housingNum)
housingTr = pd.DataFrame(X, columns=housingNum.columns)

# Integer-encode the ocean_proximity column; factorize() returns the codes
# together with the distinct categories.
housingCat = housing["ocean_proximity"]
housingCatEncoded, housingCategories = housingCat.factorize()

# One-hot encode the integer codes; reshape(-1, 1) turns them into a single
# column before encoding.
encoder = OneHotEncoder()
housingCat1Hot = encoder.fit_transform(housingCatEncoded.reshape(-1,1))


"""Custom Transformers For Rooms Per Household, etc"""
# Column positions read by CombinedAttributesAdder.transform.
roomsIX = 3
bedroomsIX = 4
populationIX = 5
householdsIX = 6


class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns to a 2-D array.

    Two columns are always appended: column ``roomsIX`` divided by column
    ``householdsIX`` and column ``populationIX`` divided by column
    ``householdsIX``. When ``addBedroomsPerRoom`` is true, a third column
    (column ``bedroomsIX`` divided by column ``roomsIX``) follows.
    """

    def __init__(self, addBedroomsPerRoom = True):
        self.addBedroomsPerRoom = addBedroomsPerRoom

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right."""
        rooms = X[:, roomsIX]
        households = X[:, householdsIX]
        extraColumns = [rooms / households, X[:, populationIX] / households]
        if self.addBedroomsPerRoom:
            extraColumns.append(X[:, bedroomsIX] / rooms)
        # np.c_ receives the same tuple of operands as a comma-separated index.
        return np.c_[tuple([X] + extraColumns)]

# Append the two per-household ratios (bedrooms-per-room disabled) to the
# raw values of the feature frame.
attrAdder = CombinedAttributesAdder(addBedroomsPerRoom=False)
housingExtraAttribs = attrAdder.transform(housing.values)

# Preprocessing chain: median imputation, ratio columns, standard scaling.
numPipeline = Pipeline(steps=[
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

housingNumTr = numPipeline.fit_transform(housingNum)

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that selects columns by name and returns their values.

    ``transform`` indexes its input with ``attributeNames`` and returns the
    ``.values`` attribute of the selection.
    """

    def __init__(self, attributeNames):
        self.attributeNames = attributeNames

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def transform(self, X):
        """Return the values of the configured columns of ``X``."""
        selected = X[self.attributeNames]
        return selected.values


# Column names handled by each branch of the preprocessing.
numAttribs = housingNum.columns.tolist()
catAttribs = ["ocean_proximity"]

# Branch for the columns of housingNum: select, impute, add ratios, scale.
numPipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(numAttribs)),
    ('imputer', Imputer(strategy='median')),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

"""UPDATE SKLEARN TO INCLUDE CATEGORICAL ENCODER LIBRARY"""
# Branch for ocean_proximity: select the column and encode it.
catPipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(catAttribs)),
    ('cat_encoder', CategoricalEncoder(encoding='onehot-dense')),
])

# Run both branches and concatenate their outputs.
fullPipeline = FeatureUnion(
    transformer_list=[
        ("num_pipeline", numPipeline),
        ("cat_pipeline", catPipeline),
    ]
)

housingPrepared = fullPipeline.fit_transform(housing)

linReg = LinearRegression()

print(housingPrepared.shape, housing_labels.shape)
# NOTE(review): fit() expects the same number of rows in both arguments;
# compare the two shapes printed by the line above.
linReg.fit(housingPrepared, housing_labels)
答案

我认为问题出在下面这两行:特征取自训练集(16512 行),而标签却取自测试集(4128 行),所以两者的样本数量不一致:

housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTestSet["median_house_value"].copy()

将其更改为:

housing = stratTrainSet.drop("median_house_value", axis=1)
housing_labels = stratTrainSet["median_house_value"].copy()

这样就可以正常运行了。

以上是关于“Sklearn - 发现样本数量不一致的输入变量:[16512, 4128]”的主要内容,如果未能解决你的问题,请参考以下文章:

sklearn:发现样本数量不一致的输入变量:[1, 99]

Sklearn:ValueError:发现样本数量不一致的输入变量:[1, 6]

样本数量不一致的 Python Sklearn 变量

机器学习中的损失函数

过拟合与对策

loss函数求导的意义