正在加载的管道仅返回列名
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了正在加载的管道仅返回列名相关的知识,希望对你有一定的参考价值。
我用 joblib 和 tempfile 对一个包含自定义类的管道(Pipeline)进行了序列化(pickle),但在加载序列化后的管道时,我只得到了一个包含特征名称的数组。这是我第一次为机器学习管道开发自定义类。我不知道 scikit-learn 是否自带用于选择 DataFrame 列的类,也不知道该如何实现类似的功能。
# Custom classes
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that keeps only a chosen set of columns.

    ``attribute_names`` is stored unchanged and used to index the input
    passed to ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Learn nothing; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored ``attribute_names``."""
        selected = X[self.attribute_names]
        return selected
class MeanEncoder(BaseEstimator, TransformerMixin):
    """Encode object-dtype columns as integer ranks ordered by target mean.

    During ``fit``, for every column of dtype ``object`` the categories are
    sorted by the mean of the target within each category (highest mean
    first) and assigned their position in that ordering (0, 1, 2, ...).
    ``transform`` replaces each category with its learned rank.

    Attributes set by ``fit``:
        labels_: dict mapping column name -> {category: rank}.
        train_cols_: columns of the training frame, in training order.
    """

    def __init__(self):
        # No hyper-parameters to store.
        pass

    @staticmethod
    def _as_frame(X):
        """Return ``X`` as a DataFrame; accept a Series, reject anything else."""
        if isinstance(X, pd.DataFrame):
            return X
        if isinstance(X, pd.Series):
            return pd.DataFrame(X)
        raise ValueError('Not a pandas DataFrame')

    def fit(self, X, y):
        """Learn the category -> rank mapping for every object column.

        Args:
            X: pandas DataFrame (or Series) of features.
            y: target values; must expose ``shape`` and ``copy`` and have
                the same number of rows as ``X``.

        Returns:
            self

        Raises:
            ValueError: if ``X`` is not a DataFrame/Series or if the lengths
                of ``X`` and ``y`` differ.
        """
        X = self._as_frame(X)
        if X.shape[0] != y.shape[0]:
            raise ValueError('The length of X is different than the length of y')

        # Work on a copy so the caller's frame never gains a TARGET column.
        df = pd.DataFrame(X.copy())
        df['TARGET'] = y.copy()

        vars_cat = {}
        for col in X.select_dtypes('object').columns:
            ordered_labels = (
                df.groupby(col)
                .agg({'TARGET': 'mean'})
                .sort_values('TARGET', ascending=False)
                .index
            )
            vars_cat[col] = {k: i for i, k in enumerate(ordered_labels)}

        self.labels_ = vars_cat
        self.train_cols_ = X.columns
        return self

    def transform(self, X):
        """Replace categories with the ranks learned in ``fit``.

        Args:
            X: pandas DataFrame (or Series) containing at least the columns
                seen during ``fit``.

        Returns:
            A new DataFrame restricted to the training columns, with every
            encoded column mapped through its learned dictionary.
            Categories that were not seen during ``fit`` become NaN.

        Raises:
            ValueError: if ``X`` is not a DataFrame/Series or if any
                training column is missing from ``X``.
        """
        # Convert first, so a Series input is actually used as a frame below.
        X = self._as_frame(X)

        # Validate before selecting: indexing with an absent column would
        # otherwise fail with a bare KeyError and never reach this message.
        missing = [col for col in self.train_cols_ if col not in X.columns]
        if missing:
            raise ValueError(f'Missing the following columns: {missing}')

        X_transform = X[self.train_cols_].copy()
        for col, mapping in self.labels_.items():
            X_transform[col] = X_transform[col].map(mapping)
        return X_transform
# --------------------------------------------------------------------
# Pipeline under search: column selection -> categorical encoding -> XGBoost.
pipeline_grid = Pipeline(steps=[
    # The original snippet was missing the closing parenthesis of this tuple.
    ('select_vars', DataFrameSelector(vars)),
    ('encoder', MeanEncoder()),
    ('xgboost', xgb.XGBClassifier(random_state=SEED, n_jobs=5, verbosity=2)),
])

# Randomized hyper-parameter search over the whole pipeline, scored by ROC AUC.
search = RandomizedSearchCV(
    estimator=pipeline_grid,
    param_distributions=params_dist_grid,
    n_iter=5,
    cv=cv,
    n_jobs=5,
    scoring='roc_auc',
    random_state=SEED,
    verbose=3,
)

# NOTE(review): `best_estimator_` only exists after `search.fit(...)` has been
# called; the fit call is not shown in this snippet -- confirm it runs before
# this line.
pipeline_model = search.best_estimator_
s3 = boto3.resource('s3')

# Write: serialize the fitted pipeline into a temporary file, rewind it and
# upload its bytes to the bucket.
with tempfile.TemporaryFile() as fp:
    # Fixed: the original dumped `pipeline_modelo`, an undefined name.
    joblib.dump(pipeline_model, fp)
    fp.seek(0)
    s3.Bucket(NM_BUCKET).put_object(Key=path + name_pipe, Body=fp.read())

# Load: download the object into a temporary file, rewind it and deserialize.
s3 = boto3.client('s3')
with tempfile.TemporaryFile() as fp:
    s3.download_fileobj(Fileobj=fp, Bucket=NM_BUCKET, Key=path + name_pipe)
    fp.seek(0)
    # NOTE: joblib.load unpickles the file, so the custom classes used by the
    # pipeline must be importable in the loading process, and only objects
    # from a trusted bucket should be loaded.
    pipe = joblib.load(fp)
加载后得到的结果如下:
array(['VAR01', 'VAR02', 'VAR03', 'VAR04', 'VAR05', 'VAR06', 'VAR07', 'VAR08', 'VAR09', 'VAR10'])
答案
我通过从 DataFrameSelector 的基类中移除 BaseEstimator 解决了这个问题。
现在这个类如下所示:
class DataFrameSelector(TransformerMixin):
    """Column-selecting transformer that derives only from ``TransformerMixin``.

    The ``attribute_names`` given at construction are used to index the input
    in ``transform``.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Stateless fit; hand back the transformer unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the stored ``attribute_names``."""
        subset = X[self.attribute_names]
        return subset
以上是关于正在加载的管道仅返回列名的主要内容,如果未能解决你的问题,请参考以下文章
mongoengine 中的聚合返回 $geoNear 仅作为管道中的第一阶段有效