FitFailedWarning:估计器拟合失败。当使用更大的 int 值时,这些参数的训练测试分区上的分数将设置为 nan
Posted
技术标签:
【中文标题】FitFailedWarning:估计器拟合失败。当使用更大的 int 值时,这些参数的训练测试分区上的分数将设置为 nan【英文标题】:FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan when using greater int values 【发布时间】:2021-11-14 07:53:59 【问题描述】:我最近观看了一个 YouTube (DataSchool) 视频,其中该人仅使用了 Titanic 数据集中的 3 列并制作了一个管道。我想添加更多列以获得更高的准确性,因此我添加了 Age 和 Fare。
我认为这可能是因为Age
和Fare
的值导致我在执行cross_val_score
时收到此错误
columns_trans = make_column_transformer(
(OneHotEncoder(), ['Sex', 'Embarked']),
remainder='passthrough')
logreg = LogisticRegression(solver='lbfgs')
pipe = make_pipeline(columns_trans, logreg)
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:552:FitFailedWarning:估计器拟合失败。这些参数在这个训练测试分区上的分数将设置为 nan。
如果我删除 Age
和 Fare
,一切正常。我想知道Column Transformer
或make_pipeline
是否对这样的值有问题。
我还尝试缩放 Fare 和 Age 的值,然后它给出了 cross_val_score
但在 pipe.predict()
中失败了,给出了错误:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
追溯:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/tmp/ipykernel_119/4279568460.py in <module>
----> 1 cross_val_score(pipe, X, y, cv=5, scoring='accuracy', error_score="raise").mean()
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update(k: arg for k, arg in zip(sig.parameters, args))
---> 72 return f(**kwargs)
73 return inner_f
74
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
404 fit_params=fit_params,
405 pre_dispatch=pre_dispatch,
--> 406 error_score=error_score)
407 return cv_results['test_score']
408
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update(k: arg for k, arg in zip(sig.parameters, args))
---> 72 return f(**kwargs)
73 return inner_f
74
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score, return_estimator, error_score)
246 return_times=True, return_estimator=return_estimator,
247 error_score=error_score)
--> 248 for train, test in cv.split(X, y, groups))
249
250 zipped_scores = list(zip(*scores))
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
529 estimator.fit(X_train, **fit_params)
530 else:
--> 531 estimator.fit(X_train, y_train, **fit_params)
532
533 except Exception as e:
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
333 if self._final_estimator != 'passthrough':
334 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 335 self._final_estimator.fit(Xt, y, **fit_params_last_step)
336
337 return self
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py in fit(self, X, y, sample_weight)
1415 penalty=penalty, max_squared_sum=max_squared_sum,
1416 sample_weight=sample_weight)
-> 1417 for class_, warm_start_coef_ in zip(classes_, warm_start_coef))
1418
1419 fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
262 return [func(*args, **kwargs)
--> 263 for func, args, kwargs in self.items]
264
265 def __reduce__(self):
/opt/conda/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py in _logistic_regression_path(X, y, pos_class, Cs, fit_intercept, max_iter, tol, verbose, solver, coef, class_weight, dual, penalty, intercept_scaling, multi_class, random_state, check_input, max_squared_sum, sample_weight, l1_ratio)
762 n_iter_i = _check_optimize_result(
763 solver, opt_res, max_iter,
--> 764 extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)
765 w0, loss = opt_res.x, opt_res.fun
766 elif solver == 'newton-cg':
/opt/conda/lib/python3.7/site-packages/sklearn/utils/optimize.py in _check_optimize_result(solver, result, max_iter, extra_warning_msg)
241 " https://scikit-learn.org/stable/modules/"
242 "preprocessing.html"
--> 243 ).format(solver, result.status, result.message.decode("latin1"))
244 if extra_warning_msg is not None:
245 warning_msg += "\n" + extra_warning_msg
AttributeError: 'str' object has no attribute 'decode'
【问题讨论】:
在cross_val_score
中设置error_score="raise"
以获得完整的错误回溯(而不是让cross_val_score
掩盖错误并只给出分数NaN)。泰坦尼克号数据集(由 sklearn 的 fetch_openml
提供,version=1
)确实缺少年龄值和一个(??)票价,所以你需要处理这些。
@BenReiniger 扩展错误消息显示AttributeError: 'str' object has no attribute 'decode'
,但列中没有字符串和缺失值。
请把整个回溯放到问题中。
@BenReiniger 添加了它。
【参考方案1】:
我通过将LogisticRegression()
中的solver=lbfgs
更改为solver=liblinear
解决了这个错误
logreg = LogisticRegression(solver='lbfgs')
到
logreg = LogisticRegression(solver='liblinear')
对于以下错误:
ValueError:输入包含 NaN、无穷大或对于 dtype('float64') 来说太大的值
最好检查您的test
数据是否包含任何空值或字符串。
【讨论】:
以上是关于FitFailedWarning:估计器拟合失败。当使用更大的 int 值时,这些参数的训练测试分区上的分数将设置为 nan的主要内容,如果未能解决你的问题,请参考以下文章
FitFailedWarning:估计器拟合失败。当使用更大的 int 值时,这些参数的训练测试分区上的分数将设置为 nan