Spark Random Forest Cross-Validation error
Posted: 2016-06-26 17:04:16

[Question]: I am trying to cross-validate a random forest in Spark.
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.regression import LabeledPoint  # LabeledPoint lives in mllib, not ml
data = nds.sc.parallelize([
LabeledPoint(0.0, [0,402,6,0]),
LabeledPoint(0.0, [3,500,3,0]),
LabeledPoint(1.0, [1,590,1,1]),
LabeledPoint(1.0, [3,328,5,0]),
LabeledPoint(1.0, [4,351,4,0]),
LabeledPoint(0.0, [2,372,2,0]),
LabeledPoint(0.0, [4,302,5,0]),
LabeledPoint(1.0, [1,387,2,0]),
LabeledPoint(1.0, [1,419,3,0]),
LabeledPoint(0.0, [1,370,5,0]),
LabeledPoint(0.0, [1,410,4,0]),
LabeledPoint(0.0, [2,509,7,1]),
LabeledPoint(0.0, [1,307,5,0]),
LabeledPoint(0.0, [0,424,4,1]),
LabeledPoint(0.0, [1,509,2,1]),
LabeledPoint(1.0, [3,361,4,0]),
])
train = data.toDF(['label', 'features'])
numfolds = 2
rf = RandomForestClassifier(labelCol="label", featuresCol="features")
evaluator = MulticlassClassificationEvaluator()
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [4, 8, 10]) \
    .addGrid(rf.impurity, ['entropy', 'gini']) \
    .addGrid(rf.featureSubsetStrategy, [6, 8, 10]) \
    .build()
pipeline = Pipeline(stages=[rf])
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=numfolds)
model = crossval.fit(train)
I get the following error:
Py4JJavaError Traceback (most recent call last)
<ipython-input-87-7ea70f89086a> in <module>()
66 numFolds=num)
67
---> 68 model = crossval.fit(train)
/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/opt/spark/current/python/pyspark/ml/tuning.py in _fit(self, dataset)
237 train = df.filter(~condition)
238 for j in range(numModels):
--> 239 model = est.fit(train, epm[j])
240 # TODO: duplicate evaluator to take extra params from input
241 metric = eva.evaluate(model.transform(validation, epm[j]))
/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params)
65 elif isinstance(params, dict):
66 if params:
---> 67 return self.copy(params)._fit(dataset)
68 else:
69 return self._fit(dataset)
/opt/spark/current/python/pyspark/ml/pipeline.py in _fit(self, dataset)
211 dataset = stage.transform(dataset)
212 else: # must be an Estimator
--> 213 model = stage.fit(dataset)
214 transformers.append(model)
215 if i < indexOfLastEstimator:
/opt/spark/current/python/pyspark/ml/pipeline.py in fit(self, dataset, params)
67 return self.copy(params)._fit(dataset)
68 else:
---> 69 return self._fit(dataset)
70 else:
71 raise ValueError("Params must be either a param map or a list/tuple of param maps, "
/opt/spark/current/python/pyspark/ml/wrapper.py in _fit(self, dataset)
130
131 def _fit(self, dataset):
--> 132 java_model = self._fit_java(dataset)
133 return self._create_model(java_model)
134
/opt/spark/current/python/pyspark/ml/wrapper.py in _fit_java(self, dataset)
126 :return: fitted Java model
127 """
--> 128 self._transfer_params_to_java()
129 return self._java_obj.fit(dataset._jdf)
130
/opt/spark/current/python/pyspark/ml/wrapper.py in _transfer_params_to_java(self)
80 for param in self.params:
81 if param in paramMap:
---> 82 pair = self._make_java_param_pair(param, paramMap[param])
83 self._java_obj.set(pair)
84
/opt/spark/current/python/pyspark/ml/wrapper.py in _make_java_param_pair(self, param, value)
71 java_param = self._java_obj.getParam(param.name)
72 java_value = _py2java(sc, value)
---> 73 return java_param.w(java_value)
74
75 def _transfer_params_to_java(self):
/opt/spark/current/python/lib/py4j-0.9-src.zip/py4j/java_gateway.py in __call__(self, *args)
811 answer = self.gateway_client.send_command(command)
812 return_value = get_return_value(
--> 813 answer, self.gateway_client, self.target_id, self.name)
814
815 for temp_arg in temp_args:
/opt/spark/current/python/pyspark/sql/utils.py in deco(*a, **kw)
43 def deco(*a, **kw):
44 try:
---> 45 return f(*a, **kw)
46 except py4j.protocol.Py4JJavaError as e:
47 s = e.java_exception.toString()
/opt/spark/current/python/lib/py4j-0.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name)
306 raise Py4JJavaError(
307 "An error occurred while calling 012.\n".
--> 308 format(target_id, ".", name), value)
309 else:
310 raise Py4JError(
Py4JJavaError: An error occurred while calling o1434.w.
: java.lang.ClassCastException: java.lang.Integer cannot be cast to java.lang.String
at org.apache.spark.ml.tree.RandomForestParams$$anonfun$5.apply(treeParams.scala:340)
at org.apache.spark.ml.param.Param.validate(params.scala:71)
at org.apache.spark.ml.param.ParamPair.<init>(params.scala:509)
at org.apache.spark.ml.param.Param.$minus$greater(params.scala:85)
at org.apache.spark.ml.param.Param.w(params.scala:82)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:497)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
It seems paramGrid is not reading my input as a list. Is there an alternative format or a workaround? Any help would be appreciated.
[Answer 1]: You are passing the wrong value to rf.featureSubsetStrategy. It should be a string describing the strategy; the supported values are auto, all, onethird, sqrt, and log2. See: RandomForestClassifier.featureSubsetStrategy.doc.
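For reference, a minimal sketch of the corrected grid; the particular strategy values chosen here are illustrative, not the only valid combination:
# Pass featureSubsetStrategy values as strings, not integers.
paramGrid = ParamGridBuilder() \
    .addGrid(rf.maxDepth, [4, 8, 10]) \
    .addGrid(rf.impurity, ['entropy', 'gini']) \
    .addGrid(rf.featureSubsetStrategy, ['auto', 'onethird', 'sqrt']) \
    .build()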
Also, do not use data.toDF(['label','features']); it does not guarantee the correct column order. Use:
data.toDF()
or, if you want to rename the columns:
from operator import attrgetter
data.map(attrgetter("label", "features")).toDF(["some_name", "some_other_name"])
Finally, the label column has to be indexed, or you have to provide the required metadata. See: How can I declare a Column as a categorical feature in a DataFrame for use in ml.
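A minimal sketch of the indexing approach, using a StringIndexer stage in front of the classifier (the "indexedLabel" column name is illustrative):
from pyspark.ml.feature import StringIndexer

# Index the raw label column so the classifier receives a label column with the required metadata.
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
rf = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features")
pipeline = Pipeline(stages=[indexer, rf])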
[Comments]:
I replaced it with rf.featureSubsetStrategy, ['auto','onethird'] but got the same error. Then I removed rf.featureSubsetStrategy, ['auto','onethird'] from the ParamGridBuilder entirely and got the same error again.
@mikeL Your code has other problems unrelated to the ParamGrid, but I am confident that if you fix this particular issue you will not see the same error.
Yes, it is a different error, and yes, there are other problems. It looks like the features and labels ended up in the wrong positions in the DataFrame.
Thanks for all the feedback. I have not gotten it running yet, but you solved the issue I asked about plus several others! Thanks.

[Answer 2]:
Here is my original code:
lr_parameter_grid_ = ParamGridBuilder().addGrid(lr.maxIter, [50, 200, 500])\
    .addGrid(lr.regParam, [0, 0.3, 1])\
    .addGrid(lr.elasticNetParam, [0, 0.3, 1]).build()
I hit the same error. Then I kept only one parameter (maxIter) and it worked. Later I added regParam and it worked again. Finally I added elasticNetParam and it still worked. I do not know why putting in multiple parameters the first time failed, but starting with one and adding the rest incrementally worked.
Not a permanent, clean solution, but it worked for me.
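For reference, a minimal sketch of that incremental starting point (assuming the same lr estimator as above; the values are illustrative): run with one parameter first, then re-run after appending each additional .addGrid call.
# Step 1: a single-parameter grid; add the regParam and elasticNetParam grids one at a time afterwards.
lr_parameter_grid_ = ParamGridBuilder() \
    .addGrid(lr.maxIter, [50, 200, 500]) \
    .build()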