聚类-----KMeans
Posted soyosuyang
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了聚类-----KMeans相关的知识,希望对你有一定的参考价值。
package Spark_MLlib import org.apache.spark.ml.clustering.KMeans import org.apache.spark.sql.SparkSession import org.apache.spark.ml.linalg.{Vector, Vectors} /** * K均值 */ case class features_schema(features:Vector) object 聚类__KMeans { val spark=SparkSession.builder().master("local[2]").getOrCreate() import spark.implicits._ def main(args: Array[String]): Unit = { val data=spark.sparkContext.textFile("file:///home/soyo/桌面/spark编程测试数据/soyo2.txt") .map(_.split(",")).map(x=>features_schema(Vectors.dense(x(0).toDouble,x(1).toDouble,x(2).toDouble,x(3).toDouble))).toDF() data.show() val KMeansModel=new KMeans().setK(7).setFeaturesCol("features").setPredictionCol("prediction").fit(data) val results=KMeansModel.transform(data) results.show(150) //模型所有的聚类中心(指最后生成的聚类中心,K是几就有几组)的情况 KMeansModel.clusterCenters.foreach(println) //集合内误差平方和(选取K的大小可以参照,使用场景+最大的集合内误差平方的值=较合适的K) val cost=KMeansModel.computeCost(data) println(cost) } }
结果:
+-----------------+
| features|
+-----------------+
|[5.1,3.5,1.4,0.2]|
|[4.9,3.0,1.4,0.2]|
|[4.7,3.2,1.3,0.2]|
|[4.6,3.1,1.5,0.2]|
|[5.0,3.6,1.4,0.2]|
|[5.4,3.9,1.7,0.4]|
|[4.6,3.4,1.4,0.3]|
|[5.0,3.4,1.5,0.2]|
|[4.4,2.9,1.4,0.2]|
|[4.9,3.1,1.5,0.1]|
|[5.4,3.7,1.5,0.2]|
|[4.8,3.4,1.6,0.2]|
|[4.8,3.0,1.4,0.1]|
|[4.3,3.0,1.1,0.1]|
|[5.8,4.0,1.2,0.2]|
|[5.7,4.4,1.5,0.4]|
|[5.4,3.9,1.3,0.4]|
|[5.1,3.5,1.4,0.3]|
|[5.7,3.8,1.7,0.3]|
|[5.1,3.8,1.5,0.3]|
+-----------------+
only showing top 20 rows
+-----------------+----------+
| features|prediction|
+-----------------+----------+
|[5.1,3.5,1.4,0.2]| 0|
|[4.9,3.0,1.4,0.2]| 0|
|[4.7,3.2,1.3,0.2]| 0|
|[4.6,3.1,1.5,0.2]| 0|
|[5.0,3.6,1.4,0.2]| 0|
|[5.4,3.9,1.7,0.4]| 0|
|[4.6,3.4,1.4,0.3]| 0|
|[5.0,3.4,1.5,0.2]| 0|
|[4.4,2.9,1.4,0.2]| 0|
|[4.9,3.1,1.5,0.1]| 0|
|[5.4,3.7,1.5,0.2]| 0|
|[4.8,3.4,1.6,0.2]| 0|
|[4.8,3.0,1.4,0.1]| 0|
|[4.3,3.0,1.1,0.1]| 0|
|[5.8,4.0,1.2,0.2]| 0|
|[5.7,4.4,1.5,0.4]| 0|
|[5.4,3.9,1.3,0.4]| 0|
|[5.1,3.5,1.4,0.3]| 0|
|[5.7,3.8,1.7,0.3]| 0|
|[5.1,3.8,1.5,0.3]| 0|
+-----------------+----------+
only showing top 20 rows
[5.005999999999999,3.4180000000000006,1.4640000000000002,0.2439999999999999]
[6.8538461538461535,3.076923076923076,5.715384615384614,2.0538461538461537]
[5.883606557377049,2.740983606557377,4.388524590163936,1.4344262295081966]
78.94506582597859
以上是关于聚类-----KMeans的主要内容,如果未能解决你的问题,请参考以下文章