40JSON数据源综合案例实战
Posted weiyiming007
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了40JSON数据源综合案例实战相关的知识,希望对你有一定的参考价值。
一、JSON数据源综合案例实战
1、概述
Spark SQL可以自动推断JSON文件的元数据,并且加载其数据,创建一个DataFrame。可以使用SQLContext.read.json()方法,针对一个元素类型为String的RDD,或者是一个JSON文件。
但是要注意的是,这里使用的JSON文件与传统意义上的JSON文件是不一样的。每行都必须,也只能包含一个,单独的,自包含的,有效的JSON对象。不能让一个JSON对象分散在多行。否则会报错。
###
综合性复杂案例:查询成绩为80分以上的学生的基本信息与成绩信息
students.json
"name":"Leo", "score":85 "name":"Marry", "score":99 "name":"Jack", "score":74
2、java案例实现
package cn.spark.study.sql;

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

import scala.Tuple2;

/**
 * JSON data source example.
 *
 * Loads student scores from a JSON file, selects the students whose score is
 * at least 80, looks up their basic information (age) from a second,
 * in-memory JSON data set, joins both by name and writes the combined
 * (name, score, age) records back out as JSON.
 *
 * @author Administrator
 */
public class JSONDataSource {

    /**
     * Entry point: creates the Spark context, runs the job and always stops
     * the context afterwards, even when the job fails.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        SparkConf conf = new SparkConf()
                .setAppName("JSONDataSource");
        JavaSparkContext sc = new JavaSparkContext(conf);
        try {
            run(sc);
        } finally {
            sc.stop();
        }
    }

    /**
     * Runs the whole example against the given Spark context.
     *
     * @param sc the active Java Spark context
     */
    private static void run(JavaSparkContext sc) {
        SQLContext sqlContext = new SQLContext(sc);

        // Create a DataFrame directly from the JSON file.
        DataFrame studentScoresDF = sqlContext.read().json(
                "hdfs://spark1:9000/spark-study/students.json");

        // Register the scores as a temporary table and query the name and
        // score of every student whose score is at least 80.
        studentScoresDF.registerTempTable("student_scores");
        DataFrame goodStudentScoresDF = sqlContext.sql(
                "select name,score from student_scores where score>=80");

        // Convert the DataFrame to an RDD and collect the student names.
        List<String> goodStudentNames = goodStudentScoresDF.javaRDD().map(
                new Function<Row, String>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public String call(Row row) throws Exception {
                        return row.getString(0);
                    }

                }).collect();

        // Without any qualifying student the query below would end in
        // "name in ()", which is not valid SQL, so stop here.
        if (goodStudentNames.isEmpty()) {
            System.out.println("No student scored 80 or above; nothing to write.");
            return;
        }

        // Create a DataFrame from a JavaRDD<String> of JSON strings.
        // Every element must be one complete, self-contained JSON object.
        List<String> studentInfoJSONs = new ArrayList<String>();
        studentInfoJSONs.add("{\"name\":\"Leo\", \"age\":18}");
        studentInfoJSONs.add("{\"name\":\"Marry\", \"age\":17}");
        studentInfoJSONs.add("{\"name\":\"Jack\", \"age\":19}");
        JavaRDD<String> studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs);
        DataFrame studentInfosDF = sqlContext.read().json(studentInfoJSONsRDD);

        // Register the basic information as a temporary table and query the
        // rows that belong to the students selected above.
        studentInfosDF.registerTempTable("student_infos");
        // NOTE(review): names are spliced into the SQL text unescaped; that is
        // fine for this fixed sample data but not for untrusted input.
        StringBuilder sql = new StringBuilder(
                "select name,age from student_infos where name in (");
        for (int i = 0; i < goodStudentNames.size(); i++) {
            if (i > 0) {
                sql.append(",");
            }
            sql.append("'").append(goodStudentNames.get(i)).append("'");
        }
        sql.append(")");

        DataFrame goodStudentInfosDF = sqlContext.sql(sql.toString());

        // Turn both DataFrames into (name, value) pair RDDs and join them by
        // name, giving (name, (score, age)).
        JavaPairRDD<String, Tuple2<Integer, Integer>> goodStudentsRDD =
                goodStudentScoresDF.javaRDD().mapToPair(new NameAndIntColumn())
                        .join(goodStudentInfosDF.javaRDD().mapToPair(new NameAndIntColumn()));

        // Flatten every joined tuple into a Row of (name, score, age).
        JavaRDD<Row> goodStudentRowsRDD = goodStudentsRDD.map(
                new Function<Tuple2<String, Tuple2<Integer, Integer>>, Row>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public Row call(
                            Tuple2<String, Tuple2<Integer, Integer>> tuple)
                            throws Exception {
                        return RowFactory.create(tuple._1, tuple._2._1, tuple._2._2);
                    }

                });

        // Describe the schema of those rows and build the final DataFrame.
        List<StructField> structFields = new ArrayList<StructField>();
        structFields.add(DataTypes.createStructField("name", DataTypes.StringType, true));
        structFields.add(DataTypes.createStructField("score", DataTypes.IntegerType, true));
        structFields.add(DataTypes.createStructField("age", DataTypes.IntegerType, true));
        StructType structType = DataTypes.createStructType(structFields);

        DataFrame goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType);

        // Save the complete information of the selected students as JSON.
        goodStudentsDF.write().format("json").save("hdfs://spark1:9000/spark-study/good-students");
    }

    /**
     * Maps a two-column row (name, integral value) to a (name, value) pair.
     * The second column is read with getLong and then narrowed to Integer.
     */
    private static final class NameAndIntColumn
            implements PairFunction<Row, String, Integer> {

        private static final long serialVersionUID = 1L;

        @Override
        public Tuple2<String, Integer> call(Row row) throws Exception {
            // Going through the decimal string makes an out-of-range value
            // fail with NumberFormatException instead of being truncated.
            return new Tuple2<String, Integer>(row.getString(0),
                    Integer.valueOf(String.valueOf(row.getLong(1))));
        }

    }

}

/*
 * ####
 * students.json (one self-contained JSON object per line):
 * {"name":"Leo", "score":85}
 * {"name":"Marry", "score":99}
 * {"name":"Jack", "score":74}
 */
3、scala案例实现
package cn.spark.study.sql

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.LongType

/**
 * JSON data source example.
 *
 * Loads student scores from a JSON file, selects the students whose score is
 * at least 80, looks up their age in a second, in-memory JSON data set, joins
 * both by name and writes the combined (name, score, age) records as JSON.
 *
 * @author Administrator
 */
object JSONDataSource {

  /**
   * Entry point: creates the Spark context, runs the job and always stops the
   * context afterwards, even when the job fails.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("JSONDataSource")
    val sc = new SparkContext(conf)

    try {
      val sqlContext = new SQLContext(sc)

      // Create the student-score DataFrame from the JSON file.
      val studentScoresDF = sqlContext.read.json("hdfs://spark1:9000/spark-study/students.json")

      // Query the name and score of every student whose score is at least 80,
      // then collect the names.
      studentScoresDF.registerTempTable("student_scores")
      val goodStudentScoresDF = sqlContext.sql("select name,score from student_scores where score>=80")
      val goodStudentNames = goodStudentScoresDF.rdd.map(_.getAs[String]("name")).collect()

      if (goodStudentNames.isEmpty) {
        // Without any qualifying student the query below would end in
        // "name in ()", which is not valid SQL.
        println("No student scored 80 or above; nothing to write.")
      } else {
        // Create the basic-information DataFrame from JSON strings; every
        // element must be one complete, self-contained JSON object.
        val studentInfoJSONs = Array(
          "{\"name\":\"Leo\", \"age\":18}",
          "{\"name\":\"Marry\", \"age\":17}",
          "{\"name\":\"Jack\", \"age\":19}")
        val studentInfoJSONsRDD = sc.parallelize(studentInfoJSONs, 3)
        val studentInfosDF = sqlContext.read.json(studentInfoJSONsRDD)

        // Query the basic information of the students selected above.
        // NOTE(review): names are spliced into the SQL text unescaped; that is
        // fine for this fixed sample data but not for untrusted input.
        studentInfosDF.registerTempTable("student_infos")
        val quotedNames = goodStudentNames.map(name => s"'$name'").mkString(",")
        val sql = s"select name,age from student_infos where name in ($quotedNames)"
        val goodStudentInfosDF = sqlContext.sql(sql)

        // Join score information and basic information by name, giving
        // (name, (score, age)). Both numeric columns are read as Long.
        val scoresByName = goodStudentScoresDF.rdd.map { row =>
          (row.getAs[String]("name"), row.getAs[Long]("score"))
        }
        val agesByName = goodStudentInfosDF.rdd.map { row =>
          (row.getAs[String]("name"), row.getAs[Long]("age"))
        }
        val goodStudentsRDD = scoresByName.join(agesByName)

        // Convert the joined RDD back into a DataFrame with an explicit schema.
        val goodStudentRowsRDD = goodStudentsRDD.map { info =>
          Row(info._1, info._2._1.toInt, info._2._2.toInt)
        }

        val structType = StructType(Array(
          StructField("name", StringType, true),
          StructField("score", IntegerType, true),
          StructField("age", IntegerType, true)))

        val goodStudentsDF = sqlContext.createDataFrame(goodStudentRowsRDD, structType)

        // Save the DataFrame as JSON.
        goodStudentsDF.write.format("json").save("hdfs://spark1:9000/spark-study/good-students-scala")
      }
    } finally {
      sc.stop()
    }
  }

}
以上是关于40JSON数据源综合案例实战的主要内容,如果未能解决你的问题,请参考以下文章
Structured Streaming 实战案例 读取文本数据