MapReduce: Processing NBA Player Data by Requirement (Hadoop)
Posted by 雷米恩
Testing in Local Mode
Data and Requirements

MapReduce is a parallel programming model for running computations over large-scale datasets in parallel.

Input format (comma-separated, one player per line):

Player - Position - Height (m) - Weight (kg) - Age - Seasons played - Games played - Minutes per game - Offensive rating - Defensive rating - Ever an All-Star (是 = yes) - Salary (in units of 10,000 USD)
斯蒂芬-库里,得分后卫,1.91,86,29,7,79,33.38,31.933,4,是,3468
勒布朗-詹姆斯,大前锋,2.03,113,32,13,74,37.75,36.14,8,是,3329
保罗-米尔萨普,中锋,2.03,112,32,10,69,33.95,22.712,7,是,3127
戈登-海沃德,小前锋,2.03,103,27,6,73,34.45,25.382,5,是,2973
......
Requirement: sort the records by salary in descending order and partition them by position.

How this maps onto MapReduce: each whole record becomes a custom WritableComparable key whose compareTo() orders by salary, and a custom Partitioner routes records so that each position goes to its own reduce task. Every position therefore lands in its own output file, already sorted.

Output format:

Player - Position - Height - Weight - Age - Offensive rating - Defensive rating - Salary
凯尔-洛里,控球后卫,1.85m,89,31岁,29.35,4,2870万美元
迈克-康利,控球后卫,1.85m,79,29岁,26.785,4,2853万美元
拉塞尔-维斯布鲁克,控球后卫,1.91m,91,28岁,42.95,9,2853万美元
达米安-利拉德,控球后卫,1.91m,88,27岁,32.857,4,2615万美元
......

戈登-海沃德,小前锋,2.03m,103,27岁,25.382,5,2973万美元
德玛尔-德罗赞,小前锋,2.01m,99,28岁,31.219,5,2774万美元
尼古拉斯-巴图姆,小前锋,2.03m,91,28岁,21.042,6,2243万美元
卡瓦伊-莱纳德,小前锋,2.01m,104,26岁,30.024,5,1887万美元
......

勒布朗-詹姆斯,大前锋,2.03m,113,32岁,36.14,8,3329万美元
卡梅隆-安东尼,大前锋,2.03m,109,33岁,25.298,5,2624万美元
凯文-杜兰特,大前锋,2.06m,109,29岁,29.919,9,2500万美元
奥托-波特,大前锋,2.03m,93,24岁,15.952,5,2477万美元
......

保罗-米尔萨普,中锋,2.03m,112,32岁,22.712,7,3127万美元
布雷克-格里芬,中锋,2.08m,114,28岁,27.488,6,2951万美元
艾尔-霍弗德,中锋,2.08m,111,31岁,19.956,6,2773万美元
安德烈-德拉蒙德,中锋,2.11m,127,24岁,18.751,11,2378万美元
......

斯蒂芬-库里,得分后卫,1.91m,86,29岁,31.933,4,3468万美元
詹姆斯-哈登,得分后卫,1.96m,100,28岁,41.288,7,2830万美元
CJ-麦科勒姆,得分后卫,1.91m,86,26岁,26.522,2,2396万美元
布拉德利-比尔,得分后卫,1.96m,94,24岁,26.568,3,2378万美元
......
Define the Mapper class
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

class MapperNBA extends Mapper<LongWritable, Text, SerializeNBA, Text> {
    SerializeNBA k = new SerializeNBA();
    Text v = new Text();  // stays empty: the whole record travels in the key

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // One CSV line per player, e.g.
        // 斯蒂芬-库里,得分后卫,1.91,86,29,7,79,33.38,31.933,4,是,3468
        String line = value.toString();
        String[] split = line.split(",");
        k.setName(split[0]);
        k.setPos(split[1]);
        k.setHeight(Double.parseDouble(split[2]));
        k.setWeight(Long.parseLong(split[3]));
        k.setAge(Integer.parseInt(split[4]));
        k.setAttack(Double.parseDouble(split[8]));   // offensive rating
        k.setDefend(Integer.parseInt(split[9]));     // defensive rating
        k.setIncome(Long.parseLong(split[11]));      // salary, in units of 10,000 USD
        context.write(k, v);
    }
}
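Design note: the mapper packs the entire record into the key and leaves the value empty. The MapReduce shuffle sorts by key, so putting the salary comparison inside the key's compareTo() (shown further down) is what produces the descending-salary order; no explicit sort step is ever written.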
Define the Reducer class
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

class ReducerNBA extends Reducer<SerializeNBA, Text, SerializeNBA, Text> {
    @Override
    protected void reduce(SerializeNBA key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Identity reduce: the shuffle has already sorted and partitioned
        // the records, so just write each one back out.
        for (Text value : values) {
            context.write(key, value);
        }
    }
}
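The reducer is an identity pass: by the time reduce() runs, the keys arrive sorted by descending salary and the partitioner has already routed each position to its own reduce task. TextOutputFormat then renders each output line via the key's toString() (followed by a trailing tab from the empty Text value).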
Define a custom key type and implement the sort
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

class SerializeNBA implements WritableComparable<SerializeNBA> {
    private String name;
    private String pos;
    private double height;
    private long weight;
    private int age;
    private double attack;
    private int defend;
    private long income;

    public SerializeNBA() {}  // Hadoop requires a no-arg constructor for deserialization

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public String getPos() { return pos; }
    public void setPos(String pos) { this.pos = pos; }

    public double getHeight() { return height; }
    public void setHeight(double height) { this.height = height; }

    public long getWeight() { return weight; }
    public void setWeight(long weight) { this.weight = weight; }

    public int getAge() { return age; }
    public void setAge(int age) { this.age = age; }

    public double getAttack() { return attack; }
    public void setAttack(double attack) { this.attack = attack; }

    public int getDefend() { return defend; }
    public void setDefend(int defend) { this.defend = defend; }

    public long getIncome() { return income; }
    public void setIncome(long income) { this.income = income; }

    // write() and readFields() serialize and deserialize the record so it can
    // be shipped between map and reduce tasks; the field order must match exactly.
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(name);
        dataOutput.writeUTF(pos);
        dataOutput.writeDouble(height);
        dataOutput.writeLong(weight);
        dataOutput.writeInt(age);
        dataOutput.writeDouble(attack);
        dataOutput.writeInt(defend);
        dataOutput.writeLong(income);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.name = dataInput.readUTF();
        this.pos = dataInput.readUTF();
        this.height = dataInput.readDouble();
        this.weight = dataInput.readLong();
        this.age = dataInput.readInt();
        this.attack = dataInput.readDouble();
        this.defend = dataInput.readInt();
        this.income = dataInput.readLong();
    }

    @Override
    public String toString() {
        // Matches the output format above: height in metres, age in 岁, salary in 万美元
        return name + "," + pos + "," + height + "m," + weight + ","
                + age + "岁," + attack + "," + defend + "," + income + "万美元";
    }

    // Higher salaries sort first. Tie-break on name so that two different
    // players with the same salary (e.g. 迈克-康利 and 拉塞尔-维斯布鲁克, both
    // 2853) are not merged into a single reduce group.
    @Override
    public int compareTo(SerializeNBA o) {
        int cmp = Long.compare(o.income, this.income);  // descending
        return cmp != 0 ? cmp : this.name.compareTo(o.name);
    }
}
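As a quick sanity check, a minimal standalone snippet like the following (not part of the original post; the class name and call pattern are my own, the values are Curry's record from the sample data, and it must sit in the same package since SerializeNBA is package-private) round-trips one record through write()/readFields() and confirms toString() and compareTo() behave as expected:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class SerializeNBACheck {
    public static void main(String[] args) throws IOException {
        // Build one record by hand, using values from the sample input above
        SerializeNBA curry = new SerializeNBA();
        curry.setName("斯蒂芬-库里");
        curry.setPos("得分后卫");
        curry.setHeight(1.91);
        curry.setWeight(86);
        curry.setAge(29);
        curry.setAttack(31.933);
        curry.setDefend(4);
        curry.setIncome(3468);

        // Serialize to a byte buffer, then deserialize into a fresh instance
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        curry.write(new DataOutputStream(buffer));
        SerializeNBA copy = new SerializeNBA();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));

        System.out.println(copy);                  // 斯蒂芬-库里,得分后卫,1.91m,86,29岁,31.933,4,3468万美元
        System.out.println(curry.compareTo(copy)); // 0: same salary and name
    }
}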
Define the Partitioner class
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

class PartitionerNBA extends Partitioner<SerializeNBA, Text> {
    @Override
    public int getPartition(SerializeNBA serializeNBA, Text text, int numPartitions) {
        // Route each record by position, read directly from the key rather
        // than re-parsing its toString() output.
        String pos = serializeNBA.getPos();
        if (pos.equals("大前锋")) {          // power forward
            return 0;
        } else if (pos.equals("小前锋")) {   // small forward
            return 1;
        } else if (pos.equals("控球后卫")) { // point guard
            return 2;
        } else if (pos.equals("中锋")) {     // center
            return 3;
        } else {                             // 得分后卫 (shooting guard) and anything else
            return 4;
        }
    }
}
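One constraint to keep in mind: getPartition() must return an index in the range [0, numPartitions), which is why the driver below sets the number of reduce tasks to exactly 5; returning an index outside that range fails the job at runtime with an "Illegal partition" error.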
Define the Driver (main) class
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class DriverNBA {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(DriverNBA.class);
        job.setMapperClass(MapperNBA.class);
        job.setReducerClass(ReducerNBA.class);
        job.setMapOutputKeyClass(SerializeNBA.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(SerializeNBA.class);
        job.setOutputValueClass(Text.class);
        // Five reduce tasks, one per position, matching PartitionerNBA
        job.setPartitionerClass(PartitionerNBA.class);
        job.setNumReduceTasks(5);
        // Local-mode paths; the output directory must not already exist
        FileInputFormat.setInputPaths(job, new Path("E:\\com.raymone.hadoop\\data\\NBA"));
        FileOutputFormat.setOutputPath(job, new Path("E:\\com.raymone.hadoop\\data\\NBA_OUT"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
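With five reduce tasks, the output directory ends up containing five files, part-r-00000 through part-r-00004: one per position, each internally sorted by descending salary.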
Results
Testing on a Hadoop Cluster
Export the jar and upload it to the cluster

Upload the jar package and rename it nba.jar (mv /com.raymone.hadoop-1.0-SNAPSHOT.jar /nba.jar)

Run the jar (upload the input data to HDFS first)
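A typical sequence (all paths here are illustrative placeholders, not taken from the post) would be hdfs dfs -put NBA.csv /nba/input to stage the data, then hadoop jar nba.jar DriverNBA to submit the job. Note that the driver above hard-codes local Windows paths, so a cluster run needs the input and output paths changed to HDFS locations (e.g. read from args) first.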
Results

Open the NameNode web UI on port 9870:

The output data is now stored across the HDFS DataNodes.

You can adapt the job to emit whatever fields you need.