2018-07-30: MapReduce Partitioner Programming Example
1. The EmpSalaryBean class
package cn.sjq.mr.part;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
* A JavaBean for one employee salary record, implementing Hadoop's Writable
* interface so MapReduce can serialize it across the shuffle
* @author songjq
*
*/
public class EmpSalaryBean implements Writable {
/*
Member fields, one per column of the input CSV:
c_oid
c_employee_name
c_second_company_name
c_third_company_name
c_fourth_company_name
c_company_name
c_dept_name
c_sub_total
c_com_fund_payamt
*/
private int seq;
private String c_oid;
private String c_employee_name;
private String c_second_company_name;
private String c_third_company_name;
private String c_fourth_company_name;
private String c_company_name;
private String c_dept_name;
private float c_sub_total;
private float c_com_fund_payamt;
public int getSeq() {
return seq;
}
public void setSeq(int seq) {
this.seq = seq;
}
public String getC_oid() {
return c_oid;
}
public void setC_oid(String c_oid) {
this.c_oid = c_oid;
}
public String getC_employee_name() {
return c_employee_name;
}
public void setC_employee_name(String c_employee_name) {
this.c_employee_name = c_employee_name;
}
public String getC_second_company_name() {
return c_second_company_name;
}
public void setC_second_company_name(String c_second_company_name) {
this.c_second_company_name = c_second_company_name;
}
public String getC_third_company_name() {
return c_third_company_name;
}
public void setC_third_company_name(String c_third_company_name) {
this.c_third_company_name = c_third_company_name;
}
public String getC_fourth_company_name() {
return c_fourth_company_name;
}
public void setC_fourth_company_name(String c_fourth_company_name) {
this.c_fourth_company_name = c_fourth_company_name;
}
public String getC_company_name() {
return c_company_name;
}
public void setC_company_name(String c_company_name) {
this.c_company_name = c_company_name;
}
public String getC_dept_name() {
return c_dept_name;
}
public void setC_dept_name(String c_dept_name) {
this.c_dept_name = c_dept_name;
}
public float getC_sub_total() {
return c_sub_total;
}
public void setC_sub_total(float c_sub_total) {
this.c_sub_total = c_sub_total;
}
public float getC_com_fund_payamt() {
return c_com_fund_payamt;
}
public void setC_com_fund_payamt(float c_com_fund_payamt) {
this.c_com_fund_payamt = c_com_fund_payamt;
}
//Deserialization: read the fields in exactly the order write() emits them
@Override
public void readFields(DataInput in) throws IOException {
this.seq = in.readInt();
this.c_oid = in.readUTF();
this.c_employee_name = in.readUTF();
this.c_second_company_name = in.readUTF();
this.c_third_company_name = in.readUTF();
this.c_fourth_company_name = in.readUTF();
this.c_company_name = in.readUTF();
this.c_dept_name = in.readUTF();
this.c_sub_total = in.readFloat();
this.c_com_fund_payamt = in.readFloat();
}
//Serialization: write the fields in a fixed order
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(this.seq);
out.writeUTF(this.c_oid);
out.writeUTF(this.c_employee_name);
out.writeUTF(this.c_second_company_name);
out.writeUTF(this.c_third_company_name);
out.writeUTF(this.c_fourth_company_name);
out.writeUTF(this.c_company_name);
out.writeUTF(this.c_dept_name);
out.writeFloat(this.c_sub_total);
out.writeFloat(this.c_com_fund_payamt);
}
@Override
public String toString() {
return this.seq+" "+this.c_oid+" "+
this.c_employee_name+" "+this.c_second_company_name+" "+
this.c_third_company_name+" "+this.c_fourth_company_name+" "+
this.c_company_name+" "+this.c_dept_name+" "+
this.c_sub_total+" "+this.c_com_fund_payamt+" ";
}
}
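Because Writable serialization is purely positional, it is worth sanity-checking that write() and readFields() handle the fields in exactly the same order before wiring the bean into a job; a mismatch silently corrupts every record that crosses the shuffle. A minimal round-trip sketch (the class name and all field values here are invented for illustration, not part of the original post):
package cn.sjq.mr.part;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
public class EmpSalaryBeanRoundTrip {
public static void main(String[] args) throws Exception {
//hypothetical values, for illustration only
EmpSalaryBean in = new EmpSalaryBean();
in.setSeq(1);
in.setC_oid("OID-001");
in.setC_employee_name("test");
in.setC_second_company_name("second");
in.setC_third_company_name("third");
in.setC_fourth_company_name("fourth");
in.setC_company_name("company");
in.setC_dept_name("dept");
in.setC_sub_total(3500f);
in.setC_com_fund_payamt(200f);
//serialize the bean into an in-memory buffer
ByteArrayOutputStream buf = new ByteArrayOutputStream();
in.write(new DataOutputStream(buf));
//deserialize into a fresh bean; printing it should echo the values set above
EmpSalaryBean out = new EmpSalaryBean();
out.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
System.out.println(out);
}
}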
2. Mapper, Reducer, Partitioner, and Job implementation
package cn.sjq.mr.part;
import java.io.IOException;
import java.util.Iterator;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.junit.Test;
/**
* Custom partitioning:
* records are partitioned by employee salary into the ranges
* [0,2000) [2000,4000) [4000,6000) [6000,8000) [8000,+inf)
* The Mapper, Reducer, and Partitioner are all implemented as static nested
* classes of this one class, and the job is submitted from a test method
* @author songjq
*
*/
public class EmployeePart {
/**
* Partitioning operates on the map output <k2,v2>, so here k2 is the
* employee salary and v2 is the employee record
* @author songjq
*
*/
static class EmployeePartMapper extends Mapper<LongWritable, Text, FloatWritable, EmpSalaryBean> {
private FloatWritable tkey = new FloatWritable();
private EmpSalaryBean tvalue = new EmpSalaryBean();
@Override
protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
//take one line of input
String line = v1.toString();
//split it into comma-separated fields
String[] fds = StringUtils.split(line, ",");
//populate the EmpSalaryBean from the parsed fields
tvalue.setSeq(Integer.parseInt(fds[0]));
tvalue.setC_oid(fds[1]);
tvalue.setC_employee_name(fds[2]);
tvalue.setC_second_company_name(fds[3]);
tvalue.setC_third_company_name(fds[4]);
tvalue.setC_fourth_company_name(fds[5]);
tvalue.setC_company_name(fds[6]);
tvalue.setC_dept_name(fds[7]);
tvalue.setC_sub_total(Float.parseFloat(fds[8]));
tvalue.setC_com_fund_payamt(Float.parseFloat(fds[9]));
//key on the salary so the custom partitioner can bucket the record by it
tkey.set(tvalue.getC_sub_total());
//emit to the shuffle; the partitioner decides which reducer receives it
context.write(tkey, tvalue);
}
}
/**
* Writes the partitioned data to HDFS; each reduce task produces one output file
* @author songjq
*
*/
static class EmployeePartReducer extends Reducer<FloatWritable, EmpSalaryBean, NullWritable, EmpSalaryBean> {
@Override
protected void reduce(FloatWritable k3, Iterable<EmpSalaryBean> v3, Context ctx)
throws IOException, InterruptedException {
//several employees may share one salary key, so emit every bean in the group
Iterator<EmpSalaryBean> iterator = v3.iterator();
while(iterator.hasNext()) {
EmpSalaryBean v4 = iterator.next();
ctx.write(NullWritable.get(), v4);
}
}
}
/**
* Custom partitioner for the job
* Partitioner<FloatWritable, EmpSalaryBean> matches the Mapper's <k2,v2> output types
* @author songjq
*
*/
static class EmployeeMyPartitioner extends Partitioner<FloatWritable, EmpSalaryBean>{
/*
* Split the data into 5 partitions by salary range
*/
@Override
public int getPartition(FloatWritable k2, EmpSalaryBean v2, int reduceNum) {
if(k2.get()<2000) {
//[0-2000)
return 0;
}else if(k2.get()<4000) {
//[2000-4000)
return 1;
}else if(k2.get()<6000) {
//[4000-6000)
return 2;
}else if(k2.get()<8000) {
//[6000-8000)
return 3;
}else {
//[8000,+inf)
return 4;
}
}
}
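/*
* A quick sanity check of the boundary behavior (the salary values below are
* hypothetical, not taken from the original data set): 1500 falls in
* partition 0, 2000 in 1, 5999.5 in 2, and 9000 in 4.
*/
@Test
public void testGetPartition() {
EmployeeMyPartitioner part = new EmployeeMyPartitioner();
EmpSalaryBean dummy = new EmpSalaryBean();
org.junit.Assert.assertEquals(0, part.getPartition(new FloatWritable(1500f), dummy, 5));
org.junit.Assert.assertEquals(1, part.getPartition(new FloatWritable(2000f), dummy, 5));
org.junit.Assert.assertEquals(2, part.getPartition(new FloatWritable(5999.5f), dummy, 5));
org.junit.Assert.assertEquals(4, part.getPartition(new FloatWritable(9000f), dummy, 5));
}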
/**
* Submit the MapReduce job (run here as a local JUnit test)
* @throws Exception
*/
@Test
public void EmployeePartJobSubmiter() throws Exception{
Job job = Job.getInstance(new Configuration());
job.setJarByClass(EmployeePart.class);
job.setMapperClass(EmployeePartMapper.class);
job.setReducerClass(EmployeePartReducer.class);
job.setMapOutputKeyClass(FloatWritable.class);
job.setMapOutputValueClass(EmpSalaryBean.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(EmpSalaryBean.class);
//register the custom partitioner
job.setPartitionerClass(EmployeeMyPartitioner.class);
//set the number of reduce tasks; it must be >= the number of partitions
//(fewer fails with an illegal-partition error; more just yields empty output files)
job.setNumReduceTasks(5);
FileInputFormat.setInputPaths(job, new Path("D:\\test\\tmp\\part\\empsalary.csv"));
FileOutputFormat.setOutputPath(job, new Path("D:\\test\\tmp\\part\\output1"));
job.waitForCompletion(true);
}
}
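With five reduce tasks, a successful run leaves five files in the output directory, part-r-00000 through part-r-00004, one per salary band. The JUnit entry point above is handy for local runs; for submission to a cluster with hadoop jar, a conventional main() driver is the usual pattern. A sketch under that assumption (the driver class name and the argument handling are my own, not from the original post):
package cn.sjq.mr.part;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
//hypothetical standalone driver reusing the nested classes from EmployeePart
public class EmployeePartDriver {
public static void main(String[] args) throws Exception {
if (args.length != 2) {
System.err.println("Usage: EmployeePartDriver <input> <output>");
System.exit(2);
}
Job job = Job.getInstance(new Configuration());
job.setJarByClass(EmployeePartDriver.class);
job.setMapperClass(EmployeePart.EmployeePartMapper.class);
job.setReducerClass(EmployeePart.EmployeePartReducer.class);
job.setMapOutputKeyClass(FloatWritable.class);
job.setMapOutputValueClass(EmpSalaryBean.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(EmpSalaryBean.class);
job.setPartitionerClass(EmployeePart.EmployeeMyPartitioner.class);
job.setNumReduceTasks(5);
//take the input and output paths from the command line instead of hard-coding them
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}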