2018-08-01 期 MapReduce实现多表查询等值连接
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了2018-08-01 期 MapReduce实现多表查询等值连接相关的知识,希望对你有一定的参考价值。
1、EmployeeDeptBean
package cn.sjq.bigdata.mr.equal.join;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * JavaBean holding one row of the EMP table joined with its DEPT row.
 *
 * <p>Sample input (CSV):
 * <pre>
 * emp:   7369,SMITH,CLERK,7902,1980/12/17,800,,20
 *        7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30
 *        7521,WARD,SALESMAN,7698,1981/2/22,1250,500,30
 * dept:  10,ACCOUNTING,NEW YORK
 *        20,RESEARCH,DALLAS
 *        30,SALES,CHICAGO
 *        40,OPERATIONS,BOSTON
 * </pre>
 *
 * <p>The stated goal is output ordered by DEPTNO ascending, then SAL descending.
 * To be sortable the bean implements {@link WritableComparable} (not
 * {@code WritableComparator}, as an earlier version of this comment said).
 * The ordering defined by {@link #compareTo(EmployeeDeptBean)} only takes
 * effect where instances are actually compared.
 *
 * <p>{@code equals}/{@code hashCode} are not overridden, so the natural
 * ordering is not consistent with {@code equals}.
 *
 * @author songjq
 */
public class EmployeeDeptBean implements WritableComparable<EmployeeDeptBean> {

    /** Value of {@link #getFlag()} for a record that came from the emp table. */
    public static final int FLAG_EMP = 0;

    /** Value of {@link #getFlag()} for a record that came from the dept table. */
    public static final int FLAG_DEPT = 1;

    // Member fields; strings default to "" so that serialization never sees null.
    private int deptNo = 0;
    private int empno = 0;
    private String dname = "";
    private String loc = "";
    private String ename = "";
    private String job = "";
    private int mgr = 0;
    private String hiredate = "";
    private float salary = 0f;
    private float comm = 0f;

    // Source-table marker: 0 = emp, 1 = dept.
    private int flag = FLAG_EMP;

    public int getFlag() {
        return flag;
    }

    public void setFlag(int flag) {
        this.flag = flag;
    }

    public int getDeptNo() {
        return deptNo;
    }

    public void setDeptNo(int deptNo) {
        this.deptNo = deptNo;
    }

    public int getEmpno() {
        return empno;
    }

    public void setEmpno(int empno) {
        this.empno = empno;
    }

    public String getDname() {
        return dname;
    }

    public void setDname(String dname) {
        this.dname = dname;
    }

    public String getLoc() {
        return loc;
    }

    public void setLoc(String loc) {
        this.loc = loc;
    }

    public String getEname() {
        return ename;
    }

    public void setEname(String ename) {
        this.ename = ename;
    }

    public String getJob() {
        return job;
    }

    public void setJob(String job) {
        this.job = job;
    }

    public int getMgr() {
        return mgr;
    }

    public void setMgr(int mgr) {
        this.mgr = mgr;
    }

    public String getHiredate() {
        return hiredate;
    }

    public void setHiredate(String hiredate) {
        this.hiredate = hiredate;
    }

    public float getSalary() {
        return salary;
    }

    public void setSalary(float salary) {
        this.salary = salary;
    }

    public float getComm() {
        return comm;
    }

    public void setComm(float comm) {
        this.comm = comm;
    }

    /** Creates a bean with all numeric fields 0, all strings empty and flag 0 (emp). */
    public EmployeeDeptBean() {
    }

    /**
     * Creates a fully populated bean.
     *
     * @param flag 0 for an emp record, 1 for a dept record
     */
    public EmployeeDeptBean(int deptNo, int empno, String dname, String loc, String ename, String job, int mgr,
            String hiredate, float salary, float comm, int flag) {
        this.deptNo = deptNo;
        this.empno = empno;
        this.dname = dname;
        this.loc = loc;
        this.ename = ename;
        this.job = job;
        this.mgr = mgr;
        this.hiredate = hiredate;
        this.salary = salary;
        this.comm = comm;
        this.flag = flag;
    }

    /**
     * Deserializes the bean. Fields are read in exactly the order
     * {@link #write(DataOutput)} writes them.
     *
     * @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.deptNo = in.readInt();
        this.empno = in.readInt();
        this.dname = in.readUTF();
        this.loc = in.readUTF();
        this.ename = in.readUTF();
        this.job = in.readUTF();
        this.mgr = in.readInt();
        this.hiredate = in.readUTF();
        this.salary = in.readFloat();
        this.comm = in.readFloat();
        this.flag = in.readInt();
    }

    /**
     * Serializes the bean. A string field that was set to {@code null} is
     * written as the empty string, because {@code writeUTF(null)} would throw
     * a {@link NullPointerException}.
     *
     * @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.deptNo);
        out.writeInt(this.empno);
        out.writeUTF(nullToEmpty(this.dname));
        out.writeUTF(nullToEmpty(this.loc));
        out.writeUTF(nullToEmpty(this.ename));
        out.writeUTF(nullToEmpty(this.job));
        out.writeInt(this.mgr);
        out.writeUTF(nullToEmpty(this.hiredate));
        out.writeFloat(this.salary);
        out.writeFloat(this.comm);
        out.writeInt(this.flag);
    }

    /** Returns {@code value}, or "" when it is {@code null}. */
    private static String nullToEmpty(String value) {
        return value == null ? "" : value;
    }

    /**
     * Orders beans by department number ascending, then salary descending,
     * then employee number ascending.
     *
     * <p>The previous implementation compared {@code this.salary} with itself,
     * so the salary step always returned -1 and the method could never return 0.
     * The employee-number tie-breaker keeps two different employees with the
     * same department and salary from comparing as equal.
     *
     * @see java.lang.Comparable#compareTo(java.lang.Object)
     */
    @Override
    public int compareTo(EmployeeDeptBean o) {
        // Step 1: department number, ascending.
        int byDept = Integer.compare(this.deptNo, o.deptNo);
        if (byDept != 0) {
            return byDept;
        }
        // Step 2: salary, descending (operands swapped on purpose).
        int bySalaryDesc = Float.compare(o.salary, this.salary);
        if (bySalaryDesc != 0) {
            return bySalaryDesc;
        }
        // Step 3: employee number, ascending, as a deterministic tie-breaker.
        return Integer.compare(this.empno, o.empno);
    }

    /**
     * Renders the bean as one output line. Column order as produced by the code:
     * DEPTNO EMPNO DNAME ENAME JOB MGR HIREDATE SAL COMM LOC.
     * DEPTNO and EMPNO are followed by a single space; the remaining columns
     * are padded or truncated to fixed widths by {@link #formatStr(String, int)}.
     *
     * @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        return this.deptNo + " " + this.empno + " " + formatStr(this.dname, 12) + formatStr(this.ename, 12)
                + formatStr(this.job, 12) + formatStr(String.valueOf(this.mgr), 6) + formatStr(this.hiredate, 12)
                + formatStr(String.valueOf(this.salary), 6) + " " + formatStr(String.valueOf(this.comm), 6) + " "
                + formatStr(this.loc, 16);
    }

    /**
     * Pads {@code str} with trailing spaces, or truncates it, so that the
     * result is exactly {@code length} characters long.
     *
     * <p>Width is measured in {@code char}s for both padding and truncation.
     * The previous version measured in platform-charset bytes but truncated by
     * characters, which could throw {@link StringIndexOutOfBoundsException} for
     * multibyte text whose byte length exceeded {@code length} while its
     * character count did not.
     *
     * @param str    text to format; {@code null} is treated as ""
     * @param length target width in characters
     * @return the padded or truncated text
     */
    public static String formatStr(String str, int length) {
        String text = (str == null) ? "" : str;
        if (text.length() >= length) {
            // Exact fit returns the text unchanged; longer text is cut to width.
            return text.substring(0, length);
        }
        StringBuilder padded = new StringBuilder(length);
        padded.append(text);
        while (padded.length() < length) {
            padded.append(' ');
        }
        return padded.toString();
    }
}
2、EmpDeptEqualJoin
package cn.sjq.bigdata.mr.equal.join;
import java.io.IOException;
import java.text.NumberFormat;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.junit.Test;
/**
* 所有的Mapper类、Reducer类、Job类均在本类采用采用匿名内部类实现
* @author songjq
*
*/
public class EmpDeptEqualJoin {
/**
* Mapper端
* /scott下存在emp.csv,dept.csv
* 输入路径:/scott
* k1:输入偏移量
* v1:输入数据,输入可能为emp表,也可能为dept表
* k2:部门号DeptNo
* v2:输出EmployeeDeptBean
*
* 原理:这里主要利用MapReduce相同key输出到相同Reduce处理特性来实现表的等值连接
* 这里将所有相同部门号的部门信息及相同部门号的员工信息输出到同一个reduce进行合并处理
* @author songjq
*
*/
static class EmpDeptEqualJoinMapper extends Mapper<LongWritable, Text, IntWritable, EmployeeDeptBean> {
private EmployeeDeptBean ed = null;
private IntWritable tkey = new IntWritable();
@Override
protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {
// 获取输入数据,可能为emp.csv,也可能为dept.csv
String line = v1.toString();
// 分词
String[] fileds = line.split(",");
// 获取文件输入对象
FileSplit filesplit = (FileSplit) context.getInputSplit();
// 获取输入文件名称
String fname = filesplit.getPath().getName();
ed = new EmployeeDeptBean();
if (fname.equals("emp.csv")) {
// 7369,SMITH,CLERK,7902,1980/12/17,800,,20
ed.setDeptNo(Integer.parseInt(fileds[7]));
ed.setFlag(0);
ed.setEmpno(Integer.parseInt(fileds[0]));
ed.setEname(fileds[1]);
ed.setJob(fileds[2]);
try {
ed.setMgr(Integer.parseInt(fileds[3]));
}catch (Exception e) {
ed.setMgr(0);
}
ed.setHiredate(fileds[4]);
try {
// 防止fileds[5]为空抛空指针异常
ed.setSalary(Float.parseFloat(fileds[5]));
} catch (Exception e) {
ed.setSalary(0);
}
try {
// 防止fileds[6]为空抛空指针异常
ed.setComm(Float.parseFloat(fileds[6]));
} catch (Exception e) {
ed.setComm(0);
}
} else if (fname.equals("dept.csv")) {
ed.setFlag(1);
ed.setDeptNo(Integer.parseInt(fileds[0]));
ed.setDname(fileds[1]);
ed.setLoc(fileds[2]);
}
// 将EmployeeDeptBean对象输出到Reducer
tkey.set(ed.getDeptNo());
context.write(tkey , ed);
}
}
/**
* Reducer端
* k3:部门号
* v3:EmployeeDeptBean集合
* k4:NullWritable
* v4:EmployeeDeptBean
* 原理说明:
* 利用相同Key输出到相同的Reduce处理这一特性,将部门信息和员工信息进行等值合并(利用部门号相同这一条件)
* @author songjq
*
*/
static class EmpDeptEqualJoinReducer
extends Reducer<IntWritable, EmployeeDeptBean, NullWritable, EmployeeDeptBean> {
@Override
protected void reduce(IntWritable k3, Iterable<EmployeeDeptBean> v3, Context ctx)
throws IOException, InterruptedException {
Iterator<EmployeeDeptBean> iterator = v3.iterator();
//定义存放员工信息对象list
ArrayList<EmployeeDeptBean> list = new ArrayList<EmployeeDeptBean>();
//定义部门属性信息
String dname = "";
String loc = "";
while(iterator.hasNext()) {
EmployeeDeptBean ed = iterator.next();
//定义emp表和dept表标志位flag, 0:emp 1:dept
if(ed.getFlag()==0) {
//不能直接list.add(ed);ed为同一个对象
EmployeeDeptBean edtmp = new EmployeeDeptBean();
//也不能直接将edtmp = ed;
edtmp.setDeptNo(ed.getDeptNo());
edtmp.setEname(ed.getEname());
edtmp.setEmpno(ed.getEmpno());
edtmp.setMgr(ed.getMgr());
edtmp.setJob(ed.getJob());
edtmp.setHiredate(ed.getHiredate());
edtmp.setSalary(ed.getSalary());
edtmp.setComm(ed.getComm());
list.add(edtmp);
}else if(ed.getFlag()==1) {
dname = ed.getDname();
loc = ed.getLoc();
}
}
//将部门信息合并到员工对象list
for(EmployeeDeptBean fulled:list) {
fulled.setDname(dname);
fulled.setLoc(loc);
ctx.write(NullWritable.get(), fulled);
}
}
}
/**
* 提交job到hadoop集群执行
* @param args
* @throws Exception
*/
@Test
public void EmpDeptEqualJoinJob() throws Exception {
//定义job
Job job = Job.getInstance(new Configuration());
//设置mapper及输出key,value数据类型
job.setMapperClass(EmpDeptEqualJoinMapper.class);
job.setMapOutputKeyClass(IntWritable.class);
job.setMapOutputValueClass(EmployeeDeptBean.class);
//设置reducer及输出key,value数据类型
job.setReducerClass(EmpDeptEqualJoinReducer.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(EmployeeDeptBean.class);
//设置输入输出路径
FileInputFormat.setInputPaths(job, new Path("D:\test\tmp\join\equal"));
FileOutputFormat.setOutputPath(job, new Path("D:\test\tmp\join\equal_out1"));
//提交任务job
job.waitForCompletion(true);
}
}
3、执行结果(说明:本作业中EmployeeDeptBean作为value输出,key为部门号,因此compareTo不参与排序,结果只按DEPTNO分组,同一部门内并未按SAL降序排列)
10 7934 ACCOUNTING MILLER CLERK 7782 1982/1/23 1300.0 0.0 NEW YORK
10 7839 ACCOUNTING KING PRESIDENT 0 1981/11/17 5000.0 0.0 NEW YORK
10 7782 ACCOUNTING CLARK MANAGER 7839 1981/6/9 2450.0 0.0 NEW YORK
20 7876 RESEARCH ADAMS CLERK 7788 1987/5/23 1100.0 0.0 DALLAS
20 7788 RESEARCH SCOTT ANALYST 7566 1987/4/19 3000.0 0.0 DALLAS
20 7369 RESEARCH SMITH CLERK 7902 1980/12/17 800.0 0.0 DALLAS
20 7566 RESEARCH JONES MANAGER 7839 1981/4/2 2975.0 0.0 DALLAS
20 7902 RESEARCH FORD ANALYST 7566 1981/12/3 3000.0 0.0 DALLAS
30 7844 SALES TURNER SALESMAN 7698 1981/9/8 1500.0 0.0 CHICAGO
30 7499 SALES ALLEN SALESMAN 7698 1981/2/20 1600.0 300.0 CHICAGO
30 7698 SALES BLAKE MANAGER 7839 1981/5/1 2850.0 0.0 CHICAGO
30 7654 SALES MARTIN SALESMAN 7698 1981/9/28 1250.0 1400.0 CHICAGO
30 7521 SALES WARD SALESMAN 7698 1981/2/22 1250.0 500.0 CHICAGO
30 7900 SALES JAMES CLERK 7698 1981/12/3 950.0 0.0 CHICAGO
以上是关于2018-08-01 期 MapReduce实现多表查询等值连接的主要内容,如果未能解决你的问题,请参考以下文章
2018-08-07 期 MapReduce模拟实现热销商品排行
2018-07-29期 MapReduce实现对字符串进行排序
2018-08-05 期 MapReduce实现每个单词在每个文件中坐标信息统计