2018-08-01 期 MapReduce实现多表查询等值连接

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了2018-08-01 期 MapReduce实现多表查询等值连接相关的知识,希望对你有一定的参考价值。

1、EmployeeDeptBean

package cn.sjq.bigdata.mr.equal.join;

import java.io.DataInput;

import java.io.DataOutput;

import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**

* Emp表和Dept表合并后的JavaBean

* 数据格式如下:

* Emp表:

* 7369,SMITH,CLERK,7902,1980/12/17,800,,20

7499,ALLEN,SALESMAN,7698,1981/2/20,1600,300,30

7521,WARD,SALESMAN,7698,1981/2/22,1250,500,30

Dept表:

10,ACCOUNTING,NEW YORK

20,RESEARCH,DALLAS

30,SALES,CHICAGO

40,OPERATIONS,BOSTON

要求:通过MapReduce处理后,输出数据格式为(要求按照DEPTNO升序,SAL降序):

DEPTNO EMPNO DNAME LOC ENAME JOB MGR HIREDATE SAL COMM

10 7369 ACCOUNTING NEW YORK SMITH SALESMAN 7902 17-DEC-80 1600 1400

如果需要对输出并排序,因此需要继承WritableComparator

* @author songjq

*

*/

public class EmployeeDeptBean implements WritableComparable<EmployeeDeptBean> {

//定义成员属性

private int deptNo = 0;

private int empno = 0;

private String dname = "";

private String loc = "";

private String ename = "";

private String job = "";

private int mgr = 0;

private String hiredate = "";

private float salary = 0f;

private float comm = 0f;

//定义emp表和dept表标志位flag 0:emp 1:dept

private int flag = 0;

public int getFlag() {

return flag;

}

public void setFlag(int flag) {

this.flag = flag;

}

public int getDeptNo() {

return deptNo;

}

public void setDeptNo(int deptNo) {

this.deptNo = deptNo;

}

public int getEmpno() {

return empno;

}

public void setEmpno(int empno) {

this.empno = empno;

}

public String getDname() {

return dname;

}

public void setDname(String dname) {

this.dname = dname;

}

public String getLoc() {

return loc;

}

public void setLoc(String loc) {

this.loc = loc;

}

public String getEname() {

return ename;

}

public void setEname(String ename) {

this.ename = ename;

}

public String getJob() {

return job;

}

public void setJob(String job) {

this.job = job;

}

public int getMgr() {

return mgr;

}

public void setMgr(int mgr) {

this.mgr = mgr;

}

public String getHiredate() {

return hiredate;

}

public void setHiredate(String hiredate) {

this.hiredate = hiredate;

}

public float getSalary() {

return salary;

}

public void setSalary(float salary) {

this.salary = salary;

}

public float getComm() {

return comm;

}

public void setComm(float comm) {

this.comm = comm;

}

public EmployeeDeptBean() {

}

public EmployeeDeptBean(int deptNo, int empno, String dname, String loc, String ename, String job, int mgr,

String hiredate, float salary, float comm, int flag) {

this.deptNo = deptNo;

this.empno = empno;

this.dname = dname;

this.loc = loc;

this.ename = ename;

this.job = job;

this.mgr = mgr;

this.hiredate = hiredate;

this.salary = salary;

this.comm = comm;

this.flag = flag;

}

/*

* 反序列化

* (non-Javadoc)

* @see org.apache.hadoop.io.Writable#readFields(java.io.DataInput)

*/

@Override

public void readFields(DataInput in) throws IOException {

this.deptNo = in.readInt();

this.empno = in.readInt();

this.dname = in.readUTF();

this.loc = in.readUTF();

this.ename = in.readUTF();

this.job = in.readUTF();

this.mgr = in.readInt();

this.hiredate = in.readUTF();

this.salary = in.readFloat();

this.comm = in.readFloat();

this.flag = in.readInt();

}

/*

* 序列化

* (non-Javadoc)

* @see org.apache.hadoop.io.Writable#write(java.io.DataOutput)

*/

@Override

public void write(DataOutput out) throws IOException {

out.writeInt(this.deptNo);

out.writeInt(this.empno);

out.writeUTF(this.dname);

out.writeUTF(this.loc);

out.writeUTF(this.ename);

out.writeUTF(this.job);

out.writeInt(this.mgr);

out.writeUTF(this.hiredate);

out.writeFloat(this.salary);

out.writeFloat(this.comm);

out.writeInt(this.flag);

}

/*

* 对象比较

* (non-Javadoc)

* @see java.lang.Comparable#compareTo(java.lang.Object)

*/

@Override

public int compareTo(EmployeeDeptBean o) {

//第一步:比较部门编号,升序排列

if(this.deptNo>o.getDeptNo()) {

return 1;

}else if(this.deptNo<o.getDeptNo()) {

return -1;

}

//第二步:比较薪资,降序排列

if(this.salary>=this.getSalary()) {

return -1;

}else {

return 1;

}

}

/*

*  重写toString方法

* (non-Javadoc)

* @see java.lang.Object#toString()

*/

@Override

public String toString() {

// DEPTNO EMPNO DNAME LOC ENAME JOB MGR HIREDATE SAL COMM

// 10 7369 ACCOUNTING NEW YORK SMITH SALESMAN 7902 17-DEC-80 1600 1400

return this.deptNo + " " + this.empno + " " + formatStr(this.dname, 12) + formatStr(this.ename, 12)

+ formatStr(this.job, 12) + formatStr(String.valueOf(this.mgr), 6) + formatStr(this.hiredate, 12)

+ formatStr(String.valueOf(this.salary), 6) + " " + formatStr(String.valueOf(this.comm), 6) + " " + formatStr(this.loc, 16);

}

/**

* 字符串填充空格

* @param str

* @param length

* @return

*/

public static String formatStr(String str, int length)

   {

     if (str == null)

       {

         str="";

       }

     int strLen = str.getBytes().length;

     if (strLen == length)

       {

         return str;

       } else if (strLen < length)

       {

         int temp = length - strLen;

         String tem = "";

         for (int i = 0; i < temp; i++)

           {

             tem = tem + " ";

           }

         return str + tem;

       } else

       {

         return str.substring(0, length);

       }

   }

}

2、EmpDeptEqualJoin

package cn.sjq.bigdata.mr.equal.join;

import java.io.IOException;

import java.text.NumberFormat;

import java.util.ArrayList;

import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.NullWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import org.junit.Test;

/**

* 所有的Mapper类、Reducer类、Job类均在本类采用采用匿名内部类实现

* @author songjq

*

*/

public class EmpDeptEqualJoin {

/**

* Mapper端

* /scott下存在emp.csv,dept.csv

* 输入路径:/scott

* k1:输入偏移量

* v1:输入数据,输入可能为emp表,也可能为dept表

* k2:部门号DeptNo

* v2:输出EmployeeDeptBean

*

* 原理:这里主要利用MapReduce相同key输出到相同Reduce处理特性来实现表的等值连接

* 这里将所有相同部门号的部门信息及相同部门号的员工信息输出到同一个reduce进行合并处理

* @author songjq

*

*/

static class EmpDeptEqualJoinMapper extends Mapper<LongWritable, Text, IntWritable, EmployeeDeptBean> {

private EmployeeDeptBean ed = null;

private IntWritable tkey = new IntWritable();

@Override

protected void map(LongWritable k1, Text v1, Context context) throws IOException, InterruptedException {

// 获取输入数据,可能为emp.csv,也可能为dept.csv

String line = v1.toString();

// 分词

String[] fileds = line.split(",");

// 获取文件输入对象

FileSplit filesplit = (FileSplit) context.getInputSplit();

// 获取输入文件名称

String fname = filesplit.getPath().getName();

ed = new EmployeeDeptBean();

if (fname.equals("emp.csv")) {

// 7369,SMITH,CLERK,7902,1980/12/17,800,,20

ed.setDeptNo(Integer.parseInt(fileds[7]));

ed.setFlag(0);

ed.setEmpno(Integer.parseInt(fileds[0]));

ed.setEname(fileds[1]);

ed.setJob(fileds[2]);

try {

ed.setMgr(Integer.parseInt(fileds[3]));

}catch (Exception e) {

ed.setMgr(0);

}

ed.setHiredate(fileds[4]);

try {

// 防止fileds[5]为空抛空指针异常

ed.setSalary(Float.parseFloat(fileds[5]));

} catch (Exception e) {

ed.setSalary(0);

}

try {

// 防止fileds[6]为空抛空指针异常

ed.setComm(Float.parseFloat(fileds[6]));

} catch (Exception e) {

ed.setComm(0);

}

} else if (fname.equals("dept.csv")) {

ed.setFlag(1);

ed.setDeptNo(Integer.parseInt(fileds[0]));

ed.setDname(fileds[1]);

ed.setLoc(fileds[2]);

}

// 将EmployeeDeptBean对象输出到Reducer

tkey.set(ed.getDeptNo());

context.write(tkey , ed);

}

}

/**

* Reducer端

* k3:部门号

* v3:EmployeeDeptBean集合

* k4:NullWritable

* v4:EmployeeDeptBean

* 原理说明:

* 利用相同Key输出到相同的Reduce处理这一特性,将部门信息和员工信息进行等值合并(利用部门号相同这一条件)

* @author songjq

*

*/

static class EmpDeptEqualJoinReducer

extends Reducer<IntWritable, EmployeeDeptBean, NullWritable, EmployeeDeptBean> {

@Override

protected void reduce(IntWritable k3, Iterable<EmployeeDeptBean> v3, Context ctx)

throws IOException, InterruptedException {

Iterator<EmployeeDeptBean> iterator = v3.iterator();

//定义存放员工信息对象list

ArrayList<EmployeeDeptBean> list = new ArrayList<EmployeeDeptBean>();

//定义部门属性信息

String dname = "";

String loc = "";

while(iterator.hasNext()) {

EmployeeDeptBean ed = iterator.next();

//定义emp表和dept表标志位flag, 0:emp 1:dept

if(ed.getFlag()==0) {

//不能直接list.add(ed);ed为同一个对象

EmployeeDeptBean edtmp = new EmployeeDeptBean();

//也不能直接将edtmp = ed;

edtmp.setDeptNo(ed.getDeptNo());

edtmp.setEname(ed.getEname());

edtmp.setEmpno(ed.getEmpno());

edtmp.setMgr(ed.getMgr());

edtmp.setJob(ed.getJob());

edtmp.setHiredate(ed.getHiredate());

edtmp.setSalary(ed.getSalary());

edtmp.setComm(ed.getComm());

list.add(edtmp);

}else if(ed.getFlag()==1) {

dname = ed.getDname();

loc = ed.getLoc();

}

}

//将部门信息合并到员工对象list

for(EmployeeDeptBean fulled:list) {

fulled.setDname(dname);

fulled.setLoc(loc);

ctx.write(NullWritable.get(), fulled);

}

}

}

/**

* 提交job到hadoop集群执行

* @param args

* @throws Exception

*/

@Test

public void EmpDeptEqualJoinJob() throws Exception {

//定义job

Job job = Job.getInstance(new Configuration());

//设置mapper及输出key,value数据类型

job.setMapperClass(EmpDeptEqualJoinMapper.class);

job.setMapOutputKeyClass(IntWritable.class);

job.setMapOutputValueClass(EmployeeDeptBean.class);

//设置reducer及输出key,value数据类型

job.setReducerClass(EmpDeptEqualJoinReducer.class);

job.setOutputKeyClass(NullWritable.class);

job.setOutputValueClass(EmployeeDeptBean.class);

//设置输入输出路径

FileInputFormat.setInputPaths(job, new Path("D:\test\tmp\join\equal"));

FileOutputFormat.setOutputPath(job, new Path("D:\test\tmp\join\equal_out1"));

//提交任务job

job.waitForCompletion(true);

}

}

3、执行结果

10 7934 ACCOUNTING  MILLER      CLERK       7782  1982/1/23   1300.0 0.0   NEW YORK        

10 7839 ACCOUNTING  KING        PRESIDENT   0     1981/11/17  5000.0 0.0   NEW YORK        

10 7782 ACCOUNTING  CLARK       MANAGER     7839  1981/6/9    2450.0 0.0   NEW YORK        

20 7876 RESEARCH    ADAMS       CLERK       7788  1987/5/23   1100.0 0.0   DALLAS          

20 7788 RESEARCH    SCOTT       ANALYST     7566  1987/4/19   3000.0 0.0   DALLAS          

20 7369 RESEARCH    SMITH       CLERK       7902  1980/12/17  800.0 0.0   DALLAS          

20 7566 RESEARCH    JONES       MANAGER     7839  1981/4/2    2975.0 0.0   DALLAS          

20 7902 RESEARCH    FORD        ANALYST     7566  1981/12/3   3000.0 0.0   DALLAS          

30 7844 SALES       TURNER      SALESMAN    7698  1981/9/8    1500.0 0.0   CHICAGO        

30 7499 SALES       ALLEN       SALESMAN    7698  1981/2/20   1600.0 300.0 CHICAGO        

30 7698 SALES       BLAKE       MANAGER     7839  1981/5/1    2850.0 0.0   CHICAGO        

30 7654 SALES       MARTIN      SALESMAN    7698  1981/9/28   1250.0 1400.0 CHICAGO        

30 7521 SALES       WARD        SALESMAN    7698  1981/2/22   1250.0 500.0 CHICAGO        

30 7900 SALES       JAMES       CLERK       7698  1981/12/3   950.0 0.0   CHICAGO        


以上是关于“2018-08-01 期 MapReduce实现多表查询等值连接”的主要内容；如果未能解决你的问题，请参考以下文章：

2018-07-28期 MapReduce实现对数字排序

2018-08-07 期 MapReduce模拟实现热销商品排行

2018-07-29期 MapReduce实现对字符串进行排序

2018-08-05 期 MapReduce实现每个单词在每个文件中坐标信息统计

2018-08-10期 MapReduce实现双色球近10年每个号码中奖次数统计

2018-08-09期 MapReduce实现对单个用户支付金额最大的前N个商品排名