跟A君学大数据-用MapReduce实现表关联

Posted 2021-04-19 六点A君

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了跟A君学大数据-用MapReduce实现表关联相关的知识，希望对你有一定的参考价值。

前言

前面使用 MapReduce，可以进行单词计数，单词去重，数字排序等，那么结合到数据库应用，
如何实现表关联呢？
MapReduce 更像算法题，怎么通过 Map 和 Reduce 这两个步骤来实现关联，得到所需数据呢？
例如有一张表，里面两个字段，child 和 parent，现在让你找出里面的 grandChild 和 grandParent 来。
以 MySQL 为例，我们直接一条 SQL 语句就可以解决：

 
   
   
 
  
    
    
  -- Self-join child_parent: alias a supplies the grandchild (a.child),
  -- alias b supplies the grandparent (b.parent); rows are linked where
  -- a's parent is b's child.
  SELECT a.child, b.parent
  FROM child_parent AS a
  INNER JOIN child_parent AS b
    ON a.parent = b.child
  ORDER BY a.child DESC

那么从 MapReduce 角度该如何设计 Map 以及 Reduce 函数呢？

设计

需要使得左表的 parent 和右表的 child 列相连接。
将 parent 设置为 key，而 child 作为 value 进行输出，作为左表；
再将同一对 child 和 parent 中的 child 设为 key，而 parent 设置为 value 进行输出，作为右表。
给每个输出增加标志作为区分左右表。
在 Reduce 函数接收到的结果中，每个 key 的 value-list 包含了 grandchild 和 grandparent 关系。
取出每个 key 的 value 进行解析，将左表的 child 放到一个数组，右表的 parent 放到一个数组，最后做双重循环求笛卡尔积即可（就如 sql 语句中的笛卡尔积）
因为在 Reduce 中，给出的是 key 相同的 value_list，所以就是相当于上面 sql 的 where a.parent=b.child

具体实现

 
   
   
 
  
    
    
  package com.anla.chapter3.innerjoin;
  
    
    
  
  
    
    
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.Path;
  import org.apache.hadoop.io.Text;
  import org.apache.hadoop.mapreduce.Job;
  import org.apache.hadoop.mapreduce.Mapper;
  import org.apache.hadoop.mapreduce.Reducer;
  import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
  import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
  import org.apache.hadoop.util.GenericOptionsParser;

  import java.io.IOException;
  import java.util.ArrayList;
  import java.util.Iterator;
  import java.util.List;
  
    
    
  
  
    
    
  /**
   * Single-table self-join implemented as a MapReduce job.
   *
   * <p>The input is a text file of {@code child parent} pairs (with a header line
   * whose first field is {@code child}). The job emits every
   * {@code grandChild grandParent} pair, i.e. the result of joining the table
   * with itself on {@code left.parent = right.child}.
   *
   * @author anLA7856
   */
  public class SimpleJoin {

    /**
     * Counts header rows written by {@link Reduce}; the header is emitted only
     * while this is 0. Kept public and static for backward compatibility.
     */
    public static int time = 0;

    /**
     * Emits each input pair twice: once keyed by the parent (left table, tag
     * {@code 1}) and once keyed by the child (right table, tag {@code 2}).
     * The value has the form {@code tag+child+parent}.
     */
    public static class Map extends Mapper<Object, Text, Text, Text> {

      /**
       * Parses one {@code child parent} line and writes the left- and
       * right-table records for it.
       *
       * @param key     input key supplied by the framework (unused)
       * @param value   one line of the input file
       * @param context sink for the tagged records
       * @throws IOException          if writing to the context fails
       * @throws InterruptedException if the task is interrupted while writing
       */
      @Override
      protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        // Split on any run of whitespace so repeated spaces or tabs do not
        // produce empty fields.
        String[] fields = value.toString().trim().split("\\s+");

        // Skip blank/malformed lines and the header line (first field "child").
        if (fields.length < 2 || "child".equals(fields[0])) {
          return;
        }

        String childName = fields[0];
        String parentName = fields[1];
        String payload = "+" + childName + "+" + parentName;

        // Left table: keyed by parent, so the child is a grandchild candidate.
        context.write(new Text(parentName), new Text("1" + payload));
        // Right table: keyed by child, so the parent is a grandparent candidate.
        context.write(new Text(childName), new Text("2" + payload));
      }
    }

    /**
     * For each join key, separates the tagged records into grandchildren (left
     * table) and grandparents (right table) and writes their Cartesian product.
     */
    public static class Reduce extends Reducer<Text, Text, Text, Text> {

      /**
       * Joins the left- and right-table records that share {@code key}.
       *
       * @param key     the person who is a parent in the left table and a child
       *                in the right table
       * @param values  tagged records of the form {@code tag+child+parent}
       * @param context sink for {@code grandChild grandParent} pairs
       * @throws IOException          if writing to the context fails
       * @throws InterruptedException if the task is interrupted while writing
       */
      @Override
      protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        if (time == 0) { // write the header row once
          context.write(new Text("grandChild"), new Text("grandParent"));
          time++;
        }

        // Growable lists: a key may have any number of children or parents.
        List<String> grandChildren = new ArrayList<>();
        List<String> grandParents = new ArrayList<>();

        for (Text value : values) {
          String record = value.toString();
          if (record.isEmpty()) {
            continue;
          }
          String[] parts = record.split("\\+");
          if (parts.length < 3) {
            // Not in tag+child+parent form; nothing to join on.
            continue;
          }
          if (record.charAt(0) == '1') {
            // Left table: the child of this key is a grandchild.
            grandChildren.add(parts[1]);
          } else {
            // Right table: the parent of this key is a grandparent.
            grandParents.add(parts[2]);
          }
        }

        // Cartesian product; produces nothing when either side is empty.
        for (String grandChild : grandChildren) {
          for (String grandParent : grandParents) {
            context.write(new Text(grandChild), new Text(grandParent));
          }
        }
      }
    }

    /**
     * Configures and runs the job.
     *
     * @param args generic Hadoop options followed by {@code <in> <out>} paths
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
      Configuration configuration = new Configuration();
      String[] otherArgs = new GenericOptionsParser(configuration, args).getRemainingArgs();
      if (otherArgs.length != 2) {
        System.err.println("Usage: SimpleJoin <in> <out>");
        System.exit(2);
      }

      Job job = Job.getInstance(configuration, "SimpleJoin");
      job.setJarByClass(SimpleJoin.class);
      job.setMapperClass(Map.class);
      job.setReducerClass(Reduce.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(Text.class);
      FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
      FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
      System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
  }

还是按照前一篇运行方法：跟 A 君学大数据 (二)- 手把手运行 Hadoop 的 WordCount 程序

得到结果：

参考资料：

Hadoop In Action

以上是关于跟A君学大数据-用MapReduce实现表关联的主要内容，如果未能解决你的问题，请参考以下文章

MapReduce编程之实现多表关联

大数据学习之十二——MapReduce代码实例：关联性操作

《从0开始学大数据》之MapReduce 计算框架是如何运作的

mapreduce-实现多表关联

mapreduce-实现单表关联

《从0开始学大数据》之Spark的编程模型