2018-08-04 期 MapReduce倒排索引编程案例2(JobControl方式)

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了2018-08-04 期 MapReduce倒排索引编程案例2(JobControl方式)相关的知识,希望对你有一定的参考价值。

1、第一阶段MapReduce任务程序

package cn.itcast.bigdata.index;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**

* 利用MapReduce实现输入多个文件中单词在每个文件中出现的次数,输出格式如下:

* hello (a.txt 2,b.txt 1,c.txt 4)

* tom (a.txt 5,b.txt 3)

* 实现方法:采用倒排索引算法并结合jobControll实现

* 本案例中所有的Mapper、Reducer、Job均采用匿名内部类实现

* @author songjq

*

*/

/**
 * Stage one of a two-stage inverted-index MapReduce chain.
 *
 * Given multiple input text files, this stage produces, for every word,
 * its occurrence count per source file, e.g.:
 *   hello  a.txt-->2
 *   hello  b.txt-->1
 * Stage two (IndexStepTwo) then merges these per-file counts per word.
 *
 * @author songjq
 */
public class IndexStepOne {

    /**
     * Stage-one mapper.
     *
     * Output format (k2, v2):
     *   <hello-->a.txt, 1>
     *   <hello-->a.txt, 1>
     *   <hello-->b.txt, 1>
     *
     * @author songjq
     */
    static class IndexStepOneMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // Reused output key/value objects (standard Hadoop pattern to
        // avoid allocating a new Writable per record).
        private Text tkey = new Text();
        private IntWritable tvalue = new IntWritable(1);

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Resolve the name of the file this split belongs to, so each
            // word can be tagged with its source document.
            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            String fileName = inputSplit.getPath().getName();

            // Tokenize on single spaces; emit <word-->file, 1> per token.
            String line = value.toString();
            String[] split = line.split(" ");
            for (String word : split) {
                tkey.set(word + "-->" + fileName);
                context.write(tkey, tvalue);
            }
        }
    }

    /**
     * Stage-one reducer.
     *
     * Input (k2, v2) from the mapper:
     *   <hello-->a.txt, [1, 1, ...]>
     * Output written to HDFS (k3, v3):
     *   <hello-->a.txt, 2>
     *   <hello-->b.txt, 1>
     *
     * @author songjq
     */
    static class IndexStepOneReducer extends Reducer<Text, IntWritable, Text, LongWritable> {

        private LongWritable tvalue = new LongWritable(0);

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context ctx)
                throws IOException, InterruptedException {
            long count = 0;
            // Sum the actual values rather than counting records
            // (the original did count++): summing stays correct even if a
            // combiner is later configured to pre-aggregate the 1s.
            for (IntWritable value : values) {
                count += value.get();
            }
            tvalue.set(count);
            ctx.write(key, tvalue);
        }
    }
}

2、第二阶段MapReduce任务程序

package cn.itcast.bigdata.index;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapreduce.Mapper;

import org.apache.hadoop.mapreduce.Reducer;

/**

* 利用MapReduce实现输入多个文件中单词在每个文件中出现的次数,输出格式如下:

* hello (a.txt 2,b.txt 1,c.txt 4)

* tom (a.txt 5,b.txt 3)

* 实现方法:采用倒排索引算法并结合jobControll实现

* 本案例中所有的Mapper、Reducer、Job均采用匿名内部类实现

* @author songjq

*

*/

/**
 * Stage two of the two-stage inverted-index MapReduce chain.
 *
 * Consumes the per-file word counts written by IndexStepOne and merges
 * them into one line per word, e.g.:
 *   hello  a.txt-->2 b.txt-->1
 *
 * @author songjq
 */
public class IndexStepTwo {

    /**
     * Stage-two mapper.
     *
     * Input lines are stage-one reducer output, tab-free after the first
     * whitespace split, in the form:
     *   hello a.txt-->2
     * Output (k2, v2):
     *   <hello, a.txt-->2>
     *
     * NOTE(review): the split on a single space assumes stage one's
     * key/value separator is a space; with the default TextOutputFormat
     * it is a tab — confirm the job's output separator configuration.
     *
     * @author songjq
     */
    static class IndexStepTwoMapper extends Mapper<LongWritable, Text, Text, Text> {

        private Text tkey = new Text();
        private Text tvalue = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] split = line.split(" ");
            // Guard against malformed lines: require both a key part and
            // a count part before emitting anything.
            if (split.length > 1) {
                String[] split2 = split[0].split("-->");
                tkey.set(split2[0]);
                if (split2.length > 1) {
                    tvalue.set(split2[1] + "-->" + split[1]);
                    context.write(tkey, tvalue);
                }
            }
        }
    }

    /**
     * Stage-two reducer: concatenates all per-file entries for a word
     * into a single space-separated value (final output).
     *
     * Output (k4, v4):
     *   <hello, a.txt-->2 b.txt-->1 >
     * (a trailing space is left after the last entry, matching the
     * original behavior).
     *
     * @author songjq
     */
    static class IndexStepTwoReducer extends Reducer<Text, Text, Text, Text> {

        private Text tval = new Text();

        @Override
        protected void reduce(Text key, Iterable<Text> values, Context ctx)
                throws IOException, InterruptedException {
            // StringBuilder instead of StringBuffer: this reducer instance
            // is single-threaded, so synchronization is pure overhead.
            StringBuilder sb = new StringBuilder();
            for (Text value : values) {
                sb.append(value).append(' ');
            }
            tval.set(sb.toString());
            ctx.write(key, tval);
        }
    }
}

3、利用JobControl来实现依赖任务的提交

package cn.itcast.bigdata.index;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.io.IntWritable;

import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;

import org.apache.hadoop.mapred.jobcontrol.JobControl;

import org.apache.hadoop.mapreduce.Job;

import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import org.apache.hadoop.mapreduce.lib.jobcontrol.ControlledJob;

import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.itcast.bigdata.index.IndexStepOne.IndexStepOneMapper;

import cn.itcast.bigdata.index.IndexStepOne.IndexStepOneReducer;

import cn.itcast.bigdata.index.IndexStepTwo.IndexStepTwoMapper;

import cn.itcast.bigdata.index.IndexStepTwo.IndexStepTwoReducer;

/**

* 简单的job串联可以使用jobControll来实现 更复杂的job的调度可以考虑用shell脚本来写,或者干脆用现成的任务调度工具oozie来做

* 这里使用简单的jobControll来实现两个阶段MapReduce任务依赖提交处理

* 由于第二阶段的Mapper输入需要依赖第一阶段Reducer的输出,因此可以利用jobControll来实现第二阶段Mapper的等待,直到

* 第一阶段Reducer输出后,第二阶段的job才开始提交处理

* 核心方法:

* controlledJob2.addDependingJob(controlledJob1);

* @author songjq

*

*/

/**
 * Submits the two dependent inverted-index jobs in one shot via
 * {@link JobControl}.
 *
 * Simple job chaining can be done with JobControl; more complex DAGs are
 * better handled by a shell script or a scheduler such as Oozie. Here
 * job2's mapper consumes job1's reducer output, so job2 is only
 * submitted once job1 has completed. The key call is:
 *   controlledJob2.addDependingJob(controlledJob1);
 *
 * Usage: args[0] = stage-one input dir,
 *        args[1] = stage-one output dir (= stage-two input dir),
 *        args[2] = final output dir.
 *
 * @author songjq
 */
public class OnceSubmitClient {

    public static void main(String[] args) throws Exception {
        // --- Build the basic stage-one job (job1) ---
        Configuration conf1 = new Configuration();
        // Fixed typo in the display name: was "inexStepOne".
        Job job1 = Job.getInstance(conf1, "indexStepOne");
        job1.setJarByClass(OnceSubmitClient.class);
        job1.setMapperClass(IndexStepOneMapper.class);
        job1.setReducerClass(IndexStepOneReducer.class);
        job1.setMapOutputKeyClass(Text.class);
        job1.setMapOutputValueClass(IntWritable.class);
        job1.setOutputKeyClass(Text.class);
        job1.setOutputValueClass(LongWritable.class);
        FileInputFormat.setInputPaths(job1, new Path(args[0]));
        FileOutputFormat.setOutputPath(job1, new Path(args[1]));

        // --- Build the basic stage-two job (job2) ---
        Configuration conf2 = new Configuration();
        Job job2 = Job.getInstance(conf2, "indexStepTwo");
        job2.setJarByClass(OnceSubmitClient.class);
        job2.setMapperClass(IndexStepTwoMapper.class);
        job2.setReducerClass(IndexStepTwoReducer.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);
        // job2's input is job1's output.
        FileInputFormat.setInputPaths(job2, new Path(args[1]));
        FileOutputFormat.setOutputPath(job2, new Path(args[2]));

        // ControlledJob wraps a basic Job so JobControl can track it.
        ControlledJob controlledJob1 = new ControlledJob(conf1);
        controlledJob1.setJob(job1);
        ControlledJob controlledJob2 = new ControlledJob(conf2);
        controlledJob2.setJob(job2);

        // Build the controller and declare the dependency between jobs.
        JobControl jobControl = new JobControl("index");
        controlledJob2.addDependingJob(controlledJob1);
        jobControl.addJob(controlledJob1);
        jobControl.addJob(controlledJob2);

        // JobControl is a Runnable; drive it from its own thread and poll
        // from the main thread until every job has finished.
        Thread thread = new Thread(jobControl);
        thread.start();
        while (!jobControl.allFinished()) {
            // Static call via the class, not the instance: the original
            // `thread.sleep(500)` sleeps the CURRENT thread anyway and
            // only looks like it targets `thread`.
            Thread.sleep(500);
        }
        // Stop the controller so its run() loop (and thread) terminates;
        // without this the JVM could be kept alive by the control thread.
        jobControl.stop();

        int succeedSize = jobControl.getSuccessfulJobList().size();
        // Exit 0 when both jobs succeeded, 1 otherwise.
        System.exit(succeedSize == 2 ? 0 : 1);
    }
}


以上是关于2018-08-04 期 MapReduce倒排索引编程案例2(JobControl方式)的主要内容,如果未能解决你的问题,请参考以下文章

Lucene 倒排索引 FST 原理与实现

Lucene 倒排索引 FST 原理与实现

Lucene 倒排索引 FST 原理与实现

利用MapReduce实现倒排索引

mapreduce 高级案例倒排索引

大数据MapReduce入门之倒排索引