Hadoop WordCount(Streaming,Python,Java三合一)

Posted 白石江边

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Hadoop WordCount(Streaming,Python,Java三合一)相关的知识,希望对你有一定的参考价值。

一、Streaming

Map任务:

#!/bin/bash
# Streaming mapper for WordCount.
#
# Reads text lines from stdin and writes one "word<TAB>count" record per
# distinct word to stdout. Counts are accumulated in the awk array `dict`
# and only written in the END block, i.e. after all input has been read.
#
# The awk program is passed as ONE single-quoted argument so that the shell
# does not try to interpret the braces, parentheses and $i itself.
awk '
BEGIN {
        # Words are separated by spaces, commas, periods or tabs; "+" makes a
        # run of separators (e.g. ", ") count as a single delimiter.
        FS = "[ ,.\t]+"
        OFS = "\t"
}
{
        for (i = 1; i <= NF; ++i) {
                # A line that starts or ends with a separator still produces an
                # empty first/last field; it is not a word, so skip it.
                if ($i != "") {
                        dict[$i] += 1
                }
        }
}
END {
        for (key in dict) {
                print key, dict[key]
        }
}'

Reducer任务:

#!/bin/bash
# Streaming reducer for WordCount.
#
# Reads "word<TAB>count" records from stdin, sums the counts per word in the
# awk array `dict`, and writes one "word<TAB>total" record per word to stdout
# once all input has been consumed.
#
# The awk program is passed as ONE single-quoted argument so that the shell
# does not try to interpret the braces, parentheses and $1/$2 itself.
awk '
BEGIN {
        # Input and output records both use a tab between key and value.
        FS = "\t"
        OFS = "\t"
}
NF >= 2 {
        # Records without a count field (e.g. blank lines) are ignored.
        dict[$1] += $2
}
END {
        for (key in dict) {
                print key, dict[key]
        }
}'

启动脚本:

#!/bin/bash
# Launches the awk-based WordCount job through Hadoop Streaming.

BASE_DIR=/data/apps/zhangwenchao/mapreduce/streaming/wordcount
STREAMING_JAR=/data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar

# Remove the output of a previous run. -f keeps this quiet (and successful)
# when the directory does not exist yet, e.g. on the very first run.
hadoop fs -rm -r -f "$BASE_DIR/output"

# Generic options (-D, -files) must come BEFORE the streaming options.
# They replace the deprecated -jobconf and -file switches.
# NOTE(review): the original passed mapred.job.tasks=5, which is not a
# property Hadoop reads; it presumably meant the map-task hint, so
# mapreduce.job.maps is used here - confirm.
hadoop jar "$STREAMING_JAR" \
        -D mapreduce.job.name=wordcount \
        -D mapreduce.job.maps=5 \
        -D mapreduce.job.reduces=3 \
        -files mapper.sh,reducer.sh \
        -input "$BASE_DIR/input" \
        -output "$BASE_DIR/output" \
        -mapper "sh -x mapper.sh" \
        -reducer "sh -x reducer.sh"

 

二、Python

Map任务:

#! /usr/bin/python
"""Streaming mapper for WordCount.

Reads text lines from stdin and writes one ``word<TAB>1`` record per word
to stdout.
"""
import sys
import re

# Punctuation that separates words in addition to whitespace.
_PUNCTUATION = re.compile(r"[;,.?]")


def tokenize(line):
    """Split ``line`` into words.

    The line is first cut at ';', ',', '.' and '?', then every piece is
    split on whitespace. Empty pieces produce no words.

    :param line: one line of input text (a trailing newline is fine).
    :return: list of the words in ``line``, in order of appearance.
    """
    words = []
    for chunk in _PUNCTUATION.split(line):
        # str.split() with no argument drops leading/trailing whitespace
        # and never returns empty strings.
        words.extend(chunk.split())
    return words


def main():
    """Emit ``word<TAB>1`` for every word read from stdin."""
    for line in sys.stdin:
        for item in tokenize(line):
            # sys.stdout.write behaves the same on Python 2 and Python 3.
            sys.stdout.write("%s\t%s\n" % (item, 1))


if __name__ == "__main__":
    main()

Reducer任务:

#!/usr/bin/env python
"""Streaming reducer for WordCount.

Reads ``word<TAB>count`` records from stdin and writes one
``word<TAB>total`` record per run of consecutive identical words to stdout.
Totals are only correct per word when equal words arrive on adjacent lines.
"""
from operator import itemgetter
import sys


def reduce_counts(lines):
    """Sum the counts of consecutive records that share the same word.

    :param lines: iterable of ``word<TAB>count`` strings.
    :return: generator of ``(word, total)`` tuples, one per run of equal
        words, in input order.

    Lines without a tab or with a non-integer count are skipped.
    """
    current_word = None
    current_count = 0
    for line in lines:
        fields = line.strip().split("\t", 1)
        if len(fields) != 2:
            # No tab separator: not a key/value record.
            continue
        word, count = fields
        try:
            count = int(count)
        except ValueError:
            # Count is not a number: ignore the record.
            continue
        if word == current_word:
            current_count += count
        else:
            if current_word is not None:
                yield current_word, current_count
            current_word = word
            current_count = count
    # Flush the last run. Checking current_word (not the last word read)
    # keeps the final total when the last line was skipped, and emits
    # nothing at all for empty input.
    if current_word is not None:
        yield current_word, current_count


def main():
    """Reduce stdin and print the ``word<TAB>total`` records."""
    for word, total in reduce_counts(sys.stdin):
        # sys.stdout.write behaves the same on Python 2 and Python 3.
        sys.stdout.write("%s\t%s\n" % (word, total))


if __name__ == "__main__":
    main()

启动脚本:

 #!/bin/bash

hadoop fs -rm -r /data/apps/zhangwenchao/mapreduce/python/wordcount/output

hadoop jar /data/tools/hadoop/hadoop-2.6.2/share/hadoop/tools/lib/hadoop-streaming-2.6.2.jar         -input /data/apps/zhangwenchao/mapreduce/python/wordcount/input         -output /data/apps/zhangwenchao/mapreduce/python/wordcount/output         -mapper "mapper.py"         -reducer "reducer.py"         -file mapper.py         -file reducer.py         -jobconf mapred.job.name=wordcount         -jobconf mapred.job.tasks=5         -jobconf mapred.reduce.tasks=3

 

三、Java

Map:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class MyMap extends Mapper<LongWritable, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();

public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());

context.write(word, one);
}
}
}

Reduce:

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * WordCount reducer: sums all counts received for a key and emits
 * {@code (key, sum)}.
 */
public class MyReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /** Output value, reused for every key instead of allocating one per call. */
    private final IntWritable result = new IntWritable();

    /**
     * Adds up {@code values} and writes the total for {@code key}.
     *
     * @param key the word being counted
     * @param values the partial counts for {@code key}
     * @param context receives the {@code (key, sum)} output pair
     * @throws IOException if writing to the context fails
     * @throws InterruptedException if writing to the context is interrupted
     */
    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}

Main:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Driver for the Java WordCount job: wires {@link MyMap} and {@link MyReduce}
 * into a job, runs it, and reports the outcome through the process exit code.
 */
public class Main {

    // NOTE(review): "intput" looks like a typo for "input" - left unchanged
    // because it may match the actual directory name; confirm.
    private static final String DEFAULT_INPUT =
            "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/intput";

    private static final String DEFAULT_OUTPUT =
            "hdfs://test1:8020/test/**/test/zhangwenchao/java/wordcount/output";

    /**
     * Configures and runs the job, then exits with 0 on success and 1 on failure.
     *
     * @param args optional overrides: {@code args[0]} is the input path and
     *     {@code args[1]} the output path; the built-in defaults are used for
     *     whichever is missing
     * @throws Exception if the job cannot be set up or submitted
     */
    public static void main(String[] args) throws Exception {
        String input = args.length > 0 ? args[0] : DEFAULT_INPUT;
        String output = args.length > 1 ? args[1] : DEFAULT_OUTPUT;

        Configuration conf = new Configuration();

        // Job.getInstance replaces the deprecated Job constructor.
        Job job = Job.getInstance(conf);
        job.setJobName("test4");
        job.setJarByClass(Main.class);

        FileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, new Path(output));

        job.setMapperClass(MyMap.class);
        // The reducer is also registered as the combiner.
        job.setCombinerClass(MyReduce.class);
        job.setReducerClass(MyReduce.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        job.setNumReduceTasks(3);

        // Propagate the job result so callers (shell scripts, schedulers)
        // can detect a failed run.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

以上是关于Hadoop WordCount(Streaming,Python,Java三合一)的主要内容,如果未能解决你的问题,请参考以下文章

如何运行自带wordcount-Hadoop2

Hadoop下WordCount程序

运行Hadoop自带的wordcount单词统计程序

Hadoop wordcount Demon

[hadoop] hadoop 运行 wordcount

运行第一个Hadoop程序,WordCount