一个经典的MapReduce实例------webcount(网站分析访客信息)
Posted learn21cn
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了一个经典的MapReduce实例------webcount(网站分析访客信息)相关的知识,希望对你有一定的参考价值。
统计某一特定网站每个小时的访问次数(按访问日志中的记录条数统计)
所用版本:hadoop2.6.5
数据样式如下:
111.111.111.111 - - [16/Dec/2012:05:32:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:05:33:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:05:34:45 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:05:34:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:09:34:55 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:10:23:30 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
111.111.111.111 - - [16/Dec/2012:10:32:50 -0500] "GET / HTTP/1.1" 200 14791 "-" "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)"
辅助类
1 package com.trendwise.software; 2 3 import java.text.SimpleDateFormat; 4 import java.util.Date; 5 import java.io.DataInput; import java.io.DataOutput; 6 import java.io.IOException; 7 import org.apache.hadoop.io.WritableComparable; 8 9 public class DateWritable implements WritableComparable<DateWritable>{ 10 private final static SimpleDateFormat formatter = new SimpleDateFormat( "yyyy-MM-dd\' T \'HH:mm:ss.SSS" ); 11 private Date date; 12 public Date getDate() { 13 return date; 14 } 15 public void setDate( Date date ) { 16 this.date = date; 17 } 18 19 @Override 20 public void readFields(DataInput in) throws IOException { 21 date = new Date( in.readLong() ); 22 } 23 24 @Override 25 public void write(DataOutput out) throws IOException { 26 out.writeLong( date.getTime() ); 27 } 28 29 @Override 30 public int compareTo(DateWritable o) { 31 return date.compareTo( o.getDate() ); 32 } 33 34 public String toString() { 35 return formatter.format( date); 36 } 37 }
mapper 映射特定年份中每月每天每个时辰的访客数
1 package com.trendwise.software; 2 3 import java.io.IOException; 4 import java.util.Calendar; 5 import org.apache.hadoop.io.IntWritable; 6 import org.apache.hadoop.io.LongWritable; 7 import org.apache.hadoop.io.Text; 8 import org.apache.hadoop.mapreduce.Mapper; 9 10 public class LogMapper extends Mapper<LongWritable, Text, DateWritable, IntWritable> { 11 public static DateWritable dates = new DateWritable(); 12 public final static IntWritable two = new IntWritable(1); 13 public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { 14 String text = value.toString(); 15 // Get the date and time 16 int openBracket = text.indexOf( \'[\' ); 17 int closeBracket = text.indexOf( \']\' ); 18 if( openBracket != -1 && closeBracket != -1 ) { 19 // Read the date 20 String dateString = text.substring( text.indexOf( \'[\' ) + 1, text. indexOf( \']\' ) ); 21 // Build a date object from a string of the form: 16/Dec/2012:05:32:50 -0500 22 int index = 0; 23 int nextIndex = dateString.indexOf( \'/\' ); 24 int day = Integer.parseInt( dateString.substring(index, nextIndex) ); 25 26 index = nextIndex; nextIndex = dateString.indexOf( \'/\', index+1 ); 27 String month = dateString.substring( index+1, nextIndex ); 28 index = nextIndex; 29 nextIndex = dateString.indexOf( \':\', index ); 30 int year = Integer.parseInt(dateString.substring(index + 1, nextIndex)); 31 index = nextIndex; nextIndex = dateString.indexOf( \':\', index+1 ); 32 int hour = Integer.parseInt(dateString.substring(index + 1, nextIndex)); 33 // Build a calendar object for this date 34 Calendar calendar = Calendar.getInstance(); 35 calendar.set( Calendar.DATE, day ); 36 calendar.set( Calendar.YEAR, year ); 37 calendar.set( Calendar.HOUR, hour ); 38 calendar.set( Calendar.MINUTE, 0 ); 39 calendar.set( Calendar.SECOND, 0 ); 40 calendar.set( Calendar.MILLISECOND, 0 ); 41 if( month.equalsIgnoreCase( "dec" ) ) { 42 calendar.set( Calendar.MONTH, Calendar.DECEMBER ); 43 } 44 else if( 
month.equalsIgnoreCase( "nov" ) ) { 45 calendar.set( Calendar.MONTH, Calendar.NOVEMBER ); 46 } 47 else if( month.equalsIgnoreCase( "oct" ) ) { 48 calendar.set( Calendar.MONTH, Calendar.OCTOBER ); 49 } 50 else if( month.equalsIgnoreCase( "sep" ) ) { 51 calendar.set( Calendar.MONTH, Calendar.SEPTEMBER ); 52 } 53 else if( month.equalsIgnoreCase( "aug" ) ) { 54 calendar.set( Calendar.MONTH, Calendar.AUGUST ); 55 } 56 else if( month.equalsIgnoreCase( "jul" ) ) { 57 calendar.set( Calendar.MONTH, Calendar.JULY ); 58 } 59 else if( month.equalsIgnoreCase( "jun" ) ) { 60 calendar.set( Calendar.MONTH, Calendar.JUNE ); 61 } 62 else if( month.equalsIgnoreCase( "may" ) ) { 63 calendar.set( Calendar.MONTH, Calendar.MAY ); 64 } 65 else if( month.equalsIgnoreCase( "apr" ) ) { 66 calendar.set( Calendar.MONTH, Calendar.APRIL ); 67 } 68 else if( month.equalsIgnoreCase( "mar" ) ) { 69 calendar.set( Calendar.MONTH, Calendar.MARCH ); 70 } 71 else if( month.equalsIgnoreCase( "feb" ) ) { 72 calendar.set( Calendar.MONTH, Calendar.FEBRUARY ); 73 } 74 else if( month.equalsIgnoreCase( "jan" ) ) { 75 calendar.set( Calendar.MONTH, Calendar.JANUARY ); 76 } 77 78 dates.setDate( calendar.getTime() ); 79 context.write(dates, two); 80 81 } 82 } 83 }
reducer 汇总一个时辰内访客人数
1 package com.trendwise.software; 2 3 import java.io.IOException; 4 import org.apache.hadoop.io.IntWritable; 5 import org.apache.hadoop.mapreduce.Reducer; 6 7 public class LogReducer extends Reducer<DateWritable, IntWritable, DateWritable, IntWritable> { 8 @Override 9 public void reduce( DateWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException { 10 11 int countn = 0; 12 for(IntWritable v :values){ 13 countn += v.get(); 14 } 15 context.write(key, new IntWritable( countn) ); 16 } 17 }
driver 配置信息,程序入口
1 package com.trendwise.software; 2 3 import java.io.IOException; 4 import org.apache.hadoop.conf.Configuration; 5 import org.apache.hadoop.fs.Path; 6 import org.apache.hadoop.io.IntWritable; 7 import org.apache.hadoop.mapreduce.Job; 8 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 9 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 10 11 public class Driver { 12 13 public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException { 14 15 String in = args[0]; 16 String out = args[1]; 17 int unitmb =Integer.valueOf(args[2]); 18 int nreducer = Integer.valueOf(args[3]); 19 20 Configuration conf = new Configuration(); 21 conf.set("mapreduce.input.fileinputformat.split.maxsize", String.valueOf(unitmb * 1024 * 1024)); 22 conf.set("mapred.min.split.size", String.valueOf(unitmb * 1024 * 1024)); 23 conf.set("mapreduce.input.fileinputformat.split.minsize.per.node", String.valueOf(unitmb * 1024 * 1024)); 24 conf.set("mapreduce.input.fileinputformat.split.minsize.per.rack", String.valueOf(unitmb * 1024 * 1024)); 25 26 Job job = new Job(conf); 27 FileInputFormat.addInputPath(job, new Path(in)); 28 FileOutputFormat.setOutputPath(job, new Path(out)); 29 job.setMapperClass(LogMapper.class); 30 job.setReducerClass(LogReducer.class); 31 job.setCombinerClass(LogReducer.class); 32 job.setNumReduceTasks(nreducer); 33 job.setMapOutputKeyClass(DateWritable.class); 34 job.setMapOutputValueClass(IntWritable.class); 35 job.setOutputKeyClass(DateWritable.class); 36 job.setOutputValueClass(IntWritable.class); 37 job.setJarByClass(Driver.class); 38 job.waitForCompletion(true); 39 40 } 41 }
command
result
以上是关于一个经典的MapReduce实例------webcount(网站分析访客信息)的主要内容,如果未能解决你的问题,请参考以下文章