朴素贝叶斯之MapReduce版

Posted 余音丶未散

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了朴素贝叶斯之MapReduce版相关的知识,希望对你有一定的参考价值。

1,统计词出现的次数

1/计算类别的先验概率
 *输入格式:类别+文档id+文档词(切分成A,b,c)
 *输出格式:类别+文档出现次数+文档出现的词的总数

2/计算每个词的条件概率
 *输入格式:类别+文档id+文档词(切分成A,b,c)
 *输出格式:类别+词+词的总数

3/假设二分类问题-计算概率值 
 * 1类别+文档出现次数+文档出现的词的总数 
 * 2类别+词+词的总数
 * 3类别+词+log(词的总数/文档出现的词的总数),类别-log(文档出现次数/sum(文档出现次数))
 * 输入格式:类别+词+词的总数 
 * 输出格式:"词","类别+log()值概率"+1,2+类别的先验概率

 * 4/假设二分类问题-测试
 * 1类别+文档出现次数+文档出现的词的总数
 * 2类别+词+词的总数
 * 3类别+词+log(词的总数/文档出现的词的总数),类别-log(文档出现次数/sum(文档出现次数))
 *输入格式:新文档id+文档词(切分成A,b,c)
 *输出格式:新文档id+类别


这个版本基本写了MapReduce的朴素贝叶斯思路--具体优化和修改以后再弄

Python版实现
http://blog.csdn.net/q383700092/article/details/51773364
R语言版调用函数
http://blog.csdn.net/q383700092/article/details/51774069
MapReduce简化实现版
http://blog.csdn.net/q383700092/article/details/51778765
spark版
后续添加

Bayes1

package com.ml.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * 1/计算类别的先验概率
 * 汇总到dict1.txt
 *输入格式:类别+文档id+文档词(切分成A,b,c)
 *输出格式:类别+文档出现次数+文档出现的词的总数
 */
public class Bayes1 extends Configured implements Tool {

	public static enum Counter {
		PARSER_ERR
	}

	public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
		private Text mykey = new Text();// 类别id
		private Text myval = new Text();// 文档id+文档长度

		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] array = value.toString().split(",");
			String[] doc=array[2].split("-");
			mykey.set(array[0]);
			myval.set("1"+","+doc.length);
			context.write(mykey, myval);
		};
	}

	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		private Text val = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// 用于计算该类别总的个数
			int sum = 0;
			//计算出现词的总个数
			int wordsum = 0;
			// 循环遍历 Interable
			for (Text value : values) {
				// 累加
				String[] array = value.toString().split(",");
				sum += Integer.parseInt(array[0]);
				wordsum += Integer.parseInt(array[1]);
				val.set(sum+","+wordsum);
			}
			context.write(key, val);
		};
	}

	@Override
	public int run(String[] args) throws Exception {
		// 1 conf
		Configuration conf = new Configuration();
		conf.set("mapred.textoutputformat.separator", ",");// key value分隔符
		// 2 create job
		// Job job = new Job(conf, ModuleMapReduce.class.getSimpleName());
		Job job = this.parseInputAndOutput(this, conf, args);
		// 3 set job
		// 3.1 set run jar class
		// job.setJarByClass(ModuleReducer.class);
		// 3.2 set intputformat
		job.setInputFormatClass(TextInputFormat.class);
		// 3.3 set input path
		// FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.4 set mapper
		job.setMapperClass(MyMap.class);
		// 3.5 set map output key/value class
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// 3.6 set partitioner class
		// job.setPartitionerClass(HashPartitioner.class);
		// 3.7 set reduce number
		// job.setNumReduceTasks(1);
		// 3.8 set sort comparator class
		// job.setSortComparatorClass(LongWritable.Comparator.class);
		// 3.9 set group comparator class
		// job.setGroupingComparatorClass(LongWritable.Comparator.class);
		// 3.10 set combiner class
		// job.setCombinerClass(null);
		// 3.11 set reducer class
		job.setReducerClass(MyReduce.class);
		// 3.12 set output format

		job.setOutputFormatClass(TextOutputFormat.class);
		// 3.13 job output key/value class
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// 3.14 set job output path
		// FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 4 submit job
		boolean isSuccess = job.waitForCompletion(true);
		// 5 exit
		// System.exit(isSuccess ? 0 : 1);
		return isSuccess ? 0 : 1;
	}

	public Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
			throws Exception {
		// validate
		if (args.length != 2) {
			System.err.printf("Usage:%s [genneric options]<input><output>\\n",
					tool.getClass().getSimpleName());
			ToolRunner.printGenericCommandUsage(System.err);
			return null;
		}
		// 2 create job
		Job job = new Job(conf, tool.getClass().getSimpleName());
		// 3.1 set run jar class
		job.setJarByClass(tool.getClass());
		// 3.3 set input path
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.14 set job output path
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		return job;
	}

	public static void main(String[] args) throws Exception {
		args = new String[] {
				"hdfs://192.168.192.129:9000/ml/bayesTrain.txt",
				// "hdfs://hadoop-00:9000/home910/liyuting/output/" };
				"hdfs://192.168.192.129:9000/ml/bayes/" };
		// run mapreduce
		int status = ToolRunner.run(new Bayes1(), args);
		// 5 exit
		System.exit(status);
	}
}

</pre>Bayes2<p></p><p></p><pre code_snippet_id="1734263" snippet_file_name="blog_20160628_2_8203234" name="code" class="java">package com.ml.mapreduce;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * 2/计算每个词的条件概率
 * 汇总到dict2.txt
 *输入格式:类别+文档id+文档词(切分成A,b,c)
 *输出格式:类别+词+词的总数
 */
public class Bayes2 extends Configured implements Tool {

	public static enum Counter {
		PARSER_ERR
	}

	public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
		private Text mykey = new Text();//类别+词
		private Text myval = new Text();//出现个数

		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			String[] array = value.toString().split(",");
			String[] doc=array[2].split("-");
			for (String str : doc) {
				mykey.set(array[0]+ ","+ str);
				myval.set("1");
				context.write(mykey, myval);
			}
		};
	}

	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		private Text val = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// 用于计算每个类别里面每个词出现的总数
			int sum = 0;
			// 循环遍历 Interable
			for (Text value : values) {
				// 累加
				String array = value.toString();
				sum += Integer.parseInt(array);
				val.set(sum + "");
			}
			context.write(key, val);
		};
	}

	@Override
	public int run(String[] args) throws Exception {
		// 1 conf
		Configuration conf = new Configuration();
		conf.set("mapred.textoutputformat.separator", ",");// key value分隔符
		// 2 create job
		// Job job = new Job(conf, ModuleMapReduce.class.getSimpleName());
		Job job = this.parseInputAndOutput(this, conf, args);
		// 3 set job
		// 3.1 set run jar class
		// job.setJarByClass(ModuleReducer.class);
		// 3.2 set intputformat
		job.setInputFormatClass(TextInputFormat.class);
		// 3.3 set input path
		// FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.4 set mapper
		job.setMapperClass(MyMap.class);
		// 3.5 set map output key/value class
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// 3.6 set partitioner class
		// job.setPartitionerClass(HashPartitioner.class);
		// 3.7 set reduce number
		// job.setNumReduceTasks(1);
		// 3.8 set sort comparator class
		// job.setSortComparatorClass(LongWritable.Comparator.class);
		// 3.9 set group comparator class
		// job.setGroupingComparatorClass(LongWritable.Comparator.class);
		// 3.10 set combiner class
		// job.setCombinerClass(null);
		// 3.11 set reducer class
		job.setReducerClass(MyReduce.class);
		// 3.12 set output format

		job.setOutputFormatClass(TextOutputFormat.class);
		// 3.13 job output key/value class
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// 3.14 set job output path
		// FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 4 submit job
		boolean isSuccess = job.waitForCompletion(true);
		// 5 exit
		// System.exit(isSuccess ? 0 : 1);
		return isSuccess ? 0 : 1;
	}

	public Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
			throws Exception {
		// validate
		if (args.length != 2) {
			System.err.printf("Usage:%s [genneric options]<input><output>\\n",
					tool.getClass().getSimpleName());
			ToolRunner.printGenericCommandUsage(System.err);
			return null;
		}
		// 2 create job
		Job job = new Job(conf, tool.getClass().getSimpleName());
		// 3.1 set run jar class
		job.setJarByClass(tool.getClass());
		// 3.3 set input path
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.14 set job output path
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		return job;
	}

	public static void main(String[] args) throws Exception {
		args = new String[] {
				"hdfs://192.168.192.129:9000/ml/bayesTrain.txt",
				// "hdfs://hadoop-00:9000/home910/liyuting/output/" };
				"hdfs://192.168.192.129:9000/ml/bayes/pword/" };
		// run mapreduce
		int status = ToolRunner.run(new Bayes2(), args);
		// 5 exit
		System.exit(status);
	}
}

Bayes3

package com.ml.mapreduce;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * 3/假设二分类问题-计算概率值 
 * 1类别+文档出现次数+文档出现的词的总数 
 * 2类别+词+词的总数
 * 3类别+词+log(词的总数/文档出现的词的总数),类别-log(文档出现次数/sum(文档出现次数))
 * 
 * 输入格式:类别+词+词的总数 
 * 输出格式:"词","类别+log()值概率"+1,2+类别的先验概率
 */
public class Bayes3 extends Configured implements Tool {

	public static enum Counter {
		PARSER_ERR
	}

	public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
		private Text mykey = new Text();// 类别+词
		private Text myval = new Text();// 出现个数

		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			 BufferedReader br = null;
	            //获得当前作业的DistributedCache相关文件
	            Path[] distributePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
	            String lines = null;
	            String[] class1 = {"0","0"};
	            String[] class0 = {"0","0"};
	            for(Path p : distributePaths){
	                if(p.getParent().toString().endsWith("bayes")){
	                    //读缓存文件,并放到mem中
	                    br = new BufferedReader(new FileReader(p.toString()));
	                    while(null!=(lines=br.readLine())){
	                        String[] pall= lines.split(",");
	                        if (pall[0].equals("1")) {
	                        	class1[0]=pall[1];
	                        	class1[1]=pall[2];
							}else {
								class0[0]=pall[1];
								class0[1]=pall[2];
							}
	                    }
	                }
	            }
			String[] array = value.toString().split(",");
			Double plog=0.0;
			if (array[0].equals("1")) {
				mykey.set(array[1]);// 词
				plog=Math.log(Double.parseDouble(array[2])/Double.parseDouble(class1[1]));
				myval.set(array[0]+","+plog);// 类别+log概率
				context.write(mykey, myval);
			}else {
				mykey.set(array[1]);// 词
				plog=Math.log(Double.parseDouble(array[2])/Double.parseDouble(class0[1]));
				myval.set(array[0]+","+plog);// 类别+log概率
				context.write(mykey, myval);
			}
			
		};
	}

	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		private Text val = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			
			String vals="tab";
			for (Text value : values) {
				// 累加
				vals=vals+","+value.toString();
			}
			val.set(vals);
			context.write(key, val);
		};
	}

	@Override
	public int run(String[] args) throws Exception {
		// 1 conf
		Configuration conf = new Configuration();
		conf.set("mapred.textoutputformat.separator", ",");// key value分隔符
		DistributedCache.addCacheFile(new Path(args[2]).toUri(), conf);//为该job添加缓存文件
		// 2 create job
		// Job job = new Job(conf, ModuleMapReduce.class.getSimpleName());
		Job job = this.parseInputAndOutput(this, conf, args);
		// 3 set job
		// 3.1 set run jar class
		// job.setJarByClass(ModuleReducer.class);
		// 3.2 set intputformat
		job.setInputFormatClass(TextInputFormat.class);
		// 3.3 set input path
		// FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.4 set mapper
		job.setMapperClass(MyMap.class);
		// 3.5 set map output key/value class
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// 3.6 set partitioner class
		// job.setPartitionerClass(HashPartitioner.class);
		// 3.7 set reduce number
//		 job.setNumReduceTasks(0);
		// 3.8 set sort comparator class
		// job.setSortComparatorClass(LongWritable.Comparator.class);
		// 3.9 set group comparator class
		// job.setGroupingComparatorClass(LongWritable.Comparator.class);
		// 3.10 set combiner class
		// job.setCombinerClass(null);
		// 3.11 set reducer class
		job.setReducerClass(MyReduce.class);
		// 3.12 set output format

		job.setOutputFormatClass(TextOutputFormat.class);
		// 3.13 job output key/value class
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// 3.14 set job output path
		// FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 4 submit job
		boolean isSuccess = job.waitForCompletion(true);
		// 5 exit
		// System.exit(isSuccess ? 0 : 1);
		return isSuccess ? 0 : 1;
	}

	public Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
			throws Exception {
		// validate
//		if (args.length != 2) {
//			System.err.printf("Usage:%s [genneric options]<input><output>\\n",
//					tool.getClass().getSimpleName());
//			ToolRunner.printGenericCommandUsage(System.err);
//			return null;
//		}
		// 2 create job
		Job job = new Job(conf, tool.getClass().getSimpleName());
		// 3.1 set run jar class
		job.setJarByClass(tool.getClass());
		// 3.3 set input path
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.14 set job output path
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		return job;
	}

	public static void main(String[] args) throws Exception {
		args = new String[] {
				"hdfs://192.168.192.129:9000/ml/bayes/pword/part-r-00000",
				// "hdfs://hadoop-00:9000/home910/liyuting/output/" };
				"hdfs://192.168.192.129:9000/ml/bayes/pall/",
				"hdfs://192.168.192.129:9000/ml/bayes/part-r-00000"};
		// run mapreduce
		int status = ToolRunner.run(new Bayes3(), args);
		// 5 exit
		System.exit(status);
	}
}
Bayes4

package com.ml.mapreduce;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Reducer.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * 4/假设二分类问题-测试
 * 1类别+文档出现次数+文档出现的词的总数
 * 2类别+词+词的总数
 * 3类别+词+log(词的总数/文档出现的词的总数),类别-log(文档出现次数/sum(文档出现次数))
 * 
 *输入格式:新文档id+文档词(切分成A,b,c)
 *输出格式:新文档id+类别
 */
public class Bayes4 extends Configured implements Tool {

	public static enum Counter {
		PARSER_ERR
	}

	public static class MyMap extends Mapper<LongWritable, Text, Text, Text> {
		private Text mykey = new Text();//类别+词
		private Text myval = new Text();//出现个数
		Map zidianString=new HashMap();//key是词 value是概率值-假设字典可以读到内存中//不能的话切分读取
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			 BufferedReader br = null;
	            //获得当前作业的DistributedCache相关文件
	            Path[] distributePaths = DistributedCache.getLocalCacheFiles(context.getConfiguration());
	            String lines = null;
	            for(Path p : distributePaths){
	                if(p.getParent().toString().endsWith("pall")){
	                    //读缓存文件,并放到mem中
	                    br = new BufferedReader(new FileReader(p.toString()));
	                    while(null!=(lines=br.readLine())){
	                        String[] pall= lines.split(",");
	                        if (pall.length>4) {
	                        	if (pall[2].equals("1")) {
	                        		zidianString.put(pall[0], pall[2]+","+pall[3]+","+pall[4]+","+pall[5]);
								}else {
									zidianString.put(pall[0], pall[4]+","+pall[5]+","+pall[2]+","+pall[3]);
								}
							}else {
								if (pall[2].equals("1")) {
									zidianString.put(pall[0], pall[2]+","+pall[3]+","+"0"+","+"0.0");
								}else {
									zidianString.put(pall[0], "1"+","+"0.0"+","+pall[2]+","+pall[3]);
								}
								
							}
	                    }
	                }
	            }
			String[] array = value.toString().split(",");
			String[] doc=array[1].split("-");
			for (String str : doc) {
				if (zidianString.containsKey(str)) {
					String[] kk=zidianString.get(str).toString().split(",");//类别+概率
					mykey.set(array[0]);//文档id
					myval.set(kk[0]+","+kk[1]+","+kk[2]+","+kk[3]);//类别+log概率
					context.write(mykey, myval);
				}
			}
		};
	}

	public static class MyReduce extends Reducer<Text, Text, Text, Text> {
		private Text val = new Text();

		protected void reduce(Text key, Iterable<Text> values, Context context)
				throws IOException, InterruptedException {
			// 用于计算每个类别里面词的概率
			Double sum=0.5;//类别1的先验概率 --需要提前算好0-0这里可以考虑读入--等有空再修改
			Double sum2=0.5;//类别0的先验概率 
			// 循环遍历 Interable
			for (Text value : values) {
				// 累加
				String[] array = value.toString().split(",");
					sum += Double.parseDouble(array[1]);//似然概率
					sum2 += Double.parseDouble(array[3]);//似然概率
			}
			if (sum>sum2) {
				val.set("类别1");
			}else {
				val.set("类别0");
			}
			context.write(key, val);
		};
	}

	@Override
	public int run(String[] args) throws Exception {
		// 1 conf
		Configuration conf = new Configuration();
		conf.set("mapred.textoutputformat.separator", ",");// key value分隔符
		DistributedCache.addCacheFile(new Path(args[2]).toUri(), conf);//为该job添加缓存文件
		// 2 create job
		// Job job = new Job(conf, ModuleMapReduce.class.getSimpleName());
		Job job = this.parseInputAndOutput(this, conf, args);
		// 3 set job
		// 3.1 set run jar class
		// job.setJarByClass(ModuleReducer.class);
		// 3.2 set intputformat
		job.setInputFormatClass(TextInputFormat.class);
		// 3.3 set input path
		// FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.4 set mapper
		job.setMapperClass(MyMap.class);
		// 3.5 set map output key/value class
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);
		// 3.6 set partitioner class
		// job.setPartitionerClass(HashPartitioner.class);
		// 3.7 set reduce number
//		 job.setNumReduceTasks(0);
		// 3.8 set sort comparator class
		// job.setSortComparatorClass(LongWritable.Comparator.class);
		// 3.9 set group comparator class
		// job.setGroupingComparatorClass(LongWritable.Comparator.class);
		// 3.10 set combiner class
		// job.setCombinerClass(null);
		// 3.11 set reducer class
		job.setReducerClass(MyReduce.class);
		// 3.12 set output format

		job.setOutputFormatClass(TextOutputFormat.class);
		// 3.13 job output key/value class
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(Text.class);
		// 3.14 set job output path
		// FileOutputFormat.setOutputPath(job, new Path(args[1]));
		// 4 submit job
		boolean isSuccess = job.waitForCompletion(true);
		// 5 exit
		// System.exit(isSuccess ? 0 : 1);
		return isSuccess ? 0 : 1;
	}

	public Job parseInputAndOutput(Tool tool, Configuration conf, String[] args)
			throws Exception {
		// validate
//		if (args.length != 2) {
//			System.err.printf("Usage:%s [genneric options]<input><output>\\n",
//					tool.getClass().getSimpleName());
//			ToolRunner.printGenericCommandUsage(System.err);
//			return null;
//		}
		// 2 create job
		Job job = new Job(conf, tool.getClass().getSimpleName());
		// 3.1 set run jar class
		job.setJarByClass(tool.getClass());
		// 3.3 set input path
		FileInputFormat.addInputPath(job, new Path(args[0]));
		// 3.14 set job output path
		FileOutputFormat.setOutputPath(job, new Path(args[1]));

		return job;
	}

	public static void main(String[] args) throws Exception {
		args = new String[] {
				"hdfs://192.168.192.129:9000/ml/test.txt",
				// "hdfs://hadoop-00:9000/home910/liyuting/output/" };
				"hdfs://192.168.192.129:9000/ml/bayes/result/",
				"hdfs://192.168.192.129:9000/ml/bayes/pall/part-r-00000"};
		// run mapreduce
		int status = ToolRunner.run(new Bayes4(), args);
		// 5 exit
		System.exit(status);
	}
}



以上是关于朴素贝叶斯之MapReduce版的主要内容,如果未能解决你的问题,请参考以下文章

朴素贝叶斯之实践篇

朴素贝叶斯之新闻分类

贝叶斯分类小结

Spark MLlib速成宝典模型篇04朴素贝叶斯Naive Bayes(Python版)

机器学习—朴素贝叶斯

基于朴素贝叶斯分类器的文本分类