MapReduce几个小案例

Posted tkzm

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了MapReduce几个小案例相关的知识,希望对你有一定的参考价值。

MapReduce案例

1.单词计数--wordcount

首先准备好文件

Hello World
Hello Java
Hello World
Hello hadoop
wo
shi
wo

开始编写程序

public class MapReduceTest 
    //第一个参数是我们的行偏移量
    //第二个参数是我们的数据集
    //第三个是我们要输出时候的key类型
    //第四个是我们要输出时候的value类型
    public static class Map extends Mapper<LongWritable, Text, Text, IntWritable>
        //重写map方法
        //第一个参数是偏移量  第二个是我们读取的数据集  第三个是上下文简单点说就说往文件上写要用到这个参数
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 
            String[] s = value.toString().split(" ");//把数据按照空格切分
            for (String values:s) //用for循环去遍历我们的数组
                //第一个参数是输出的key,第二个是value
                //这里为什么value是1呢  是因为一会我们要在reduce阶段统计出现的次数  现在数据就拆分成了 <hello ,1>
              context.write(new Text(values),new IntWritable(1));//将我们的数据输出//
            
        
    
    
   //第一个、二个参数是跟map输出时候的类型保持一致
   //第三个、四个是我们一会要输出时候的类型  Text ==  String  IntWritable == int
   public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable>
       @Override
       protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException 
           int sum=0;
           for (IntWritable value:values) //遍历我们刚才在map端得到的数据
               sum +=value.get();  //value.get()得到每一个数据后面的1并用+=把他们相加起来
           
           context.write(key,new IntWritable(sum));
       
   

    public static void main(String[] args) throws Exception 
        Configuration conf=new Configuration();
        Job job=Job.getInstance(conf);//注意要导这个import org.apache.hadoop.mapreduce.Job包
        job.setMapperClass(Map.class);//这里写的是我们的自定义Map方法名
        job.setReducerClass(Reduce.class);//自定义Reduce方法名
        job.setJarByClass(MapReduceTest.class);//这里写的是主类
        job.setMapOutputKeyClass(Text.class);//Map输出时候key的类型
        job.setMapOutputValueClass(IntWritable.class);//Map输出时候value的类型
        job.setCombinerClass(Reduce.class);//这里这个是在map阶段进行了一次整合
        job.setOutputKeyClass(Text.class);//这是输出时候的key类型
        job.setOutputValueClass(IntWritable.class);//这是输出时候的key类型

        FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\test\\\\"));//文件存放的位置一定要有
        FileSystem fs=FileSystem.get(conf);
        Path paths = new Path("D:\\\\新建文件夹\\\\demo");
        //判断一下是否有这个文件夹有则删除
        if (fs.exists(paths))
            fs.delete(paths,true);
        
        FileOutputFormat.setOutputPath(job,paths);//文件输出的位置一定不能有
        job.submit();//job提交
    

 2.计算每个人的钱数

文件1

name,money
tom,100
kebi,200
tom,500
susan,600
hua,1000
hua,5000
xin,600

文件2

name,money
tom,400
tom,500
susan,650
kebi,5000
xin,600
hua,800

代码编写

public class MapReduceTest02 
    public static class Map extends Mapper<LongWritable, Text,Text, IntWritable>
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 
            StringTokenizer stringTokenizer=new StringTokenizer(value.toString(),",");
            if (key.get()!=0)//当偏移量不等于0时可以去掉第一行没有用的数据
                while (stringTokenizer.hasMoreTokens())
                    String name = stringTokenizer.nextToken();
                    String money = stringTokenizer.nextToken();
                    context.write(new Text(name),new IntWritable(Integer.parseInt(money)));
                
            

        
    

    public static class Reduce extends Reducer<Text, IntWritable,Text,IntWritable>
        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException 
            int sum=0;//总钱数
            for (IntWritable value:values) 
                sum += value.get();
            
            context.write(key,new IntWritable(sum));
        
    

    public static void main(String[] args) throws Exception 
        Configuration conf=new Configuration();
        Job job=Job.getInstance(conf);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setJarByClass(MapReduceTest02.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);


        job.setOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);

        FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\test02\\\\"));
        Path path = new Path("D:\\\\新建文件夹\\\\output");
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(path))
            fileSystem.delete(path);
        
        FileOutputFormat.setOutputPath(job,path);
        job.submit();
    

3.求出共同好友

准备文件

如下是各自的好友列表,要求得出他们的共同好友,如A-B C,D,E。
A:B,C,D,E,F,M
B:C,D,L,E
C:A,B,E,F,D
D:G,A,V,X,E,B
E:F,A,C,B,T
F:A,D,B
G:M,L,K,B

编写代码

//我们先找出拥有这个好友的所有人
public
class MapReduceTest03 public static class Map extends Mapper<LongWritable, Text,Text, Text> @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException if (key.get()!=0)//去除第一行无用数据 String[] split = value.toString().split(":");//先按:分割 String[] split1 = split[1].split(",");//按照,号继续分割得到我们想要的 for (String sp:split1) context.write(new Text(sp),new Text(split[0]));//输出拥有这个好友的人 public static class Reduce extends Reducer<Text, Text,Text,Text> @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException StringBuffer sb = new StringBuffer(); for(Text friend : values) sb.append(friend.toString()).append(","); context.write(key, new Text(sb.toString())); public static void main(String[] args) throws Exception Configuration conf=new Configuration(); Job job=Job.getInstance(conf); job.setJarByClass(MapReduceTest03.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\test03\\\\")); Path path = new Path("D:\\\\新建文件夹\\\\output01"); FileSystem fileSystem = FileSystem.get(conf); if (fileSystem.exists(path)) fileSystem.delete(path); FileOutputFormat.setOutputPath(job,path); job.submit();

上面的程序输出的结果

A    C,F,D,E,
B    G,F,E,D,C,A,
C    E,A,B,
D    C,F,B,A,
E    D,C,A,B,
F    E,A,C,
G    D,
K    G,
L    G,B,
M    G,A,
T    E,
V    D,
X    D,

 

//输出我们想要的结果
public
class MapReduceTest03s public static class Map extends Mapper<LongWritable, Text,Text, Text> @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException String[] split = value.toString().split("\\t"); String[] split1 = split[1].split(","); Arrays.sort(split1);//排序去重 for (int i = 0; i < split1.length-1; i++) //利用两个for循环对结果一一匹配 for (int j = i+1; j < split1.length; j++) context.write(new Text(split1[i]+"-"+split1[j]),new Text(split[0])); System.out.println((split1[i]+"-"+split1[j])+"\\t"+split[0]); public static class Reduce extends Reducer<Text, Text,Text,Text> @Override protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException StringBuffer sb=new StringBuffer(); for (Text te:values) sb.append(te).append(","); context.write(key, new Text(sb.toString())); public static void main(String[] args) throws Exception Configuration conf=new Configuration(); Job job=Job.getInstance(conf); job.setJarByClass(MapReduceTest03s.class); job.setMapperClass(MapReduceTest03s.Map.class); job.setReducerClass(MapReduceTest03s.Reduce.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputKeyClass(Text.class); FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\output01\\\\")); Path path = new Path("D:\\\\新建文件夹\\\\output02"); FileSystem fileSystem = FileSystem.get(conf); if (fileSystem.exists(path)) fileSystem.delete(path); FileOutputFormat.setOutputPath(job,path); job.submit();

最终结果展示

A-B    E,C,D,
A-C    E,B,F,D,
A-D    B,E,
A-E    B,F,C,
A-F    D,B,
A-G    B,M,
B-C    D,E,
B-D    E,
B-E    C,
B-F    D,
B-G    L,
C-D    A,E,B,
C-E    B,A,F,
C-F    D,B,A,
C-G    B,
D-E    B,A,
D-F    A,B,
D-G    B,
E-F    B,A,
E-G    B,
F-G    B,

4.读取JSON数据

"uid":"1000166111","phone":"17703771999","addr":"河南省 南阳"
"uid":"1000432103","phone":"15388889881","addr":"云南省 昆明"
"uid":"1000473355","phone":"15388889557","addr":"云南省 昆明"
"uid":"1000555472","phone":"18083815777","addr":"云南省 昆明"
"uid":"1000585644","phone":"15377892222","addr":"广东省 中山"
"uid":"1000774061","phone":"18026666666","addr":"广东省 惠州"
"uid":"1001024965","phone":"18168526111","addr":"江苏省 苏州"
"uid":"1001283200","phone":"15310952123","addr":"重庆"
"uid":"1001523180","phone":"15321168157","addr":"北京"
//先创建一个实体类
//
实现Writable接口 public class Phone implements Writable private String uid; private String phone; private String addr; public String getUid() return uid; public void setUid(String uid) this.uid = uid; public String getPhone() return phone; public void setPhone(String phone) this.phone = phone; public String getAddr() return addr; public void setAddr(String addr) this.addr = addr; public Phone() public Phone(String uid, String phone, String addr) this.uid = uid; this.phone = phone; this.addr = addr; @Override public void write(DataOutput dataOutput) throws IOException //序列化 dataOutput.writeUTF(this.uid); dataOutput.writeUTF(this.phone); dataOutput.writeUTF(this.addr); @Override public void readFields(DataInput dataInput) throws IOException //反序列化 this.uid=dataInput.readUTF(); this.phone=dataInput.readUTF(); this.addr=dataInput.readUTF(); @Override public String toString() return "Phone" + "uid=‘" + uid + ‘\\‘‘ + ", phone=‘" + phone + ‘\\‘‘ + ", addr=‘" + addr + ‘\\‘‘ + ‘‘;

 

public class Json2Object 
    public static class Map extends Mapper<LongWritable, Text,Phone, NullWritable>
        Phone phone;
        String line;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException 
            //代码的优化让我们的实例在内存中仅实例化一次
            phone=new Phone();
            line=new String();
        
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 
            line=new String(value.getBytes(),0,value.getLength(),"GBK");//处理乱码
            ObjectMapper objectMapper=new ObjectMapper();
            Phone phones = objectMapper.readValue(line, Phone.class);//这句就能把我们的json数据转换为对象
            context.write(phones,NullWritable.get());
        
    

    public static void main(String[] args) throws Exception 
        Configuration conf=new Configuration();
        Job job=Job.getInstance(conf);
        job.setMapperClass(Map.class);
        job.setJarByClass(Json2Object.class);
        job.setMapOutputKeyClass(Phone.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(0);//我们这个不需要reduce计算所以把他设置为0
        FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\test07\\\\"));
        FileSystem fs=FileSystem.get(conf);
        Path paths = new Path("D:\\\\新建文件夹\\\\json");
        if (fs.exists(paths))
            fs.delete(paths,true);
        
        FileOutputFormat.setOutputPath(job,paths);
        boolean b = job.waitForCompletion(true);
        System.exit(b?0:1);
    

5.分区解决数据倾斜

准备一个文件里面一个单词非常多别的很少

 技术图片

//先实现我们的分区
public
class PartitonerTest extends Partitioner<Text, IntWritable> @Override public int getPartition(Text text, IntWritable intWritable, int i) Random random=new Random();//定义一个随机数 return random.nextInt(i);//这里面的i就是ReduceTasks的数量

 

//再把数据比较均匀的分散在4各分区中
public
class MapReduceTest public static class Map extends Mapper<LongWritable, Text, Text,IntWritable> Text text; IntWritable intWritable; @Override protected void setup(Context context) throws IOException, InterruptedException text=new Text(); intWritable=new IntWritable(); @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException String[] s = value.toString().split(" "); for (String it:s) intWritable.set(1); text.set(it); System.out.println(text); if (!text.equals("")) context.write(text,intWritable); public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable> @Override protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException int sum=0; for (IntWritable value:values) sum +=value.get(); context.write(key,new IntWritable(sum)); public static void main(String[] args) throws Exception Configuration conf=new Configuration(); Job job=Job.getInstance(conf); job.setJarByClass(MapReduceTest.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setCombinerClass(Reduce.class); job.setPartitionerClass(PartitonerTest.class); job.setNumReduceTasks(4);//这里面我们设置了4个文件 job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\test04")); Path path=new Path("D:\\\\新建文件夹\\\\lx"); FileSystem fileSystem=FileSystem.get(conf); if (fileSystem.exists(path)) fileSystem.delete(path); FileOutputFormat.setOutputPath(job,path); job.submit();

 

//再去我们 刚才分好的四个分区中去计算单词个数
public
class MapReduceTest02 public static class Map extends Mapper<LongWritable, Text, Text,IntWritable> Text text; IntWritable intWritable; @Override protected void setup(Context context) throws IOException, InterruptedException text=new Text(); intWritable=new IntWritable(); @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException String[] s = value.toString().split("\\t"); intWritable.set(Integer.parseInt(s[1])); for (String it:s) text.set(it); context.write(text,intWritable); public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable> @Override protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException int sum=0; for (IntWritable value:values) sum +=value.get(); context.write(key,new IntWritable(sum)); public static void main(String[] args) throws Exception Configuration conf=new Configuration(); Job job=Job.getInstance(conf); job.setJarByClass(MapReduceTest02.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setCombinerClass(Reduce.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\lx")); Path path=new Path("D:\\\\新建文件夹\\\\lx10"); FileSystem fileSystem=FileSystem.get(conf); if (fileSystem.exists(path)) fileSystem.delete(path); FileOutputFormat.setOutputPath(job,path); job.submit();

 5.文件内容转换为对象输出

name,age,score,sex
tom,10,100,女
susa,9,99,男
hua,60,10,dog
多喝水,大
//创建实体类
public class Person implements Writable 
    private String name;
    private int age;
    private int score;
    private String sex;

    public String getName() 
        return name;
    

    public void setName(String name) 
        this.name = name;
    

    public int getAge() 
        return age;
    

    public void setAge(int age) 
        this.age = age;
    

    public int getScore() 
        return score;
    

    public void setScore(int score) 
        this.score = score;
    

    public String getSex() 
        return sex;
    

    public void setSex(String sex) 
        this.sex = sex;
    

    public Person() 
    

    public Person(String name, int age, int score, String sex) 
        this.name = name;
        this.age = age;
        this.score = score;
        this.sex = sex;
    

    @Override
    public void write(DataOutput dataOutput) throws IOException 
            dataOutput.writeUTF(this.name);
            dataOutput.writeInt(this.age);
            dataOutput.writeInt(this.score);
            dataOutput.writeUTF(this.sex);
    

    @Override
    public void readFields(DataInput dataInput) throws IOException 
        this.name = dataInput.readUTF();
        this.age=dataInput.readInt();
        this.score=dataInput.readInt();
        this.sex=dataInput.readUTF();
    

    @Override
    public String toString() 
        return "Person" +
                "name=‘" + name + ‘\\‘‘ +
                ", age=" + age +
                ", score=" + score +
                ", sex=‘" + sex + ‘\\‘‘ +
                ‘‘;
    
public class Data2Object 
    public static class Map extends Mapper<LongWritable, Text,Person, NullWritable>
        Person person;
        String line;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException 
            person=new Person();
            line=new String();
        

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 
            if (key.get()!=0)
                line=new String(value.getBytes(),0,value.getLength(),"GBK");
                String[] split = line.toString().split(",");
                if (split.length>=4)
                    person.setName(split[0]);
                    person.setAge(Integer.parseInt(split[1]));
                    person.setScore(Integer.valueOf(split[2]));
                    person.setSex(split[3]);
                    System.out.println(person);
                    context.write(person,NullWritable.get());
                
            
        

        public static void main(String[] args) throws Exception 
            Configuration conf=new Configuration();
            Job job=Job.getInstance(conf);
            job.setMapperClass(Map.class);
            job.setJarByClass(Data2Object.class);
            job.setMapOutputKeyClass(Person.class);
            job.setMapOutputValueClass(NullWritable.class);

            job.setNumReduceTasks(0);
            FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\test06\\\\"));
            FileSystem fs=FileSystem.get(conf);
            Path paths = new Path("D:\\\\新建文件夹\\\\data");
            if (fs.exists(paths))
                fs.delete(paths,true);
            
            FileOutputFormat.setOutputPath(job,paths);
            boolean b = job.waitForCompletion(true);
            System.exit(b?0:1);
        
    

 6.数据清洗

网站提取
提取出对应公司的名称。
如:http://v.baidu.com/movie提取出baidu
    http://movie.youku.com提取出youku
    https://image.baidu.com提取出baidu
    http://blog.csdn.net/article/details/47444699提取出csdn
数据处理:
统计每个“公司”的上行流量之和、下行流量之和、总流量之和。
总流量=上行流量+下行流量

结果展示形式:
公司,上行流量之和,下行流量之和,总流量之和

 

 这是实例的文件可以多弄点数据

电话号码 网址 上行流量 下行流量
15639120688 http://v.baidu.com/movie 3936 12058
13905256439 http://movie.youku.com 10132 538
15192566948 https://image.baidu.com 19789 5238
14542296218 http://v.baidu.com/tv 7504 13253
17314017739 http://www.weibo.com/?category=7 7003 79
14554637796 http://v.baidu.com/tv 15494 7950
13793181795 http://weibo.com/?category=1760 996 15246
18350161914 https://image.baidu.com 1600 5101
15255537988 http://blog.csdn.net/article/details/47444699 17666 7643
18515646476 https://zhidao.baidu.com/question/1430480451137504979.html 10826 10043
15932420636 http://movie.youku.com 977 3136
18567886220 http://www.weibo.com/?category=7 4652 4336
13694165557 http://movie.youku.com 9610 17284
14728294152 http://www.weibo.com/?category=7 15955 6533
13773776226 http://blog.csdn.net/article/details/47444699 10536 8208
15399710194 http://v.baidu.com/tv 5224 6962
17508400165 http://v.baidu.com/movie 18758 15853
13307519578 http://blog.csdn.net/article/details/47444699 14261 15569
15975769645 http://v.baidu.com/tv 9118 7682
17640300232 http://blog.csdn.net/article/details/47444699 2790 56
18539313261 https://zhidao.baidu.com/question/1430480451137504979.html 1131 18106
15531448828 https://zhidao.baidu.com/question/1430480451137504979.html 2181 2498
17779548801 https://zhidao.baidu.com/question/1430480451137504979.html 1287 5243
//先创建对象的实体类
public class Gongsi implements Writable 
    private String name;
    private int shang;
    private int xia;

    public String getName() 
        return name;
    

    public void setName(String name) 
        this.name = name;
    

    public int getShang() 
        return shang;
    

    public void setShang(int shang) 
        this.shang = shang;
    

    public int getXia() 
        return xia;
    

    public void setXia(int xia) 
        this.xia = xia;
    

    public Gongsi() 
    

    public Gongsi(String name, int shang, int xia) 
        this.name = name;
        this.shang = shang;
        this.xia = xia;
    

    @Override
    public void write(DataOutput dataOutput) throws IOException 
        dataOutput.writeUTF(this.name);
        dataOutput.writeInt(this.shang);
        dataOutput.writeInt(this.xia);
    

    @Override
    public void readFields(DataInput dataInput) throws IOException 
        this.name = dataInput.readUTF();
        this.shang = dataInput.readInt();
        this.xia = dataInput.readInt();
    

    @Override
    public String toString() 
        return "Gongsi" +
                "name=‘" + name + ‘\\‘‘ +
                ", shang=" + shang +
                ", xia=" + xia +
                ‘‘;
    
public class NameMapReduceObject 
    public static class NameMapReduceMap extends Mapper<LongWritable, Text,Text,Gongsi>
        Text text;
        Gongsi gongsi;
        String names;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException 
            text=new Text();
            gongsi=new Gongsi();
        

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 
                String[] split = value.toString().split(" ");
            if (key.get()!=0)
                String[] http = split[1].split("://");
                String[] yuming = http[1].split("/");
                String[] name = yuming[0].split("[.]");
                if (name.length>=3)//判断一下切出来的是否长度大于等于3
                    names=name[1];
                else 
                    names=name[0];
                
                //给实体赋值
                gongsi.setName(names);
                gongsi.setShang(Integer.parseInt(split[2]));
                gongsi.setXia(Integer.parseInt(split[3]));
                text.set(names);
                context.write(text,gongsi);
            

        
    
    public static class NameMapReduceReduce extends Reducer<Text,Gongsi,Text,Text> 
        Text text;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException 
            text=new Text();
        

        @Override
        protected void reduce(Text key, Iterable<Gongsi> values, Context context) throws IOException, InterruptedException 
            int shang=0;
            int xia=0;
            int sum=0;
            for (Gongsi gs:values) 
                shang += gs.getShang();
                xia += gs.getXia();
            
            sum = shang+xia;
            String zong=shang+" "+xia+" "+sum;
            text.set(zong);
            context.write(key,text);
        
    
    public static void main(String[] args) throws Exception 
        Configuration conf=new Configuration();
        Job job=Job.getInstance(conf);
        job.setMapperClass(NameMapReduceMap.class);
        job.setReducerClass(NameMapReduceReduce.class);
        job.setJarByClass(NameMapReduceObject.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Gongsi.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\http\\\\"));
        FileSystem fs=FileSystem.get(conf);
        Path paths = new Path("D:\\\\新建文件夹\\\\name1");
        if (fs.exists(paths))
            fs.delete(paths,true);
        
        FileOutputFormat.setOutputPath(job,paths);
        boolean b = job.waitForCompletion(true);
        System.exit(b?0:1);
    

 7.计算每个人相应科目的平均分

需求说明:
计算每个人相应科目的平均分。

将同一个科目的放在一个文件中,并按照平均分从大到小排序。

结果展示形式:
part1文件中:
computer huangxiaoming  92.5
computer xuzheng  91.2
……

part2文件中:
english zhaobenshan 95.1
english liuyifei 94.3
……
//文件准备
computer,huangxiaoming,85,86,41,75,93,42,85
computer,xuzheng,54,52,86,91,42
computer,huangbo,85,42,96,38
english,zhaobenshan,54,52,86,91,42,85,75
english,liuyifei,85,41,75,21,85,96,14
algorithm,liuyifei,75,85,62,48,54,96,15
computer,huangjiaju,85,75,86,85,85
english,huangdatou,48,58,67,86,15,33,85
algorithm,huanglei,76,95,86,74,68,74,48
algorithm,huangjiaju,85,75,86,85,85,74,86
computer,huangdatou,48,58,67,86,15,33,85
english,zhouqi,85,86,41,75,93,42,85,75,55,47,22
english,huangbo,85,42,96,38,55,47,22
algorithm,liutao,85,75,85,99,66
computer,huangzitao,85,86,41,75,93,42,85
math,wangbaoqiang,85,86,41,75,93,42,85
computer,liujialing,85,41,75,21,85,96,14,74,86
computer,liuyifei,75,85,62,48,54,96,15
computer,liutao,85,75,85,99,66,88,75,91
computer,huanglei,76,95,86,74,68,74,48
english,liujialing,75,85,62,48,54,96,15
math,huanglei,76,95,86,74,68,74,48
math,huangjiaju,85,75,86,85,85,74,86
math,liutao,48,58,67,86,15,33,85
english,huanglei,85,75,85,99,66,88,75,91
math,xuzheng,54,52,86,91,42,85,75
math,huangxiaoming,85,75,85,99,66,88,75,91
math,liujialing,85,86,41,75,93,42,85,75
english,huangxiaoming,85,86,41,75,93,42,85
algorithm,huangdatou,48,58,67,86,15,33,85
algorithm,huangzitao,85,86,41,75,93,42,85,75
//因为我们要进行排序所以要实现WritableComparable重写compareTo方法
public class Arrst implements WritableComparable<Arrst> 
    private String kemu;
    private String name;
    private double abs;

    public Arrst() 
    

    public Arrst(String kemu, String name,double abs) 
        this.kemu = kemu;
        this.name = name;
        this.abs=abs;
    

    public double getAbs() 
        return abs;
    

    public void setAbs(double abs) 
        this.abs = abs;
    

    public String getKemu() 
        return kemu;
    

    public void setKemu(String kemu) 
        this.kemu = kemu;
    

    public String getName() 
        return name;
    

    public void setName(String name) 
        this.name = name;
    

    @Override
    public String toString() 
        return "Arrst" +
                "kemu=‘" + kemu + ‘\\‘‘ +
                ", name=‘" + name + ‘\\‘‘ +
                ", abs=" + abs +
                ‘‘;
    

    @Override
    public void write(DataOutput dataOutput) throws IOException 
        dataOutput.writeUTF(this.kemu);
        dataOutput.writeUTF(this.name);
        dataOutput.writeDouble(this.abs);
    

    @Override
    public void readFields(DataInput dataInput) throws IOException 
        this.kemu = dataInput.readUTF();
        this.name = dataInput.readUTF();
        this.abs=dataInput.readDouble();
    

    @Override
    public int compareTo(Arrst o) 
        if (o.getAbs() == this.getAbs())//如果平均分相同则按照名字排序
            return o.getName().compareTo(this.getName());
        else //如果不同从大到小排序
            return o.getAbs() > this.getAbs() ? 1:-1;
        
    
//编写我们的分区确保每一科目都进入相同的文件中
public class PartitonerScore extends Partitioner<Arrst, Text> 
    Map<String, Integer> num=new HashMap<>();

    @Override
    public int getPartition(Arrst arrst, Text text, int i) 
        num.put("computer",0);
        num.put("english",1);
        num.put("algorithm",2);
        num.put("math",3);
        Integer integer = num.get(arrst.getKemu());//用key获得value
        return integer;
    
public class ScoerMapReduce 
    public static class Map extends Mapper<LongWritable, Text,Arrst,Text> 
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 
            String[] split = value.toString().split(",");
            double sum=0;
            int s=0;
            double abs=0;
            for (int i = 2; i <split.length ; i++) 
                s++;
                sum += Integer.parseInt(split[i]);
            
            abs=sum/s;
            Arrst arrst=new Arrst();
            arrst.setKemu(split[0]);
            arrst.setName(split[1]);
            arrst.setAbs(abs);
            context.write(arrst,new Text(split[0]));
        
    

    public static class Reduce extends Reducer<Arrst,Text,Text,Text> 
        @Override
        protected void reduce(Arrst key, Iterable<Text> values, Context context) throws IOException, InterruptedException 
            for (Text it:values) 
                String ss=key.getName()+" "+key.getAbs();
                context.write(new Text(it),new Text(ss));
            
        
    

    public static void main(String[] args) throws Exception 
        Configuration conf=new Configuration();
        Job job=Job.getInstance(conf);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        job.setJarByClass(ScoerMapReduce.class);
        job.setMapOutputKeyClass(Arrst.class);
        job.setMapOutputValueClass(Text.class);
        job.setPartitionerClass(PartitonerScore.class);

        job.setOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setNumReduceTasks(4);//这里不要忘记写我们想要几个文件
        FileInputFormat.setInputPaths(job,new Path("D:\\\\新建文件夹\\\\score\\\\"));
        Path path = new Path("D:\\\\新建文件夹\\\\scoreoutput");
        FileSystem fileSystem = FileSystem.get(conf);
        if (fileSystem.exists(path))
            fileSystem.delete(path);
        
        FileOutputFormat.setOutputPath(job,path);
        job.submit();
    

 8.MapReducejoin的应用

Map Join

  • 一个数据集很大,另一个数据集很小(能够被完全放进内存中),Map Join  会把小表全部读入内存中,把小表COPY多份分发到大表数据所在的实例上的内存里,在map阶段直接拿另外一个表的数据和内存中表数据做匹配,由于在map阶段中就进行了join操作,省去了reduce运行的效率会高很多。
  •  适用于关联表中有小表的情形;可以将小表分发到所有的map节点,这样,map节点就可以在本地对自己所读到的大表数据进行join操作并输出最终结果,可以大大提高join操作的并发度,加快处理速度。并使用distributecache(分布式缓存)机制将小表的数据分发到每一个maptask执行节点,从而每一个maptask节点可以从本地加载到小表的数据,进而在本地可以实现join

Reduce Join

     Reduce端连接比Map端连接更为普遍,因为输入的数据不需要特定的结构,

                但是效率比较低,因为所有数据都必须经过Shuffle过程。

                   基本思路:

                      1、Map端读取所有的文件,并在输出的内容里加上标示,代表数据是从哪个文                       件里来的。

                      2、在reduce处理函数中,按照标识对数据进行处理。

                      3、然后根据Key去join来求出结果直接输出。

 准备两个文件上传到hdfs集群上

110 鞋子
111 裤子
112 上衣
113 泳衣
110 500
111 300
112 200
113 1000
public class JoinMapReduce 
    public static class JoinMapReduceMap extends Mapper<LongWritable, Text, Text, NullWritable> 
        Map<String,String> shangmap=new HashMap<>();
        String line;
        Text text;
        @Override
        protected void setup(Context context) throws IOException, InterruptedException 
            line=new String();
            text=new Text();
            BufferedReader br = new BufferedReader(new FileReader("file.txt"));//准备一个流读取我们hdfs上的文件
            String line="";
            while ((line=br.readLine())!=null)
                String[] split = line.split(" ");
                shangmap.put(split[0],split[1]);//把数据添加到map集合中
            
        
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException 
            String[] split = value.toString().split(" ");
            String jia = shangmap.get(split[0]);//利用刚才得到的数据座位key去获得value
            String ss=split[0]+" "+jia+" "+split[1];
            text.set(ss);
            context.write(text,NullWritable.get());
        
    


    public static void main(String[] args) throws Exception 
        Configuration conf=new Configuration();
        Job job=Job.getInstance(conf);
        job.setMapperClass(JoinMapReduceMap.class);
        job.setJarByClass(JoinMapReduce.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        job.setNumReduceTasks(0);//不用reduce阶段就设置为0
        FileInputFormat.setInputPaths(job,new Path("hdfs://master:9000/mapjoin/input1/"));
        FileSystem fs=FileSystem.get(conf);
        Path paths = new Path("hdfs://master:9000/outfile");//这个位置的目录不能存在
        job.addCacheFile(new URI("hdfs://master:9000/mapjoin/input2/file.txt"));//这个必须精确到某个文件
        if (fs.exists(paths))
            fs.delete(paths,true);
        
        FileOutputFormat.setOutputPath(job,paths);
        boolean b = job.waitForCompletion(true);
        System.exit(b?0:1);
    

我们把程序都写好后开始打包发送到我们的hadoop集群上

技术图片

用xftp把刚才我们打包好的jar文件拖到我们的linux系统上

然后在存放jar目录下执行

hadoop jar join.jar com.dzm.join.JoinMapReduce

hadoop jar固定这么写   后面的 join.jar是我们刚才发送的jar名字   在后面是我们的全包名+类名

查看我们刚才执行完的程序

hdfs dfs -cat /test/output/part*

/test/output/part*是我们程序里自定义的输出目录位置

以上是关于MapReduce几个小案例的主要内容,如果未能解决你的问题,请参考以下文章

大数据之Hadoop(MapReduce):CombineTextInputFormat案例实操

Hadoop基础(十八):MapReduce框架原理切片机制

MapReduce入门—— MapReduce概述 + WordCount案例实操

0009 - 基于MapReduce的应用案例

大数据之Hadoop(MapReduce):MapReduce扩展案例

MapReduce的思想