(16)mongodb mapReduce分布式统计示例遇到的一个未解问题,求平均值不对,希望哪位大神给指点一下
Posted javasl
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了(16)mongodb mapReduce分布式统计示例遇到的一个未解问题,求平均值不对,希望哪位大神给指点一下相关的知识,希望对你有一定的参考价值。
mapReduce 的优势在于分布式,这一节记录一个分布式统计的示例,整个过程分为这几步:启动分布式服务、手动分片、导入数据、执行 mapReduce。下面单独说说这几步。
1、启动分布式服务
参考前面的第10节和第11节,不在重复说了。
2、手动分片:
sh.enableSharding(‘test‘); 用test库做分片
sh.shardCollection(‘test.dz‘,sn:1); 指定片键,test库的dz集合,sn为片键
mongos> for(var i=1;i<=8;i++)
... sh.splitAt(‘test.dz‘,sn:i*1000);
... 手动分片,当sn的值遇到1k、2k、3k...时形成一个chunk块
3、导数据,共7052条
./bin/mongoimport --port 30000 -d test -c dz --type csv -f sn,date,lev,wei,jing,deep,area --headerline --file ./地震数据.csv
登录27017查看,导入了3053条
> db.dz.find().count();
3053
登录27018查看,导入了3999条
> db.dz.find().count();
3999
4、统计地址数据的分布,经纬度都是每跨5度统计在一起。
var map = function()
var j = Math.floor(this.jing / 5) * 5;
var w = Math.floor(this.wei / 5) * 5;
var block =j + ‘:‘ + w;
emit(block,1);
var reduce = function(block,values)
return Array.sum(values);
db.dz.mapReduce(map,reduce,out:‘res‘);
分析:Math.floor()函数是取整数部分 ,计算出的 j 应该是每隔 5 度为单位,例如:5、10、15、20 等,计算出 j 等于 15,经度应该坐落在区间 [15,20)内。纬度同理可以 这样得到。看一下结果:
mongos> db.res.find(); "_id" : "-100:15", "value" : 9 "_id" : "-105:-40", "value" : 1 "_id" : "-105:10", "value" : 1 "_id" : "-105:15", "value" : 6 "_id" : "-105:5", "value" : 1 "_id" : "-10:-25", "value" : 1 "_id" : "-110:15", "value" : 4 "_id" : "-110:20", "value" : 1 "_id" : "-110:25", "value" : 1 "_id" : "-115:-30", "value" : 1 "_id" : "-115:-35", "value" : 2 "_id" : "-115:20", "value" : 1 "_id" : "-115:25", "value" : 4 "_id" : "-120:30", "value" : 1 "_id" : "-120:35", "value" : 3 "_id" : "-125:-60", "value" : 1 "_id" : "-125:35", "value" : 1 "_id" : "-125:40", "value" : 2 "_id" : "-130:40", "value" : 2 "_id" : "-130:45", "value" : 4 Type "it" for more mongos> it "_id" : "-135:50", "value" : 8 "_id" : "-135:55", "value" : 2 "_id" : "-140:55", "value" : 3 "_id" : "-145:55", "value" : 1 "_id" : "-145:60", "value" : 1 "_id" : "-150:55", "value" : 1 "_id" : "-150:65", "value" : 1 "_id" : "-155:55", "value" : 6 "_id" : "-155:60", "value" : 2 "_id" : "-15:-20", "value" : 1 "_id" : "-15:-5", "value" : 1 "_id" : "-15:-60", "value" : 1 "_id" : "-15:70", "value" : 2 "_id" : "-160:15", "value" : 1 "_id" : "-160:55", "value" : 2 "_id" : "-160:65", "value" : 1 "_id" : "-165:50", "value" : 2 "_id" : "-170:50", "value" : 5 "_id" : "-175:-20", "value" : 16 "_id" : "-175:-25", "value" : 13 Type "it" for more mongos> it "_id" : "-175:50", "value" : 10 "_id" : "-180:-15", "value" : 2 "_id" : "-180:-20", "value" : 26 "_id" : "-180:-25", "value" : 23 "_id" : "-180:-30", "value" : 7 "_id" : "-180:-35", "value" : 22 "_id" : "-180:50", "value" : 9 "_id" : "-20:-40", "value" : 1 "_id" : "-20:-45", "value" : 1 "_id" : "-20:-5", "value" : 1 "_id" : "-20:-60", "value" : 1 "_id" : "-20:-65", "value" : 1 "_id" : "-25:-5", "value" : 1 "_id" : "-25:-60", "value" : 3 "_id" : "-30:-60", "value" : 15 "_id" : "-30:-65", "value" : 1 "_id" : "-35:-60", "value" : 1 "_id" : "-35:50", "value" : 1 "_id" : "-35:55", "value" : 2 "_id" : "-45:10", "value" : 1 Type "it" for more
为了验证数据的正确性,可以将数据导入mysql中,以下面这三条数据为例子说明 mapReduce 的统计是正确的
"_id" : "-100:15", "value" : 9 select * from dzsj w WHERE w.jing>=-100 and w.jing<-95 and w.wei>=15 and w.wei<20
"_id" : "-115:25", "value" : 4 select * from dzsj w WHERE w.jing>=-115 and w.jing<-110 and w.wei>=25 and w.wei<30
"_id" : "-155:55", "value" : 6 select * from dzsj w WHERE w.jing>=-155 and w.jing<-150 and w.wei>=55 and w.wei<60
5、在上面的基础上统计平均震级,这次只统计经纬度大于0的,这里出现了问题尚未解决。
var map = function()
if(this.jing<0 || this.wei<0)
return;
var j = Math.floor(this.jing / 5) * 5;
var w = Math.floor(this.wei / 5) * 5;
var block =j + ‘:‘ + w;
emit(block,this.lev);
var reduce = function(block,values)
return Array.avg(values);
db.dz.mapReduce(map,reduce,out:‘res‘);
执行结果如下:
mongos> db.res.find().sort(value:-1); "_id" : "65:25", "value" : 7.5 "_id" : "140:65", "value" : 7.3 "_id" : "60:25", "value" : 7.050000000000001 "_id" : "95:50", "value" : 7 "_id" : "140:25", "value" : 6.920833333333333 "_id" : "150:50", "value" : 6.85 "_id" : "25:40", "value" : 6.8 "_id" : "95:5", "value" : 6.8 "_id" : "125:10", "value" : 6.783333333333333 "_id" : "165:50", "value" : 6.733333333333333 "_id" : "90:20", "value" : 6.666666666666667 "_id" : "160:50", "value" : 6.645 "_id" : "175:50", "value" : 6.608333333333333 "_id" : "125:30", "value" : 6.6 "_id" : "145:0", "value" : 6.6 "_id" : "90:0", "value" : 6.5166666666666675 "_id" : "155:50", "value" : 6.4875 "_id" : "45:30", "value" : 6.47 "_id" : "140:10", "value" : 6.45 "_id" : "135:30", "value" : 6.445833333333333 Type "it" for more mongos> it "_id" : "140:15", "value" : 6.4 "_id" : "145:15", "value" : 6.4 "_id" : "145:5", "value" : 6.4 "_id" : "135:35", "value" : 6.35 "_id" : "140:20", "value" : 6.300000000000001 "_id" : "95:15", "value" : 6.300000000000001 "_id" : "165:55", "value" : 6.3 "_id" : "160:55", "value" : 6.254166666666666 "_id" : "140:40", "value" : 6.239583333333333 "_id" : "125:5", "value" : 6.222916666666666 "_id" : "125:0", "value" : 6.217499999999999 "_id" : "5:70", "value" : 6.2 "_id" : "65:40", "value" : 6.2 "_id" : "155:45", "value" : 6.1899999999999995 "_id" : "120:10", "value" : 6.185714285714285 "_id" : "145:45", "value" : 6.175000000000001 "_id" : "170:50", "value" : 6.166666666666666 "_id" : "25:35", "value" : 6.154166666666667 "_id" : "120:0", "value" : 6.15 "_id" : "135:25", "value" : 6.15 Type "it" for more mongos>
拿出两个数据来对比,发现并不是我们要的结果:
"_id" : "140:20", "value" : 6.300000000000001
select AVG(lev) from dzsj w WHERE w.jing>=140 and w.jing<145 and w.wei>=20 and w.wei<25 计算结果是 6.333333333333333
"_id" : "145:45", "value" : 6.175000000000001
select * from dzsj w WHERE w.jing>=145 and w.jing<150 and w.wei>=45 and w.wei<50 计算结果是 6.08
"_id" : "160:55", "value" : 6.114285714285715
select AVG(lev) from dzsj w WHERE w.jing>=160 and w.jing<165 and w.wei>=55 and w.wei<60 计算结果是 6.050000000000001
我们先求一下和,执行以下代码:
var map = function() if(this.jing<0 || this.wei<0) return; var j = Math.floor(this.jing / 5) * 5; var w = Math.floor(this.wei / 5) * 5; var block =j + ‘:‘ + w; emit(block,this.lev); var map = function() var j = Math.floor(this.jing / 5) * 5; var w = Math.floor(this.wei / 5) * 5; var block =j + ‘:‘ + w; emit(block,this.lev); var reduce = function(block,values) return Array.sum(values); db.dz.mapReduce(map,reduce,out:‘res‘);
查看一下部分结果:
mongos> db.res.find(_id:‘140:20‘); "_id" : "140:20", "value" : 19 mongos> db.res.find(_id:‘145:45‘); "_id" : "145:45", "value" : 30.400000000000002 mongos> db.res.find(_id:‘160:55‘); "_id" : "160:55", "value" : 48.400000000000006 mongos>
与mysql中对比一下,发现求和是一样的。
select SUM(lev) from dzsj w WHERE w.jing>=140 and w.jing<145 and w.wei>=20 and w.wei<25 19
select SUM(lev) from dzsj w WHERE w.jing>=145 and w.jing<150 and w.wei>=45 and w.wei<50 30.400000000000002
select SUM(lev) from dzsj w WHERE w.jing>=160 and w.jing<165 and w.wei>=55 and w.wei<60 48.400000000000006
在对比一下前面求的数量,发现数量也是一样的。
"_id" : "140:20", "value" : 3
"_id" : "145:45", "value" : 5
"_id" : "160:55", "value" : 8
select count(1) from dzsj w WHERE w.jing>=140 and w.jing<145 and w.wei>=20 and w.wei<25 3
select count(1) from dzsj w WHERE w.jing>=145 and w.jing<150 and w.wei>=45 and w.wei<50 5
select count(1) from dzsj w WHERE w.jing>=160 and w.jing<165 and w.wei>=55 and w.wei<60 8
这就奇怪了,求和一样,求数量也一样,计算出的平均值不一样,哪位好心人能发现问题希望能指点一二,谢谢!
以上是关于(16)mongodb mapReduce分布式统计示例遇到的一个未解问题,求平均值不对,希望哪位大神给指点一下的主要内容,如果未能解决你的问题,请参考以下文章