MongoDB mapReduce 处理 300 万个文档耗时过长
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了mongoDB mapreduce需要很长时间才能运行3m文件相关的知识,希望对你有一定的参考价值。
我有一个包含 300 万个文档的集合,每个文档有 40 个字段。字段示例如下。
{
"b_date" : "2016-04-05",
"d_date" : "2016-06-25",
"pos" : "MISC",
"origin" : "DXB",
"destination" : "HGA",
"pax" : 1,
"pax_1" : 2
},
{
"b_date" : "2016-04-05",
"d_date" : "2016-06-25",
"pos" : "MISC",
"origin" : "DXB",
"destination" : "HGA",
"pax" : 4,
"pax_1" : 5
},
{
"b_date" : "2016-04-05",
"d_date" : "2016-06-26",
"pos" : "MISC",
"origin" : "DXB",
"destination" : "HGA",
"pax" : 3,
"pax_1" : 3
}
现在我想按 b_date、d_date、pos、origin 和 destination 字段分组,求出 pax 和 pax_1 的总和。
累积 pax 按 pos、origin、destination 字段分组计算,并且累积 pax 和累积 pax_1 应按 b_date 和 d_date 的升序逐步累加。
预期的结果是。
{
"_id.dep_date" : "2016-04-05",
"_id.sale_date" : "2016-06-25",
"_id.pos" : "MISC",
"_id.origin" : "DXB",
"_id.destination" : "HGA",
"value.pax" : 5,
"value.cumulative_pax":5,
"value.pax_1" : 7,
"value.cumulative_pax_1":7,
},
{
"_id.dep_date" : "2016-04-05",
"_id.sale_date" : "2016-06-26",
"_id.pos" : "MISC",
"_id.origin" : "DXB",
"_id.destination" : "HGA",
"value.pax" : 3,
"value.cumulative_pax":8,
"value.pax_1" : 3,
"value.cumulative_pax_1":10,
}
我的mapReduce代码
// Sums pax / pax_1 per (pos, origin, destination, dep_date, sale_date) and adds
// a running (cumulative) total per (pos, origin, destination) route. The output
// is written to the `some_collection` collection.
//
// NOTE(review): mapReduce is deprecated as of MongoDB 5.0. On 5.0+ a $group
// stage followed by $setWindowFields can likely produce the same running totals
// inside the aggregation pipeline -- confirm against the target server version.
db.collection.mapReduce(
  // map: emit one { pax, pax_1 } value per document, keyed by the grouping
  // fields. The key field order is kept as pos, origin, destination, dep_date,
  // sale_date because finalize depends on keys of one route arriving together.
  function () {
    emit(
      {
        'pos': this.pos,
        'origin': this.origin,
        'destination': this.destination,
        'dep_date': this.d_date,
        'sale_date': this.b_date
      },
      {
        'pax': this.pax,
        'pax_1': this.pax_1
      }
    );
  },
  // reduce: add up all emitted values that share the same key. The returned
  // object has the same shape as the emitted value, so re-reducing is safe.
  function (key, values) {
    // Declared with `var` so they do not leak into the global scope.
    var paxTotal = 0;
    var pax1Total = 0;
    for (var i = 0; i < values.length; i++) {
      paxTotal += values[i].pax;
      pax1Total += values[i].pax_1;
    }
    return {
      'pax': paxTotal,
      'pax_1': pax1Total
    };
  },
  {
    // Globals shared between finalize calls: the route of the previously
    // finalized key and the running totals for that route. `result` starts
    // zeroed so the first `+=` never operates on undefined.
    'scope': {
      'pos': '',
      'origin': '',
      'destination': '',
      'result': {
        'pax': 0,
        'pax_1': 0,
        'cumulative_pax': 0,
        'cumulative_pax_1': 0
      }
    },
    // finalize: report the per-key sums and the running totals of the route.
    // NOTE(review): this assumes finalize is invoked once per key in ascending
    // key order on a single node -- confirm for the deployment in use.
    'finalize': function (key, value) {
      // A different route starts here: restart the running totals.
      if (pos !== key.pos ||
          origin !== key.origin ||
          destination !== key.destination) {
        result['cumulative_pax'] = 0;
        result['cumulative_pax_1'] = 0;
      }
      // Per-key sums are replaced; cumulative sums keep growing in the route.
      result['pax'] = value.pax;
      result['cumulative_pax'] += value.pax;
      result['pax_1'] = value.pax_1;
      result['cumulative_pax_1'] += value.pax_1;
      // Remember the route for the comparison in the next finalize call.
      pos = key.pos;
      origin = key.origin;
      destination = key.destination;
      return result;
    },
    'out': 'some_collection'
  }
);
这个 mapReduce 返回了预期的结果,但耗时很长,大约需要 3 小时。是因为 'b_date' 和 'd_date' 是字符串格式的日期吗?或者应该如何进行优化?使用聚合(aggregation)可以在 3 分钟内返回结果,但我无法通过聚合得到累积 pax。
答案
mapReduce 代码:
// Sums pax / pax_1 per (pos, origin, destination, dep_date, sale_date) and adds
// a running (cumulative) total per (pos, origin, destination) route. The output
// is written to the `some_collection` collection.
//
// NOTE(review): mapReduce is deprecated as of MongoDB 5.0. On 5.0+ a $group
// stage followed by $setWindowFields can likely produce the same running totals
// inside the aggregation pipeline -- confirm against the target server version.
db.collection.mapReduce(
  // map: emit one { pax, pax_1 } value per document, keyed by the grouping
  // fields. The key field order is kept as pos, origin, destination, dep_date,
  // sale_date because finalize depends on keys of one route arriving together.
  function () {
    emit(
      {
        'pos': this.pos,
        'origin': this.origin,
        'destination': this.destination,
        'dep_date': this.d_date,
        'sale_date': this.b_date
      },
      {
        'pax': this.pax,
        'pax_1': this.pax_1
      }
    );
  },
  // reduce: add up all emitted values that share the same key. The returned
  // object has the same shape as the emitted value, so re-reducing is safe.
  function (key, values) {
    // Declared with `var` so they do not leak into the global scope.
    var paxTotal = 0;
    var pax1Total = 0;
    for (var i = 0; i < values.length; i++) {
      paxTotal += values[i].pax;
      pax1Total += values[i].pax_1;
    }
    return {
      'pax': paxTotal,
      'pax_1': pax1Total
    };
  },
  {
    // Globals shared between finalize calls: the route of the previously
    // finalized key and the running totals for that route. `result` starts
    // zeroed so the first `+=` never operates on undefined.
    'scope': {
      'pos': '',
      'origin': '',
      'destination': '',
      'result': {
        'pax': 0,
        'pax_1': 0,
        'cumulative_pax': 0,
        'cumulative_pax_1': 0
      }
    },
    // finalize: report the per-key sums and the running totals of the route.
    // NOTE(review): this assumes finalize is invoked once per key in ascending
    // key order on a single node -- confirm for the deployment in use.
    'finalize': function (key, value) {
      // A different route starts here: restart the running totals.
      if (pos !== key.pos ||
          origin !== key.origin ||
          destination !== key.destination) {
        result['cumulative_pax'] = 0;
        result['cumulative_pax_1'] = 0;
      }
      // Per-key sums are replaced; cumulative sums keep growing in the route.
      result['pax'] = value.pax;
      result['cumulative_pax'] += value.pax;
      result['pax_1'] = value.pax_1;
      result['cumulative_pax_1'] += value.pax_1;
      // Remember the route for the comparison in the next finalize call.
      pos = key.pos;
      origin = key.origin;
      destination = key.destination;
      return result;
    },
    'out': 'some_collection'
  }
);
以上是关于mongoDB mapreduce需要很长时间才能运行3m文件的主要内容,如果未能解决你的问题,请参考以下文章