mongoDB统计数据--mapReduce实现
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了mongoDB统计数据--mapReduce实现相关的知识,希望对你有一定的参考价值。
需求背景:按不同的规则,分别显示所有状态(status=0,status=1,...)的总数和指定状态(status=1)的总数,需按“所有状态总数”、“指定状态总数”排序。
需求及技术分析过程:看似非常简单,如果用关系型数据库,可以分别统计所有状态总数、指定状态总数再用join连接再排序实现。
现在只考虑mongo情况吧,下面分别列出尝试多种方式在mongo中的实现过程。
mongoDB版本:3.4.9
ruleInfo数据表结构:
db.ruleInfo.find() { "_id" : ObjectId("59d7b29fec981b49586be66a"), "ruleId" : "100", "status" : 0 } { "_id" : ObjectId("59d7b32bec981b49586be66b"), "ruleId" : "200", "status" : 1 } { "_id" : ObjectId("59d7b364ec981b49586be66c"), "ruleId" : "300", "status" : 0 } { "_id" : ObjectId("59d7b373ec981b49586be66d"), "ruleId" : "400", "status" : 0 } { "_id" : ObjectId("59d7b37eec981b49586be66e"), "ruleId" : "500", "status" : 1 } { "_id" : ObjectId("59d7b39eec981b49586be66f"), "ruleId" : "500", "status" : 0 } { "_id" : ObjectId("59d7b43bec981b49586be670"), "ruleId" : "500", "status" : 0 } { "_id" : ObjectId("59d7b441ec981b49586be671"), "ruleId" : "300", "status" : 1 } { "_id" : ObjectId("59d87edddfbeb7326c37d6c9"), "ruleId" : "300", "status" : 1 } { "_id" : ObjectId("59d99b99b865b2a754d85402"), "ruleId" : "600", "status" : 1 } { "_id" : ObjectId("59d9b159b865b2a754d85403"), "ruleId" : "600", "status" : 1 }
实现方法一(NOK):
1.分别查出统计结果(所有状态总数、指定状态总数),再用程序代码实现汇总
// Query 1: number of documents per ruleId, across every status.
db.ruleInfo.aggregate([
    { "$project": { "ruleId": 1 } },
    { "$group": { "_id": "$ruleId", "allSum": { "$sum": 1 } } },
    // The $group stage outputs only _id and allSum, so allSum is the only
    // meaningful sort key here.
    { "$sort": { "allSum": -1 } }
]);

// Query 2: number of documents per ruleId, restricted to status 1.
// The count is still reported under the field name "allSum".
db.ruleInfo.aggregate([
    { "$match": { "status": 1 } },
    { "$project": { "ruleId": 1 } },
    { "$group": { "_id": "$ruleId", "allSum": { "$sum": 1 } } },
    // Same as above: only _id and allSum exist after $group.
    { "$sort": { "allSum": -1 } }
]);
总结:程序代码实现汇总,为了达到排序的精确度,可能需要把大量的数据查询到内存中。还是放弃了这种实现。
实现方法二(NOK):
1.使用管道操作$group查询中$push方法
// Single $group pass: count the documents of each ruleId and collect every
// document's status into the "statuses" array via $push.
db.ruleInfo.aggregate([
    { $project: { ruleId: 1, status: 1 } },
    {
        $group: {
            _id: "$ruleId",
            allSum: { $sum: 1 },
            statuses: { $push: "$status" }
        }
    },
    // Order by the collected statuses first, then by the total count.
    { $sort: { statuses: -1, allSum: -1 } }
]);
查询结果:
{ "_id" : "600", "allSum" : 2, "statuses" : [ 1, 1 ] } { "_id" : "500", "allSum" : 3, "statuses" : [ 1, 0, 0 ] } { "_id" : "200", "allSum" : 1, "statuses" : [ 1 ] } { "_id" : "300", "allSum" : 3, "statuses" : [ 0, 1, 1 ] } { "_id" : "100", "allSum" : 1, "statuses" : [ 0 ] } { "_id" : "400", "allSum" : 1, "statuses" : [ 0 ] }
总结:程序稍微处理下就可以。但发现数组里的元素排序不是想要的结果。放弃此实现。
2.使用管道操作$group查询中$sum方法
// Single $group pass: count the documents of each ruleId and add up their
// status values into "statuses".
db.ruleInfo.aggregate([
    { $project: { ruleId: 1, status: 1 } },
    {
        $group: {
            _id: "$ruleId",
            allSum: { $sum: 1 },
            // Sum of the raw status values of the group.
            statuses: { $sum: "$status" }
        }
    },
    // Order by the status sum first, then by the total count.
    { $sort: { statuses: -1, allSum: -1 } }
]);
查询结果:
{ "_id" : "300", "allSum" : 3, "statuses" : 2 } { "_id" : "600", "allSum" : 2, "statuses" : 2 } { "_id" : "500", "allSum" : 3, "statuses" : 1 } { "_id" : "200", "allSum" : 1, "statuses" : 1 } { "_id" : "100", "allSum" : 1, "statuses" : 0 } { "_id" : "400", "allSum" : 1, "statuses" : 0 }
总结,能达到想要的效果。刚好我只想要累计status=1的情况,但实际业务中,不太可能只有两种状态!!
实现方法三(NOK)
1.使用$lookup 查询。(注意,$lookup是mongo3.2才有的方法)
// Count documents per ruleId, then join each group back onto ruleInfo so the
// matching source documents are attached as "refVOs".
db.ruleInfo.aggregate([
    { $project: { ruleId: 1 } },
    { $group: { _id: "$ruleId", allSum: { $sum: 1 } } },
    {
        $lookup: {
            from: "ruleInfo",
            localField: "_id",      // the grouped ruleId
            foreignField: "ruleId",
            as: "refVOs"
        }
    },
    // Order by the status values of the joined documents, then by total count.
    { $sort: { "refVOs.status": -1, allSum: -1 } },
    { $limit: 10 }
]);
查询结果:
{ "_id" : "600", "allSum" : 2, "refVOs" : [ { "_id" : ObjectId("59d99b99b865b2a754d85402"), "ruleId" : "600", "status" : 1 }, { "_id" : ObjectId("59d9b159b865b2a754d85403"), "ruleId" : "600", "status" : 1 } ] } { "_id" : "500", "allSum" : 3, "refVOs" : [ { "_id" : ObjectId("59d7b37eec981b49586be66e"), "ruleId" : "500", "status" : 1 }, { "_id" : ObjectId("59d7b39eec981b49586be66f"), "ruleId" : "500", "status" : 0 }, { "_id" : ObjectId("59d7b43bec981b49586be670"), "ruleId" : "500", "status" : 0 } ] } { "_id" : "200", "allSum" : 1, "refVOs" : [ { "_id" : ObjectId("59d7b32bec981b49586be66b"), "ruleId" : "200", "status" : 1 } ] } { "_id" : "300", "allSum" : 3, "refVOs" : [ { "_id" : ObjectId("59d7b364ec981b49586be66c"), "ruleId" : "300", "status" : 0 }, { "_id" : ObjectId("59d7b441ec981b49586be671"), "ruleId" : "300", "status" : 1 }, { "_id" : ObjectId("59d87edddfbeb7326c37d6c9"), "ruleId" : "300", "status" : 1 } ] } { "_id" : "100", "allSum" : 1, "refVOs" : [ { "_id" : ObjectId("59d7b29fec981b49586be66a"), "ruleId" : "100", "status" : 0 } ] } { "_id" : "400", "allSum" : 1, "refVOs" : [ { "_id" : ObjectId("59d7b373ec981b49586be66d"), "ruleId" : "400", "status" : 0 } ] }
总结:排序达不到理想效果。
实现方法四(NOK)
1.使用group聚合命令查询
// group command: fold every document of a ruleId into one tally holding the
// total document count (allSum) and the count of status-1 documents
// (confirmSum).
db.ruleInfo.group({
    key: { ruleId: "$ruleId" },
    initial: { ruleId: "", allSum: 0, confirmSum: 0 },
    reduce: function (current, tally) {
        tally.ruleId = current.ruleId;
        tally.allSum += 1;
        // Loose equality is kept on purpose so the comparison behaves exactly
        // as before for whatever numeric type status is stored as.
        tally.confirmSum += (current.status == 1) ? 1 : 0;
    }
});
查询结果:
[ { "ruleId" : "100", "allSum" : 1, "confirmSum" : 0 }, { "ruleId" : "200", "allSum" : 1, "confirmSum" : 1 }, { "ruleId" : "300", "allSum" : 3, "confirmSum" : 2 }, { "ruleId" : "400", "allSum" : 1, "confirmSum" : 0 }, { "ruleId" : "500", "allSum" : 3, "confirmSum" : 1 }, { "ruleId" : "600", "allSum" : 2, "confirmSum" : 2 } ]
总结:简单的完美实现。问题是不能sort和limit。
实现方法五
1.mapReduce实现
// Map: emit one partial tally per document, keyed by ruleId.
// The emitted value already has the same shape the reduce function returns
// ({ruleCount, confirmCount}), which MongoDB requires because reduce output
// can be fed back into reduce for the same key.
var map = function () {
    emit(this.ruleId, {
        ruleCount: 1,
        // Loose equality kept so status values of any numeric type still match.
        confirmCount: this.status == 1 ? 1 : 0
    });
};

// Reduce: add up partial tallies for one ruleId.
// It sums the fields instead of counting array elements, so the result is the
// same whether it receives raw emits, earlier reduce results, or a mix of both.
var reduce = function (key, tallies) {
    var total = { ruleCount: 0, confirmCount: 0 };
    for (var i = 0; i < tallies.length; i++) {
        total.ruleCount += tallies[i].ruleCount;
        total.confirmCount += tallies[i].confirmCount;
    }
    return total;
};

// No finalize step is needed: a key with a single document skips reduce, but
// its emitted value is already a complete {ruleCount, confirmCount} tally.
db.runCommand({
    mapreduce: "ruleInfo",
    map: map,
    reduce: reduce,
    out: "ruleInfo_result"
});

// Alternative ordering: by total count first, then by status-1 count.
//db.ruleInfo_result.find().sort({"value.ruleCount":-1,"value.confirmCount":-1}).limit(10);

// Ordering used here: by status-1 count first, then by total count.
db.ruleInfo_result.find().sort({ "value.confirmCount": -1, "value.ruleCount": -1 }).limit(10);
查询结果:
{ "_id" : "300", "value" : { "ruleCount" : 3, "confirmCount" : 2 } } { "_id" : "600", "value" : { "ruleCount" : 2, "confirmCount" : 2 } } { "_id" : "500", "value" : { "ruleCount" : 3, "confirmCount" : 1 } } { "_id" : "200", "value" : { "ruleCount" : 1, "confirmCount" : 1 } } { "_id" : "100", "value" : { "ruleCount" : 1, "confirmCount" : 0 } } { "_id" : "400", "value" : { "ruleCount" : 1, "confirmCount" : 0 } }
总结:基本实现。据说mapReduce非常慢!!!
以上是关于mongoDB统计数据--mapReduce实现的主要内容,如果未能解决你的问题,请参考以下文章