mongoDB统计数据--mapReduce实现

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了mongoDB统计数据--mapReduce实现相关的知识,希望对你有一定的参考价值。

需求背景:按不同的规则,分别显示所有状态(status=0,status=1,...)的总数和指定状态(status=1)的总数,需按“所有状态总数”、“指定状态总数”排序。

需求及技术分析过程:看似非常简单,如果用关系型数据库,可以分别统计所有状态总数、指定状态总数再用join连接再排序实现。

          现在只考虑mongo情况吧,下面分别列出尝试多种方式在mongo中的实现过程。

mongoDB版本:3.4.9

ruleInfo数据表结构
 db.ruleInfo.find()
{ "_id" : ObjectId("59d7b29fec981b49586be66a"), "ruleId" : "100", "status" : 0 }
{ "_id" : ObjectId("59d7b32bec981b49586be66b"), "ruleId" : "200", "status" : 1 }
{ "_id" : ObjectId("59d7b364ec981b49586be66c"), "ruleId" : "300", "status" : 0 }
{ "_id" : ObjectId("59d7b373ec981b49586be66d"), "ruleId" : "400", "status" : 0 }
{ "_id" : ObjectId("59d7b37eec981b49586be66e"), "ruleId" : "500", "status" : 1 }
{ "_id" : ObjectId("59d7b39eec981b49586be66f"), "ruleId" : "500", "status" : 0 }
{ "_id" : ObjectId("59d7b43bec981b49586be670"), "ruleId" : "500", "status" : 0 }
{ "_id" : ObjectId("59d7b441ec981b49586be671"), "ruleId" : "300", "status" : 1 }
{ "_id" : ObjectId("59d87edddfbeb7326c37d6c9"), "ruleId" : "300", "status" : 1 }
{ "_id" : ObjectId("59d99b99b865b2a754d85402"), "ruleId" : "600", "status" : 1 }
{ "_id" : ObjectId("59d9b159b865b2a754d85403"), "ruleId" : "600", "status" : 1 }

 

实现方法一(NOK):

1.分别查出统计结果(所有状态总数、指定状态总数),再用程序代码实现汇总

// Count the documents of every status per ruleId, largest total first.
// Only allSum is sorted on: it is the only computed field this $group emits,
// so a sort key on any other name (e.g. "statuses") would have no effect.
db.ruleInfo.aggregate([
    {"$project": {"ruleId": 1}},
    {"$group": {"_id": "$ruleId", "allSum": {"$sum": 1}}},
    {"$sort": {"allSum": -1}}
]);

// Count only the status=1 documents per ruleId, largest count first.
// The count is still reported under the name "allSum".
// Only allSum is sorted on: it is the only computed field this $group emits,
// so a sort key on any other name (e.g. "statuses") would have no effect.
db.ruleInfo.aggregate([
    {"$match": {"status": 1}},
    {"$project": {"ruleId": 1}},
    {"$group": {"_id": "$ruleId", "allSum": {"$sum": 1}}},
    {"$sort": {"allSum": -1}}
]);

 

 总结:程序代码实现汇总,为了达到排序的精确度,可能需要把大量的数据查询到内存中。还是放弃了这种实现。

 

实现方法二(NOK):

1.使用管道操作$group查询中$push方法

// Group by ruleId, counting the documents (allSum) and collecting every status
// value into the "statuses" array with $push; then sort on that array first
// and on the count second, both descending.
db.ruleInfo.aggregate([
    { $project: { ruleId: 1, status: 1 } },
    {
        $group: {
            _id: "$ruleId",
            allSum: { $sum: 1 },
            statuses: { $push: "$status" }
        }
    },
    { $sort: { statuses: -1, allSum: -1 } }
]);

 

查询结果:

{ "_id" : "600", "allSum" : 2, "statuses" : [ 1, 1 ] }
{ "_id" : "500", "allSum" : 3, "statuses" : [ 1, 0, 0 ] }
{ "_id" : "200", "allSum" : 1, "statuses" : [ 1 ] }
{ "_id" : "300", "allSum" : 3, "statuses" : [ 0, 1, 1 ] }
{ "_id" : "100", "allSum" : 1, "statuses" : [ 0 ] }
{ "_id" : "400", "allSum" : 1, "statuses" : [ 0 ] }

 

总结:程序稍微处理下就可以。但发现数组里的元素排序不是想要的结果。放弃此实现。

2.使用管道操作$group查询中$sum方法

// Group by ruleId, counting the documents (allSum) and adding up the raw status
// values into "statuses". That sum equals the number of status=1 documents
// only while every status is 0 or 1.
// NOTE(review): summing a $cond expression (1 when status equals the wanted
// value, otherwise 0) would count one specific status whatever the others are.
db.ruleInfo.aggregate([
    { $project: { ruleId: 1, status: 1 } },
    {
        $group: {
            _id: "$ruleId",
            allSum: { $sum: 1 },
            statuses: { $sum: "$status" }
        }
    },
    { $sort: { statuses: -1, allSum: -1 } }
]);

 

查询结果:

{ "_id" : "300", "allSum" : 3, "statuses" : 2 }
{ "_id" : "600", "allSum" : 2, "statuses" : 2 }
{ "_id" : "500", "allSum" : 3, "statuses" : 1 }
{ "_id" : "200", "allSum" : 1, "statuses" : 1 }
{ "_id" : "100", "allSum" : 1, "statuses" : 0 }
{ "_id" : "400", "allSum" : 1, "statuses" : 0 }

 

总结,能达到想要的效果。刚好我只想要累计status=1的情况,但实际业务中,不太可能只有两种状态!!

 

实现方法三(NOK)

1.使用$lookup 查询。(注意,$lookup是mongo3.2才有的方法)

// Count the documents per ruleId, then join each group back onto ruleInfo
// (group _id against ruleId) so the matching documents land in "refVOs".
// The result is sorted on refVOs.status, then on the count, and capped at 10.
db.ruleInfo.aggregate([
    { $project: { ruleId: 1 } },
    { $group: { _id: "$ruleId", allSum: { $sum: 1 } } },
    {
        $lookup: {
            from: "ruleInfo",
            localField: "_id",
            foreignField: "ruleId",
            as: "refVOs"
        }
    },
    { $sort: { "refVOs.status": -1, allSum: -1 } },
    { $limit: 10 }
]);

 

查询结果:

{ "_id" : "600", "allSum" : 2, "refVOs" : [ { "_id" : ObjectId("59d99b99b865b2a754d85402"), "ruleId" : "600", "status" : 1 }, { "_id" : ObjectId("59d9b159b865b2a754d85403"), "ruleId" : "600", "status" : 1 } ] }
{ "_id" : "500", "allSum" : 3, "refVOs" : [ { "_id" : ObjectId("59d7b37eec981b49586be66e"), "ruleId" : "500", "status" : 1 }, { "_id" : ObjectId("59d7b39eec981b49586be66f"), "ruleId" : "500", "status" : 0 }, { "_id" : ObjectId("59d7b43bec981b49586be670"), "ruleId" : "500", "status" : 0 } ] }
{ "_id" : "200", "allSum" : 1, "refVOs" : [ { "_id" : ObjectId("59d7b32bec981b49586be66b"), "ruleId" : "200", "status" : 1 } ] }
{ "_id" : "300", "allSum" : 3, "refVOs" : [ { "_id" : ObjectId("59d7b364ec981b49586be66c"), "ruleId" : "300", "status" : 0 }, { "_id" : ObjectId("59d7b441ec981b49586be671"), "ruleId" : "300", "status" : 1 }, { "_id" : ObjectId("59d87edddfbeb7326c37d6c9"), "ruleId" : "300", "status" : 1 } ] }
{ "_id" : "100", "allSum" : 1, "refVOs" : [ { "_id" : ObjectId("59d7b29fec981b49586be66a"), "ruleId" : "100", "status" : 0 } ] }
{ "_id" : "400", "allSum" : 1, "refVOs" : [ { "_id" : ObjectId("59d7b373ec981b49586be66d"), "ruleId" : "400", "status" : 0 } ] }

 

总结:排序达不到理想效果。

 

实现方法四(NOK)

1.使用group聚合命令查询

// Tally per ruleId via db.ruleInfo.group: allSum counts every document and
// confirmSum counts the ones whose status is 1.
db.ruleInfo.group({
    key: {"ruleId": "$ruleId"},
    initial: {"ruleId": "", "allSum": 0, "confirmSum": 0},
    // Called once per document; `tally` is the running result for its group.
    reduce: function (current, tally) {
        tally.ruleId = current.ruleId;
        tally.allSum++;
        tally.confirmSum += (current.status == 1) ? 1 : 0;
    }
});

 

查询结果:

[
    {
        "ruleId" : "100",
        "allSum" : 1,
        "confirmSum" : 0
    },
    {
        "ruleId" : "200",
        "allSum" : 1,
        "confirmSum" : 1
    },
    {
        "ruleId" : "300",
        "allSum" : 3,
        "confirmSum" : 2
    },
    {
        "ruleId" : "400",
        "allSum" : 1,
        "confirmSum" : 0
    },
    {
        "ruleId" : "500",
        "allSum" : 3,
        "confirmSum" : 1
    },
    {
        "ruleId" : "600",
        "allSum" : 2,
        "confirmSum" : 2
    }
]

 

总结:简单的完美实现。问题是不能sort和limit。

 

实现方法五

1.mapReduce实现

// Map step: emit one partial tally per document, keyed by ruleId.
// The emitted value deliberately has the same shape as reduce's return value
// ({ruleCount, confirmCount}) so that reduce can be fed its own output when
// MongoDB re-reduces partial results, and so that keys with a single document
// (for which reduce is never called) already carry a complete tally.
var map = function () {
    emit(this.ruleId, {
        ruleCount: 1,
        // Loose equality is kept from the original comparison so that any
        // status value that coerces to 1 still counts.
        confirmCount: this.status == 1 ? 1 : 0
    });
};

// Reduce step: add up the partial tallies for one ruleId.
//   key   - the ruleId being reduced (unused, required by the signature)
//   emits - array of {ruleCount, confirmCount} values, each either emitted by
//           map or returned by an earlier reduce call for the same key
// Returns a {ruleCount, confirmCount} object holding the sums. Summing the
// stored counts (rather than counting array entries) keeps the result correct
// no matter how the values are batched across reduce calls.
var reduce = function (key, emits) {
    var ruleCount = 0;
    var confirmCount = 0;
    for (var i = 0; i < emits.length; i++) {
        ruleCount += emits[i].ruleCount;
        confirmCount += emits[i].confirmCount;
    }
    return {"ruleCount": ruleCount, "confirmCount": confirmCount};
};

// Finalize step: the tallies are already in their final shape, so the value is
// passed through unchanged. The function stays defined because the mapreduce
// command below references it by name.
var finalize = function (key, values) {
    return values;
};
// Run the map-reduce over the ruleInfo collection with the map, reduce and
// finalize functions defined above, writing the per-ruleId tallies to the
// ruleInfo_result collection.
db.runCommand({
    mapreduce: "ruleInfo",
    map: map,
    reduce: reduce,
    finalize: finalize,
    out: "ruleInfo_result"
});
// Alternative ordering (total count first, then status=1 count):
//db.ruleInfo_result.find().sort({"value.ruleCount":-1,"value.confirmCount":-1}).limit(10);
// Top 10 rules ordered by status=1 count, then by total count, both descending.
db.ruleInfo_result.find().sort({"value.confirmCount": -1, "value.ruleCount": -1}).limit(10);

 

查询结果:

{ "_id" : "300", "value" : { "ruleCount" : 3, "confirmCount" : 2 } }
{ "_id" : "600", "value" : { "ruleCount" : 2, "confirmCount" : 2 } }
{ "_id" : "500", "value" : { "ruleCount" : 3, "confirmCount" : 1 } }
{ "_id" : "200", "value" : { "ruleCount" : 1, "confirmCount" : 1 } }
{ "_id" : "100", "value" : { "ruleCount" : 1, "confirmCount" : 0 } }
{ "_id" : "400", "value" : { "ruleCount" : 1, "confirmCount" : 0 } }

 

总结:基本实现。据说mapReduce非常慢!!!

 

 

 

 

以上是关于mongoDB统计数据--mapReduce实现的主要内容,如果未能解决你的问题,请参考以下文章

mongodb学习3---mongo的MapReduce

MongoDB——聚合操作之MapReduce

MongoDB——聚合操作之MapReduce

Mongodb中数据聚合之基本聚合函数count、distinct、group

mongoDB MapReduce

用mongodb的mapreduce可以加两个条件吗