10 亿份文档的 Couchbase N1QL 索引

Posted

技术标签:

【中文标题】10 亿份文档的 Couchbase N1QL 索引【英文标题】:Couchbase N1QL Index for 1 billion documents 【发布时间】:2020-01-18 22:01:05 【问题描述】:

我在 Couchbase 企业版(6.0.2 build 2413)中针对大约 10 亿个文档运行以下查询。针对此查询,能创建的性能最高的索引是什么?(希望报告能在特定的时间段内跑完,所以让索引以最快的速度返回结果是主要目标。)

select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount
from (
    select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID
    from `LogBucket` a
    where LoggingType in [3001, 4004, 6002]
    group by LogFileID, RowKey) as a
group by a.LoggingType, a.LogJobID

我尝试创建了以下索引:

CREATE INDEX `data_job_productivity_index1` 
ON `LogBucket`(`LogFileID`,`RowKey`,`LoggingType`,`LogJobID`,`CreateDate`,`SequenceID`) 
PARTITION BY hash((meta().`id`)) WHERE (`LoggingType` in [3001, 4004, 6002])

但是当我检查解释时,它使用了不同的索引(一个专用于不同的报告查询)。


{
  "plan": {
    "#operator": "Sequence",
    "~children": [
      {
        "#operator": "Sequence",
        "~children": [
          {
            "#operator": "IndexScan3",
            "as": "a",
            "index": "analyst_log_LogJob_activity",
            "index_id": "f85999b9b7cc0d3f",
            "index_projection": {
              "primary_key": true
            },
            "keyspace": "LogBucket",
            "namespace": "default",
            "spans": [
              {
                "exact": true,
                "range": [
                  {
                    "high": "3001",
                    "inclusion": 3,
                    "low": "3001"
                  }
                ]
              },
              {
                "exact": true,
                "range": [
                  {
                    "high": "4004",
                    "inclusion": 3,
                    "low": "4004"
                  }
                ]
              },
              {
                "exact": true,
                "range": [
                  {
                    "high": "6002",
                    "inclusion": 3,
                    "low": "6002"
                  }
                ]
              }
            ],
            "using": "gsi"
          },
          {
            "#operator": "Fetch",
            "as": "a",
            "keyspace": "LogBucket",
            "namespace": "default"
          },
          {
            "#operator": "Parallel",
            "~child": {
              "#operator": "Sequence",
              "~children": [
                {
                  "#operator": "Filter",
                  "condition": "((`a`.`LoggingType`) in [3001, 4004, 6002])"
                },
                {
                  "#operator": "InitialGroup",
                  "aggregates": [
                    "max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
                  ],
                  "group_keys": [
                    "(`a`.`LogFileID`)",
                    "(`a`.`RowKey`)"
                  ]
                }
              ]
            }
          },
          {
            "#operator": "IntermediateGroup",
            "aggregates": [
              "max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
            ],
            "group_keys": [
              "(`a`.`LogFileID`)",
              "(`a`.`RowKey`)"
            ]
          },
          {
            "#operator": "FinalGroup",
            "aggregates": [
              "max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
            ],
            "group_keys": [
              "(`a`.`LogFileID`)",
              "(`a`.`RowKey`)"
            ]
          },
          {
            "#operator": "Parallel",
            "~child": {
              "#operator": "Sequence",
              "~children": [
                {
                  "#operator": "InitialProject",
                  "result_terms": [
                    {
                      "expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LoggingType`)"
                    },
                    {
                      "expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LogJobID`)"
                    }
                  ]
                },
                {
                  "#operator": "FinalProject"
                }
              ]
            }
          }
        ]
      },
      {
        "#operator": "Alias",
        "as": "a"
      },
      {
        "#operator": "Parallel",
        "~child": {
          "#operator": "Sequence",
          "~children": [
            {
              "#operator": "InitialGroup",
              "aggregates": [
                "count(*)"
              ],
              "group_keys": [
                "(`a`.`LoggingType`)",
                "(`a`.`LogJobID`)"
              ]
            }
          ]
        }
      },
      {
        "#operator": "IntermediateGroup",
        "aggregates": [
          "count(*)"
        ],
        "group_keys": [
          "(`a`.`LoggingType`)",
          "(`a`.`LogJobID`)"
        ]
      },
      {
        "#operator": "FinalGroup",
        "aggregates": [
          "count(*)"
        ],
        "group_keys": [
          "(`a`.`LoggingType`)",
          "(`a`.`LogJobID`)"
        ]
      },
      {
        "#operator": "Parallel",
        "~child": {
          "#operator": "Sequence",
          "~children": [
            {
              "#operator": "InitialProject",
              "result_terms": [
                {
                  "expr": "(`a`.`LogJobID`)"
                },
                {
                  "as": "LoggingTypeID",
                  "expr": "(`a`.`LoggingType`)"
                },
                {
                  "as": "AffectedLineCount",
                  "expr": "count(*)"
                }
              ]
            },
            {
              "#operator": "FinalProject"
            }
          ]
        }
      }
    ]
  },
  "text": "select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount\nfrom (\n    select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID\n    from `LogBucket` a\n    where LoggingType in [3001, 4004, 6002]\n    group by LogFileID, RowKey) as a\ngroup by a.LoggingType, a.LogJobID"
}

它选择使用的索引是这样创建的:

CREATE INDEX `analyst_log_LogJob_activity` ON `LogBucket`(`LoggingType`,`LogJobID`) PARTITION BY hash((meta().`id`)) 

第二个索引的问题在于,它把全部 10 亿个文档都包含在索引中;而我尝试为这个新报告创建的专用索引,由于带有 LoggingType 的 WHERE 子句,覆盖的文档数量会显著减少。

【问题讨论】:

【参考方案1】:

您可以按如下方式创建覆盖索引。注意:只有当所有查询都使用相同的 LoggingType 值集合时,才应在索引中使用 WHERE 子句(部分索引)。

CREATE INDEX `data_job_productivity_index1` ON `LogBucket`
           (`LoggingType`, `LogFileID`,`RowKey`,`CreateDate`,`SequenceID`, `LogJobID`)
PARTITION BY HASH(META().`id`) WHERE LoggingType IN [3001, 4004, 6002];

SELECT LogJobID, LoggingTypeID, COUNT(1) AS AffectedLineCount
FROM (
    SELECT MAX([CreateDate, SequenceID, LoggingTypeID:LoggingType,LogJobID ])[2].*
    FROM `LogBucket` AS a
    WHERE LoggingType IN [3001, 4004, 6002]
    GROUP BY LogFileID, RowKey) AS a
GROUP BY LoggingTypeID, LogJobID;

确保覆盖内部子查询并使用索引聚合 https://blog.couchbase.com/understanding-index-grouping-aggregation-couchbase-n1ql-query/

探索索引复制以获得高可用性和性能 https://docs.couchbase.com/server/current/learn/services-and-indexes/indexes/index-replication.html

如果 LoggingType、LogFileID、RowKey 是不可变的,可以探索用它们(而不是 META().id)作为分区键 https://blog.couchbase.com/couchbase-gsi-index-partitioning/

https://blog.couchbase.com/create-right-index-get-right-performance/

探索索引顾问(Index Advisor)https://index-advisor.couchbase.com

【讨论】:

hmm,非常有趣。该索引与我创建的索引非常相似,只是字段顺序不同。属性顺序对覆盖索引的定义有影响吗? 查看 blog.couchbase.com/create-right-index-get-right-performance 。MAX 只使用所需的字段,而不是整个文档。

以上是关于10 亿份文档的 Couchbase N1QL 索引的主要内容,如果未能解决你的问题,请参考以下文章

VB.net 中的反引号(Backtick)

无法执行 Spring-data Couchbase 查询

Spring认证中国教育管理中心-Spring Data Couchbase教程九

Elasticsearch我们如何在 5 天内在同一个 Elasticsearch 集群中重新索引 360 亿份文档

删除所有 Couchbase 数据/文档 ios(或删除所有 ios 数据?)

Couchbase:查找二进制文档的索引