10 亿份文档的 Couchbase N1QL 索引
Posted
技术标签:
【中文标题】10 亿份文档的 Couchbase N1QL 索引【英文标题】:Couchbase N1QL Index for 1 billion documents 【发布时间】:2020-01-18 22:01:05 【问题描述】:我在 Couchbase 企业版(6.0.2 build 2413)中针对大约 10 亿个文档运行以下查询。针对这个查询,应该创建什么样的索引才能获得最佳性能?(报表需要在限定的时间内跑完,因此首要目标是让索引尽可能快地返回结果。)
/* Report: for each (LoggingType, LogJobID) pair, count the (LogFileID, RowKey)
   groups whose selected log entry carries that pair. */
SELECT
    LogJobID,
    LoggingType AS LoggingTypeID,
    COUNT(*) AS AffectedLineCount
FROM (
    /* One row per (LogFileID, RowKey). MAX over [CreateDate, SequenceID, a]
       keeps the entry with the greatest CreateDate, then SequenceID; element
       [2] is the whole document `a`, from which the two fields are read. */
    SELECT
        MAX([CreateDate, SequenceID, a])[2].LoggingType,
        MAX([CreateDate, SequenceID, a])[2].LogJobID
    FROM `LogBucket` a
    WHERE LoggingType IN [3001, 4004, 6002]
    GROUP BY LogFileID, RowKey
) AS a
GROUP BY a.LoggingType, a.LogJobID
我尝试创建了以下索引:
/* Partial index restricted to the three LoggingType values the report filters
   on. Key order starts with the inner GROUP BY fields (LogFileID, RowKey);
   LoggingType, the only filtered field, is the third key. */
CREATE INDEX `data_job_productivity_index1`
ON `LogBucket` (
    `LogFileID`,
    `RowKey`,
    `LoggingType`,
    `LogJobID`,
    `CreateDate`,
    `SequenceID`
)
PARTITION BY HASH(META().`id`)
WHERE `LoggingType` IN [3001, 4004, 6002]
但是当我查看 EXPLAIN 的输出时,查询使用的却是另一个索引(该索引是专门为另一个报表查询创建的)。
"plan":
"#operator": "Sequence",
"~children": [
"#operator": "Sequence",
"~children": [
"#operator": "IndexScan3",
"as": "a",
"index": "analyst_log_LogJob_activity",
"index_id": "f85999b9b7cc0d3f",
"index_projection":
"primary_key": true
,
"keyspace": "LogBucket",
"namespace": "default",
"spans": [
"exact": true,
"range": [
"high": "3001",
"inclusion": 3,
"low": "3001"
]
,
"exact": true,
"range": [
"high": "4004",
"inclusion": 3,
"low": "4004"
]
,
"exact": true,
"range": [
"high": "6002",
"inclusion": 3,
"low": "6002"
]
],
"using": "gsi"
,
"#operator": "Fetch",
"as": "a",
"keyspace": "LogBucket",
"namespace": "default"
,
"#operator": "Parallel",
"~child":
"#operator": "Sequence",
"~children": [
"#operator": "Filter",
"condition": "((`a`.`LoggingType`) in [3001, 4004, 6002])"
,
"#operator": "InitialGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
]
,
"#operator": "IntermediateGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
,
"#operator": "FinalGroup",
"aggregates": [
"max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])"
],
"group_keys": [
"(`a`.`LogFileID`)",
"(`a`.`RowKey`)"
]
,
"#operator": "Parallel",
"~child":
"#operator": "Sequence",
"~children": [
"#operator": "InitialProject",
"result_terms": [
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LoggingType`)"
,
"expr": "((max([(`a`.`CreateDate`), (`a`.`SequenceID`), `a`])[2]).`LogJobID`)"
]
,
"#operator": "FinalProject"
]
]
,
"#operator": "Alias",
"as": "a"
,
"#operator": "Parallel",
"~child":
"#operator": "Sequence",
"~children": [
"#operator": "InitialGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
]
,
"#operator": "IntermediateGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
,
"#operator": "FinalGroup",
"aggregates": [
"count(*)"
],
"group_keys": [
"(`a`.`LoggingType`)",
"(`a`.`LogJobID`)"
]
,
"#operator": "Parallel",
"~child":
"#operator": "Sequence",
"~children": [
"#operator": "InitialProject",
"result_terms": [
"expr": "(`a`.`LogJobID`)"
,
"as": "LoggingTypeID",
"expr": "(`a`.`LoggingType`)"
,
"as": "AffectedLineCount",
"expr": "count(*)"
]
,
"#operator": "FinalProject"
]
]
,
"text": "select LogJobID, LoggingType as LoggingTypeID, count(*) as AffectedLineCount\nfrom (\n select Max([CreateDate, SequenceID, a])[2].LoggingType, Max([CreateDate, SequenceID, a])[2].LogJobID\n from `LogBucket` a\n where LoggingType in [3001, 4004, 6002]\n group by LogFileID, RowKey) as a\ngroup by a.LoggingType, a.LogJobID"
它选择使用的索引是这样创建的:
/* Index on (LoggingType, LogJobID) with no WHERE clause, hash-partitioned
   on the document id. */
CREATE INDEX `analyst_log_LogJob_activity`
ON `LogBucket` (
    `LoggingType`,
    `LogJobID`
)
PARTITION BY HASH(META().`id`)
第二个索引的问题在于,它包含了全部 10 亿个文档;而我为这份新报表专门创建的索引带有 LoggingType 的 WHERE 子句,所包含的文档数量会显著减少。
【问题讨论】:
【参考方案1】:您可以按如下方式创建覆盖索引。只有当所有相关查询都使用同一组 LoggingType 值时,才应在索引定义中加入 WHERE 子句。
/* Partial index for the report query: LoggingType (the filtered field) is the
   leading key, followed by the GROUP BY fields (LogFileID, RowKey) and the
   fields read inside MAX (CreateDate, SequenceID, LogJobID). */
CREATE INDEX `data_job_productivity_index1`
ON `LogBucket` (
    `LoggingType`,
    `LogFileID`,
    `RowKey`,
    `CreateDate`,
    `SequenceID`,
    `LogJobID`
)
PARTITION BY HASH(META().`id`)
WHERE `LoggingType` IN [3001, 4004, 6002];
/* Report: for each (LoggingTypeID, LogJobID) pair, count the (LogFileID, RowKey)
   groups whose selected log entry carries that pair. */
SELECT
    LogJobID,
    LoggingTypeID,
    COUNT(1) AS AffectedLineCount
FROM (
    /* One row per (LogFileID, RowKey). The third array element is a small
       object holding only the two needed fields (not the whole document), so
       every field referenced here is a key of the index defined above.
       MAX keeps the array with the greatest CreateDate, then SequenceID;
       [2].* expands that object into the columns LoggingTypeID and LogJobID.
       The braces of the object constructor are required: without them the
       `name: value` pair is not valid inside an array literal. */
    SELECT MAX([CreateDate, SequenceID, {"LoggingTypeID": LoggingType, LogJobID}])[2].*
    FROM `LogBucket` AS a
    WHERE LoggingType IN [3001, 4004, 6002]
    GROUP BY LogFileID, RowKey
) AS a
GROUP BY LoggingTypeID, LogJobID;
确保内部子查询被索引完全覆盖,并且使用了索引分组与聚合(index grouping and aggregation) https://blog.couchbase.com/understanding-index-grouping-aggregation-couchbase-n1ql-query/
探索索引复制以获得高可用性和性能 https://docs.couchbase.com/server/current/learn/services-and-indexes/indexes/index-replication.html
如果 LoggingType、LogFileID、RowKey 是不可变的,可以考虑用它们代替 META().id 作为分区键 https://blog.couchbase.com/couchbase-gsi-index-partitioning/
https://blog.couchbase.com/create-right-index-get-right-performance/
试用索引顾问(Index Advisor) https://index-advisor.couchbase.com
【讨论】:
hmm,非常有趣。该索引与我创建的索引非常相似,只是键的顺序不同。键(属性)的顺序对覆盖索引的定义有影响吗? 请查看 blog.couchbase.com/create-right-index-get-right-performance。另外,MAX 中只放入所需的字段,而不是整个文档。以上是关于10 亿份文档的 Couchbase N1QL 索引的主要内容,如果未能解决你的问题,请参考以下文章
Spring认证中国教育管理中心-Spring Data Couchbase教程九
Elasticsearch我们如何在 5 天内在同一个 Elasticsearch 集群中重新索引 360 亿份文档