可直接拿来用的kafka+prometheus+grafana监控告警配置
Posted NetWhite
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了可直接拿来用的kafka+prometheus+grafana监控告警配置相关的知识,希望对你有一定的参考价值。
kafka配置jmx_exporter
点击:https://github.com/prometheus/jmx_exporter,选择下面的jar包下载:
将下载好的这个agent jar包上传到kafka的broker节点所在服务器上,每个broker都需要,比如上传到如下路径:
/opt/agent/jmx_prometheus_javaagent-0.16.1.jar
修改kafka启动脚本: bin/kafka-run-class.sh,增加java agent配置如下:
# Attach the Prometheus jmx_exporter java agent: it serves metrics on port 9095
# using the rule file /opt/agent/kafka_broker.yml.
JMX_EXPORTER_OPTS='-javaagent:/opt/agent/jmx_prometheus_javaagent-0.16.1.jar=9095:/opt/agent/kafka_broker.yml'
# Append to the JMX options that are already set so they are preserved.
KAFKA_JMX_OPTS="${KAFKA_JMX_OPTS} ${JMX_EXPORTER_OPTS}"
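这两行代码可以放在脚本的最后几行(即最终拼装并执行 java 命令之前)。注意:bin/kafka-run-class.sh 同时也被 kafka-topics.sh 等命令行工具调用,broker 运行期间再执行这些工具时,agent 会因 9095 端口已被占用而启动失败;如遇到该问题,可改为只在 broker 的启动环境中设置(例如启动 broker 前导出 KAFKA_OPTS):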
这里指定了9095作为端口,jmx_exporter用到的kafka_broker.yml 配置如下:
---
# jmx_exporter configuration for a Kafka broker. This is the file passed to the
# -javaagent option as /opt/agent/kafka_broker.yml.
#
# Regex patterns are written as plain (unquoted) YAML scalars, where a
# backslash is literal: use a single backslash (\w, \d), not a doubled one.

# Seconds to wait after JVM start before the agent starts serving metrics.
startDelaySeconds: 20
lowercaseOutputName: true
lowercaseOutputLabelNames: true

# MBeans that are never queried (JMX ObjectName patterns).
blacklistObjectNames:
  - "kafka.consumer:type=*,id=*"
  - "kafka.consumer:type=*,client-id=*"
  - "kafka.consumer:type=*,client-id=*,node-id=*"
  - "kafka.producer:type=*,id=*"
  - "kafka.producer:type=*,client-id=*"
  - "kafka.producer:type=*,client-id=*,node-id=*"
  - "kafka.*:type=kafka-metrics-count,*"
  # This will ignore the admin client metrics from Kafka Brokers and will blacklist certain metrics
  # that do not make sense for ingestion.
  # "kafka.admin.client:type=*, node-id=*, client-id=*"
  # "kafka.admin.client:type=*, client-id=*"
  # "kafka.admin.client:type=*, id=*"
  - "kafka.admin.client:*"
  - "kafka.server:type=*,cipher=*,protocol=*,listener=*,networkProcessor=*"
  - "kafka.server:type=*"

# Rules are evaluated top to bottom; the first matching pattern wins, so the
# more specific patterns must stay above the more generic ones.
rules:
  # This is by far the biggest contributor to the number of sheer metrics being produced.
  # Always keep it on the top for the case of probability when so many metrics will hit the first condition and exit.
  # "kafka.cluster:type=*, name=*, topic=*, partition=*"
  # "kafka.log:type=*,name=*, topic=*, partition=*"
  - pattern: kafka.(\w+)<type=(.+), name=(.+), topic=(.+), partition=(.+)><>Value
    name: kafka_$1_$2_$3
    type: GAUGE
    labels:
      topic: "$4"
      partition: "$5"
  # "kafka.server:type=*,name=*, client-id=*, topic=*, partition=*"
  - pattern: kafka.server<type=(.+), name=(.+), clientId=(.+), topic=(.+), partition=(.*)><>Value
    name: kafka_server_$1_$2
    type: GAUGE
    labels:
      clientId: "$3"
      topic: "$4"
      partition: "$5"
  - pattern: kafka.server<type=(.+), name=(.+), clientId=(.+), brokerHost=(.+), brokerPort=(.+)><>Value
    name: kafka_server_$1_$2
    type: GAUGE
    labels:
      clientId: "$3"
      broker: "$4:$5"
  # "kafka.network:type=*, name=*, request=*, error=*"
  # "kafka.network:type=*, name=*, request=*, version=*"
  - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+), (.+)=(.+)><>(Count|Value)
    name: kafka_$1_$2_$3
    labels:
      "$4": "$5"
      "$6": "$7"
  - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.*), (.+)=(.+)><>(\d+)thPercentile
    name: kafka_$1_$2_$3
    type: GAUGE
    labels:
      "$4": "$5"
      "$6": "$7"
      quantile: "0.$8"
  # "kafka.rest:type=*, topic=*, partition=*, client-id=*"
  # "kafka.rest:type=*, cipher=*, protocol=*, client-id=*"
  - pattern: kafka.(\w+)<type=(.+), (.+)=(.+), (.+)=(.+), (.+)=(.+)><>Value
    name: kafka_$1_$2
    labels:
      "$3": "$4"
      "$5": "$6"
      "$7": "$8"
  # Count and Value
  # "kafka.server:type=*, name=*, topic=*"
  # "kafka.server:type=*, name=*, clientId=*"
  # "kafka.server:type=*, name=*, delayedOperation=*"
  # "kafka.server:type=*, name=*, fetcherType=*"
  # "kafka.network:type=*, name=*, networkProcessor=*"
  # "kafka.network:type=*, name=*, processor=*"
  # "kafka.network:type=*, name=*, request=*"
  # "kafka.network:type=*, name=*, listener=*"
  # "kafka.log:type=*, name=*, logDirectory=*"
  # "kafka.log:type=*, name=*, op=*"
  # "kafka.rest:type=*, node-id=*, client-id=*"
  - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.+)><>(Count|Value)
    name: kafka_$1_$2_$3
    labels:
      "$4": "$5"
  # "kafka.consumer:type=*, topic=*, client-id=*"
  # "kafka.producer:type=*, topic=*, client-id=*"
  # "kafka.rest:type=*, topic=*, client-id=*"
  # "kafka.server:type=*, broker-id=*, fetcher-id=*"
  # "kafka.server:type=*, listener=*, networkProcessor=*"
  - pattern: kafka.(\w+)<type=(.+), (.+)=(.+), (.+)=(.+)><>(Count|Value)
    name: kafka_$1_$2
    labels:
      "$3": "$4"
      "$5": "$6"
  # "kafka.network:type=*, name=*"
  # "kafka.server:type=*, name=*"
  # "kafka.controller:type=*, name=*"
  # "kafka.databalancer:type=*, name=*"
  # "kafka.log:type=*, name=*"
  # "kafka.utils:type=*, name=*"
  - pattern: kafka.(\w+)<type=(.+), name=(.+)><>(Count|Value)
    name: kafka_$1_$2_$3
  # "kafka.producer:type=*, client-id=*"
  # "kafka.producer:type=*, id=*"
  # "kafka.rest:type=*, client-id=*"
  # "kafka.rest:type=*, http-status-code=*"
  # "kafka.server:type=*, BrokerId=*"
  # "kafka.server:type=*, listener=*"
  # "kafka.server:type=*, id=*"
  - pattern: kafka.(\w+)<type=(.+), (.+)=(.+)><>Value
    name: kafka_$1_$2
    labels:
      "$3": "$4"
  - pattern: kafka.server<type=KafkaRequestHandlerPool, name=RequestHandlerAvgIdlePercent><>OneMinuteRate
    name: kafka_server_kafkarequesthandlerpool_requesthandleravgidlepercent_total
    type: GAUGE
  # "kafka.server:type=*, listener=*, networkProcessor=*, clientSoftwareName=*, clientSoftwareVersion=*"
  - pattern: kafka.server<type=socket-server-metrics, clientSoftwareName=(.+), clientSoftwareVersion=(.+), listener=(.+), networkProcessor=(.+)><>connections
    name: kafka_server_socketservermetrics_connections
    type: GAUGE
    labels:
      client_software_name: "$1"
      client_software_version: "$2"
      listener: "$3"
      network_processor: "$4"
  - pattern: "kafka.server<type=socket-server-metrics, listener=(.+), networkProcessor=(.+)><>(.+):"
    name: kafka_server_socketservermetrics_$3
    type: GAUGE
    labels:
      listener: "$1"
      network_processor: "$2"
  # "kafka.coordinator.group:type=*, name=*"
  # "kafka.coordinator.transaction:type=*, name=*"
  - pattern: kafka.coordinator.(\w+)<type=(.+), name=(.+)><>(Count|Value)
    name: kafka_coordinator_$1_$2_$3
  # Percentile
  - pattern: kafka.(\w+)<type=(.+), name=(.+), (.+)=(.*)><>(\d+)thPercentile
    name: kafka_$1_$2_$3
    type: GAUGE
    labels:
      "$4": "$5"
      quantile: "0.$6"
  - pattern: kafka.(\w+)<type=(.+), name=(.+)><>(\d+)thPercentile
    name: kafka_$1_$2_$3
    type: GAUGE
    labels:
      quantile: "0.$4"
将kafka每个broker都这样配置,重启kafka。
Prometheus配置
修改prometheus的配置prometheus.yml,增加如下配置:
# Scrape job for the Kafka brokers' jmx_exporter endpoints (agent port 9095).
# Add this entry to the list under `scrape_configs:` in prometheus.yml, indented
# to match the other jobs there. The Grafana dashboard queries filter on
# job="kafka" and on the `env` label, so keep both.
- job_name: 'kafka'
  metrics_path: /metrics
  static_configs:
    - targets: ['kafka1:9095', 'kafka2:9095', 'kafka3:9095']
      labels:
        env: "test"
p.s. 注意job_name不要修改,值就是"kafka";env 标签也需要保留(下面的面板同时按 job="kafka" 和 env="$env" 过滤),要不我下面的grafana不能直接用,还需要每个面板依次修改查询语句。
Grafana配置
下面的Grafana面板我已经配置好,可以直接拿来用,之后可以根据需要增加或删除相关面板:
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Kafka resource usage and throughput",
"editable": true,
"gnetId": 721,
"graphTooltip": 0,
"id": 4,
"iteration": 1628943241052,
"links": [],
"panels": [
{
"collapsed": false,
"datasource": null,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 42,
"panels": [],
"title": "集群健康检查",
"type": "row"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"description": "Number of active controllers in the cluster.",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"nullValueMode": "connected",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
},
{
"color": "#e5ac0e",
"value": 2
},
{
"color": "#bf1b00"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 0,
"y": 1
},
"id": 12,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"fieldOptions": {
"calcs": [
"lastNotNull"
]
},
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "7.3.1",
"targets": [
{
"expr": "sum(kafka_controller_kafkacontroller_activecontrollercount{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "激活状态控制器数量",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"description": "Number of Brokers Online",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"nullValueMode": "connected",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#d44a3a",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 0
},
{
"color": "semi-dark-green",
"value": 2
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 4,
"y": 1
},
"id": 14,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"fieldOptions": {
"calcs": [
"lastNotNull"
]
},
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "7.3.1",
"repeat": null,
"repeatDirection": "h",
"targets": [
{
"expr": "count(kafka_server_replicamanager_leadercount{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"})",
"format": "time_series",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"title": "在线broker数量",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"description": "Unclean leader election rate",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"nullValueMode": "connected",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 1
},
{
"color": "#d44a3a"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 8,
"y": 1
},
"id": 16,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"fieldOptions": {
"calcs": [
"lastNotNull"
]
},
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "7.3.1",
"targets": [
{
"expr": "sum(kafka_controller_controllerstats_uncleanleaderelectionspersec{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "Unclean Leader选举比率",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"description": "",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"nullValueMode": "connected",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 2
},
{
"color": "#d44a3a"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 12,
"y": 1
},
"id": 33,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"fieldOptions": {
"calcs": [
"lastNotNull"
]
},
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "7.3.1",
"targets": [
{
"expr": "sum(kafka_controller_kafkacontroller_preferredreplicaimbalancecount{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"})",
"format": "time_series",
"intervalFactor": 1,
"refId": "A"
}
],
"title": "未平衡到首选副本的数量",
"type": "stat"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 8,
"w": 8,
"x": 16,
"y": 1
},
"hiddenSeries": false,
"id": 84,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(kafka_server_brokertopicmetrics_bytesinpersec{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\",topic!=\\"\\"}[5m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Bytes in",
"metric": "kafka_server_brokertopicmetrics_bytesinpersec",
"refId": "A",
"step": 4
},
{
"expr": "sum(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\",topic!=\\"\\"}[5m]))",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "Bytes out",
"metric": "kafka_server_brokertopicmetrics_bytesinpersec",
"refId": "B",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Broker网络吞吐量",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "Bps",
"label": "Bytes/s",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"description": "Partitions that are online",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"nullValueMode": "connected",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#d44a3a",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 0
},
{
"color": "#299c46",
"value": 0
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 0,
"y": 5
},
"id": 18,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"fieldOptions": {
"calcs": [
"lastNotNull"
]
},
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "7.3.1",
"targets": [
{
"expr": "sum(kafka_server_replicamanager_partitioncount{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"title": "在线分区数",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"description": "Number of partitions that dont have an active leader and are hence not writable or readable.",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"nullValueMode": "connected",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "semi-dark-green",
"value": null
},
{
"color": "#bf1b00",
"value": 1
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 4,
"y": 5
},
"id": 22,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"fieldOptions": {
"calcs": [
"lastNotNull"
]
},
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "7.3.1",
"targets": [
{
"expr": "sum(kafka_controller_kafkacontroller_offlinepartitionscount{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"timeFrom": null,
"timeShift": null,
"title": "离线分区数量",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"description": "Number of under-replicated partitions (| ISR | < | all replicas |).",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"nullValueMode": "connected",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "semi-dark-green",
"value": null
},
{
"color": "rgba(237, 129, 40, 0.89)",
"value": 1
},
{
"color": "#bf1b00",
"value": 5
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 8,
"y": 5
},
"id": 20,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"fieldOptions": {
"calcs": [
"lastNotNull"
]
},
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "7.3.1",
"targets": [
{
"expr": "sum(kafka_server_replicamanager_underreplicatedpartitions{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"})",
"format": "time_series",
"hide": false,
"intervalFactor": 2,
"refId": "A"
}
],
"title": "没有保持同步的分区数",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "Prometheus",
"description": "Number of partitions under min insync replicas.",
"fieldConfig": {
"defaults": {
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"nullValueMode": "connected",
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "semi-dark-green",
"value": null
},
{
"color": "#bf1b00",
"value": 1
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 4,
"w": 4,
"x": 12,
"y": 5
},
"id": 32,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "value",
"fieldOptions": {
"calcs": [
"lastNotNull"
]
},
"graphMode": "area",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "",
"values": false
},
"textMode": "auto"
},
"pluginVersion": "7.3.1",
"targets": [
{
"expr": "sum(kafka_cluster_partition_underminisr{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"})",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
}
],
"title": "isr小于最少同步副本的分区数",
"type": "stat"
},
{
"collapsed": false,
"datasource": null,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 9
},
"id": 40,
"panels": [],
"title": "系统状态",
"type": "row"
},
{
"aliasColors": {
"localhost:7071": "#629E51"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 10
},
"hiddenSeries": false,
"id": 27,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "irate(process_cpu_seconds_total{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"}[5m])*100",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"metric": "process_cpu_secondspersec",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "CPU 使用率",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percent",
"label": "Cores",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {
"localhost:7071": "#BA43A9"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"y": 10
},
"hiddenSeries": false,
"id": 2,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum without(area)(jvm_memory_bytes_used{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"})",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"metric": "jvm_memory_bytes_used",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "JVM 内存使用情况",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": "Memory",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {
"localhost:7071": "#890F02"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 10
},
"hiddenSeries": false,
"id": 3,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum without(gc)(rate(jvm_gc_collection_seconds_sum{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\"}[5m]))",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"metric": "jvm_gc_collection_seconds_sum",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "GC耗时",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "percentunit",
"label": "% time in GC",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {
"localhost:7071": "#629E51"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 17
},
"hiddenSeries": false,
"id": 128,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "jvm_gc_collection_seconds_count{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\", gc=~'.*Young.*'} - jvm_gc_collection_seconds_count{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\", gc=~'.*Young.*'} offset 1m",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"metric": "process_cpu_secondspersec",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "最近1分钟YGC次数",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": "Cores",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {
"localhost:7071": "#629E51"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 17
},
"hiddenSeries": false,
"id": 129,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "jvm_gc_collection_seconds_count{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\", gc=~'.*Old.*'} - jvm_gc_collection_seconds_count{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\", gc=~'.*Old.*'} offset 1m",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"metric": "process_cpu_secondspersec",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "最近1分钟FGC次数",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "none",
"label": "Cores",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"collapsed": false,
"datasource": null,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 24
},
"id": 29,
"panels": [],
"title": "消息的吞吐量",
"type": "row"
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 8,
"x": 0,
"y": 25
},
"hiddenSeries": false,
"id": 10,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum without(topic)(rate(kafka_server_brokertopicmetrics_messagesinpersec{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\",topic=~\\"$topic\\"}[5m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"metric": "kafka_server_brokertopicmetrics_messagesinpersec",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "每个Broker的消息速率",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "iops",
"label": "Messages/s",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 7,
"w": 8,
"x": 8,
"y": 25
},
"hiddenSeries": false,
"id": 7,
"isNew": true,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesinpersec{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\",topic=~\\"$topic\\"}[5m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{instance}}",
"metric": "kafka_server_brokertopicmetrics_bytesinpersec",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "每个Broker每秒入站字节数",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": "cumulative"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "Bps",
"label": "Bytes/s",
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 8,
"x": 16,
"y": 25
},
"hiddenSeries": false,
"id": 9,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\",topic=~\\"$topic\\"}[5m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{instance}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "每个Broker上每秒出站字节数",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "Prometheus",
"decimals": null,
"editable": true,
"error": false,
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"grid": {},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 32
},
"hiddenSeries": false,
"id": 4,
"isNew": true,
"legend": {
"alignAsTable": true,
"avg": true,
"current": true,
"max": true,
"min": false,
"rightSide": true,
"show": true,
"sort": "current",
"sortDesc": true,
"total": false,
"values": true
},
"lines": true,
"linewidth": 2,
"links": [],
"nullPointMode": "connected",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.3.1",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum without(instance)(rate(kafka_server_brokertopicmetrics_messagesinpersec{job=\\"kafka\\",env=\\"$env\\",instance=~\\"$broker_id\\",topic=~\\"$topic\\"}[5m]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "{{topic}}",
"metric": "kafka_server_brokertopicmetrics_messagesinpersec",
"refId": "A",
"step": 4
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "每个topic消息速率",
"tooltip": {
"msResolution": false,
"shared": true,
"sort": 2,
"value_type": 以上是关于可直接拿来用的kafka+prometheus+grafana监控告警配置的主要内容,如果未能解决你的问题,请参考以下文章