Telegraf 问题:[outputs.influxdb] 指标缓冲区溢出;已丢弃 3645 个指标
Posted
技术标签:
【中文标题】Telegraf 问题:[outputs.influxdb] 指标缓冲区溢出;已丢弃 3645 个指标【英文标题】:Telegraf issue: [outputs.influxdb] Metric buffer overflow; 3645 metrics have been dropped 【发布时间】:2020-01-02 18:13:55 【问题描述】:我正在尝试使用 telegraf 从管理着 1100 台虚拟机的 vCenter 中提取指标,并将这些数据存储在 influxdb 中。指标能够“成功”提取,并显示在 grafana 上创建的仪表板中。
但是,当我检查 telegraf 的状态时,报告说有 3000 - 11,000 个指标被丢弃。我不确定这是否与 telegraf 或 InfluxDB 的配置有关。
Telegraf vsphere 配置:
# Read metrics from VMware vCenter
[[inputs.vsphere]]
## List of vCenter URLs to be monitored. These three lines must be uncommented
## and edited for the plugin to work.
vcenters = ["https://***/sdk"]
username = "***"
password = "***"
#
## VMs
## Typical VM metrics (if omitted or empty, all metrics are collected)
vm_metric_include = [
  "cpu.demand.average",
  "cpu.idle.summation",
  "cpu.latency.average",
  "cpu.readiness.average",
  "cpu.ready.summation",
  "cpu.run.summation",
  "cpu.usagemhz.average",
  "cpu.used.summation",
  "cpu.wait.summation",
  "mem.active.average",
  "mem.granted.average",
  "mem.latency.average",
  "mem.swapin.average",
  "mem.swapinRate.average",
  "mem.swapout.average",
  "mem.swapoutRate.average",
  "mem.usage.average",
  "mem.vmmemctl.average",
  "net.bytesRx.average",
  "net.bytesTx.average",
  "net.droppedRx.summation",
  "net.droppedTx.summation",
  "net.usage.average",
  "power.power.average",
  "virtualDisk.numberReadAveraged.average",
  "virtualDisk.numberWriteAveraged.average",
  "virtualDisk.read.average",
  "virtualDisk.readOIO.latest",
  "virtualDisk.throughput.usage.average",
  "virtualDisk.totalReadLatency.average",
  "virtualDisk.totalWriteLatency.average",
  "virtualDisk.write.average",
  "virtualDisk.writeOIO.latest",
  "sys.uptime.latest",
]
# vm_metric_exclude = [] ## Nothing is excluded by default
# vm_instances = true ## true by default
#
## Hosts
## Typical host metrics (if omitted or empty, all metrics are collected)
host_metric_include = [
  "cpu.coreUtilization.average",
  "cpu.costop.summation",
  "cpu.demand.average",
  "cpu.idle.summation",
  "cpu.latency.average",
  "cpu.readiness.average",
  "cpu.ready.summation",
  "cpu.swapwait.summation",
  "cpu.usage.average",
  "cpu.usagemhz.average",
  "cpu.used.summation",
  "cpu.utilization.average",
  "cpu.wait.summation",
  "disk.deviceReadLatency.average",
  "disk.deviceWriteLatency.average",
  "disk.kernelReadLatency.average",
  "disk.kernelWriteLatency.average",
  "disk.numberReadAveraged.average",
  "disk.numberWriteAveraged.average",
  "disk.read.average",
  "disk.totalReadLatency.average",
  "disk.totalWriteLatency.average",
  "disk.write.average",
  "mem.active.average",
  "mem.latency.average",
  "mem.state.latest",
  "mem.swapin.average",
  "mem.swapinRate.average",
  "mem.swapout.average",
  "mem.swapoutRate.average",
  "mem.totalCapacity.average",
  "mem.usage.average",
  "mem.vmmemctl.average",
  "net.bytesRx.average",
  "net.bytesTx.average",
  "net.droppedRx.summation",
  "net.errorsTx.summation",
  "net.usage.average",
  "power.power.average",
  "storageAdapter.numberReadAveraged.average",
  "storageAdapter.numberWriteAveraged.average",
  "storageAdapter.read.average",
  "storageAdapter.write.average",
  "sys.uptime.latest",
]
# host_metric_exclude = [] ## Nothing excluded by default
# host_instances = true ## true by default
#
## Clusters
cluster_metric_include = [] ## if omitted or empty, all metrics are collected
cluster_metric_exclude = [] ## Nothing excluded by default
# cluster_instances = false ## false by default
#
## Datastores
datastore_metric_include = [] ## if omitted or empty, all metrics are collected
# datastore_metric_exclude = [] ## Nothing excluded by default
# datastore_instances = false ## false by default for Datastores only
#
## Datacenters
datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
# datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
#
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"
## number of objects to retrieve per query for realtime resources (vms and hosts)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_objects = 256
## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_metrics = 256
## number of go routines to use for collection and discovery of objects and metrics
# collect_concurrency = 1
# discover_concurrency = 1
## whether or not to force discovery of new objects on initial gather call before collecting metrics
## when true for large environments this may cause errors for time elapsed while collecting metrics
## when false (default) the first collection cycle may result in no or limited metrics while objects are discovered
# force_discover_on_init = false
## the interval before (re)discovering objects subject to metrics collection (default: 300s)
# object_discovery_interval = "300s"
## timeout applies to any of the api request made to vcenter
timeout = "180s"
## When set to true, all samples are sent as integers. This makes the output
## data types backwards compatible with Telegraf 1.9 or lower. Normally all
## samples from vCenter, with the exception of percentages, are integer
## values, but under some conditions, some averaging takes place internally in
## the plugin. Setting this flag to "false" will send values as floats to
## preserve the full precision when averaging takes place.
# use_int_samples = true
## Custom attributes from vCenter can be very useful for queries in order to slice the
## metrics along different dimension and for forming ad-hoc relationships. They are disabled
## by default, since they can add a considerable amount of tags to the resulting metrics. To
## enable, simply set custom_attribute_exclude to [] (empty set) and use custom_attribute_include
## to select the attributes you want to include.
# custom_attribute_include = []
# custom_attribute_exclude = ["*"]
## Optional SSL Config
# ssl_ca = "/path/to/cafile"
# ssl_cert = "/path/to/certfile"
# ssl_key = "/path/to/keyfile"
## Use SSL but skip chain & host verification
insecure_skip_verify = true
Telegraf 代理配置
# Global settings for the Telegraf agent
[agent]
## Collection interval applied to every input by default.
interval = "180s"
## Align collection times to multiples of 'interval',
## e.g. with interval = "10s" collection happens at :00, :10, :20, etc.
round_interval = true
## Upper bound on the number of metrics written to an output in one batch;
## this determines the size of each write handed to the output plugins.
metric_batch_size = 1000
## Upper bound on the number of unwritten metrics kept per output.
## NOTE(review): set to the same value as metric_batch_size here; the
## "Metric buffer overflow" message reported for outputs.influxdb relates
## to this limit -- confirm sizing against the collected metric volume.
metric_buffer_limit = 1000
## Random delay applied before each collection. Every plugin sleeps for a
## random duration within this jitter before collecting, which helps avoid
## many plugins querying things like sysfs at the same moment (that can
## have a measurable effect on the system).
collection_jitter = "0s"
## Flushing interval applied to every output by default. The longest
## possible time between flushes is flush_interval + flush_jitter.
flush_interval = "10s"
## Random amount added to the flush interval, mainly to avoid large write
## spikes when many telegraf instances are running.
## e.g. a jitter of 5s with an interval of 10s flushes every 10-15s.
flush_jitter = "0s"
## Timestamp precision. By default, or when set to "0s", precision follows
## the order of the collection interval, with a maximum of 1s
## (e.g. interval = "10s" gives a precision of "1s").
## Precision is NOT applied to service inputs; each service input is
## responsible for setting timestamps at an appropriate precision.
## Valid time units are "ns", "us" (or "µs"), "ms", "s".
precision = ""
## Enable debug-level logging.
# debug = false
## Restrict logging to error-level messages.
# quiet = false
## Name of the log file; an empty string logs to stderr.
# logfile = ""
## Rotate the log file after this time interval. With a value of 0 no
## time-based rotation takes place.
# logfile_rotation_interval = "0d"
## Rotate the log file once it grows beyond this size. With a value of 0
## no size-based rotation takes place.
# logfile_rotation_max_size = "0MB"
## Number of rotated archives to retain; older logs are deleted.
## A value of -1 keeps every archive.
# logfile_rotation_max_archives = 5
## Hostname override; when empty, os.Hostname() is used.
hostname = ""
## When true, do not set the "host" tag in the telegraf agent.
omit_hostname = false
Telegraf influxDB 插件配置
# Output plugin: write collected metrics to InfluxDB
[[outputs.influxdb]]
database = "vmware"
timeout = "0s"
urls = ["http://***********"]
我在运行 systemctl status -l telegraf 时收到以下错误:
[outputs.influxdb] Metric buffer overflow; 3645 metrics have been dropped
不断有不同数量的指标被丢弃
我也收到此错误,这可能是一个原因或突出了另一个问题:
[agent] input "inputs.vsphere" did not complete within its interval(输入“inputs.vsphere”未在其间隔内完成)
不确定问题出在哪里
【问题讨论】:
【参考方案1】:这里有两个问题,而且两者的报错信息都相当明确。
inputs.vsphere
[agent] 输入“inputs.vsphere”未在其间隔内完成
您运行 inputs.vsphere 的时间间隔小于该插件收集数据所需的时间。您可能需要了解如何并发运行多个收集器(running several collectors concurrently)以加快收集速度。如果仅靠这一点还不够,您需要在提高并发度与增大采集间隔(collection interval)之间找到最佳平衡点。
不要害怕这样做。根据我的经验,人们大大高估了他们实际需要的粒度。我见过把间隔从 10 秒改为几分钟的部署。请注意,您可以为每个输入插件单独设置间隔(set the interval per input plugin)。
outputs.influxdb
[outputs.influxdb] 度量缓冲区溢出;已删除 3645 个指标
您收集的数据点超出了缓冲区的容量。只需将 metric_buffer_limit 的值增加被丢弃指标的最大数量(再额外加一些余量以策安全)。与 interval 一样,您可以在代理(agent)范围内进行此设置(该值随后会应用于每个输出),也可以仅针对 outputs.influxdb 进行设置。
【讨论】:
以上是关于Telegraf 问题:[outputs.influxdb] 度量缓冲区溢出;已删除 3645 个指标的主要内容,如果未能解决你的问题,请参考以下文章
搭建grafana+influxdb+telegraf,以及常见问题