zabbix 监控zookeeper篇
Posted 终点即起点
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了zabbix 监控zookeeper篇相关的知识,希望对你有一定的参考价值。
zabbix 监控zookeeper篇
安装依赖包
yum install -y nc
yum install -y zabbix-sender
nc 命令
echo ruok|nc 127.0.0.1 2181
imok

echo mntr|nc 127.0.0.1 2181
zk_version 3.4.6-1569965, built on 02/20/2014 09:09 GMT
zk_avg_latency 0
zk_max_latency 6
zk_min_latency 0
zk_packets_received 93114
zk_packets_sent 93113
zk_num_alive_connections 4
zk_outstanding_requests 0
zk_server_state leader
zk_znode_count 29
zk_watch_count 0
zk_ephemerals_count 14
zk_approximate_data_size 1087
zk_open_file_descriptor_count 39
zk_max_file_descriptor_count 1000000
zk_followers 4
zk_synced_followers 4
zk_pending_syncs 0

echo srvr|nc 127.0.0.1 2181
Zookeeper version: 3.4.6-1569965, built on 02/20/2014 09:09 GMT
Latency min/avg/max: 0/0/6
Received: 93121
Sent: 93120
Connections: 4
Outstanding: 0
Zxid: 0x900000020
Mode: leader
Node count: 29
ZooKeeper监控项
zk_avg/min/max_latency 响应一个客户端请求的时间,建议这个时间大于10个Tick就报警
zk_outstanding_requests 排队请求的数量,当ZooKeeper超过了它的处理能力时,这个值会增大,建议设置报警阈值为10
zk_packets_received 接收到客户端请求的包数量
zk_packets_sent 发送给客户端的包数量,主要是响应和通知
zk_max_file_descriptor_count 最大允许打开的文件数,由ulimit控制
zk_open_file_descriptor_count 打开文件数量,当这个值大于允许值的85%时报警
Mode 运行的角色,如果没有加入集群就是standalone,加入集群则是follower或者leader
zk_followers leader角色才会有这个输出,集合中follower的个数。正常的值应该是集合成员的数量减1
zk_pending_syncs leader角色才会有这个输出,pending syncs的数量
zk_znode_count znodes的数量
zk_watch_count watches的数量
Java Heap Size ZooKeeper Java进程的堆内存大小
编写Zabbix监控ZooKeeper的脚本和配置文件
将这些监控数据一次性使用zabbix_sender全部发送给zabbix。采用zabbix_sender一次性发送全部监控数据的脚本,首先想办法将监控项目汇集成一个字典,然后遍历这个字典,将字典中的key:value对通过zabbix_sender的-k和-o参数指定发送出去
vim zookeeper.py
#!/usr/bin/python
"""Collect ZooKeeper health metrics and push them to Zabbix.

The script talks to a local ZooKeeper server with the ``mntr`` and ``ruok``
four-letter-word commands, merges the answers into one dictionary and sends
every entry to Zabbix through ``zabbix_sender`` under the key
``zookeeper.status[<metric>]``.

Example of the collected dictionary followed by the ZooKeeper pid::

    {'zk_followers': 0, 'zk_outstanding_requests': 0,
     'zk_approximate_data_size': 890971, 'zk_packets_sent': 5818488,
     'zk_pending_syncs': 0, 'zk_avg_latency': 0,
     'zk_version': '3.4.6-1569965, built on 02/20/2014 09:09 GMT',
     'zk_watch_count': 1364, 'zk_packets_received': 5797681,
     'zk_open_file_descriptor_count': 46, 'zk_server_ruok': 'imok',
     'zk_server_state': 'follower', 'zk_synced_followers': 0,
     'zk_max_latency': 400, 'zk_num_alive_connections': 18,
     'zk_min_latency': 0, 'zk_ephemerals_count': 1112,
     'zk_znode_count': 2207, 'zk_max_file_descriptor_count': 4096}
    31022

The code is written to run unchanged on Python 2.6+ and Python 3, which is
why ``print`` is always called with exactly one pre-formatted string.
"""
import os
import socket
import subprocess
import sys

zabbix_sender = '/usr/bin/zabbix_sender'
zabbix_conf = '/etc/zabbix/zabbix_agentd.conf'
# > 0: really call zabbix_sender; otherwise only print the command line.
send_to_zabbix = 1
accepted_modes = ['alive', 'all']


class ZooKeeperServer(object):
    """Query one ZooKeeper server and forward its metrics to Zabbix."""

    # Metrics that 'mntr' only reports on the leader; they are defaulted to 0
    # elsewhere so that the Zabbix items always receive a value.
    _LEADER_ONLY_METRICS = ('zk_followers', 'zk_synced_followers',
                            'zk_pending_syncs')

    def __init__(self, host='localhost', port='2181', timeout=1):
        """Remember the server address and the socket timeout in seconds."""
        self._address = (host, int(port))
        self._timeout = timeout
        self._result = {}

    def _create_socket(self):
        """Return a new, unconnected TCP socket."""
        return socket.socket()

    def _send_cmd(self, cmd):
        """Send a four-letter-word command and return the full text reply.

        The reply is read until the server closes the connection, because a
        single ``recv`` is not guaranteed to return the whole answer.  If
        the socket times out after some data has already arrived, that
        partial data is returned; a timeout before any data is re-raised.
        Connection errors propagate to the caller.
        """
        s = self._create_socket()
        chunks = []
        try:
            s.settimeout(self._timeout)
            s.connect(self._address)
            if not isinstance(cmd, bytes):
                # Python 3 text string; on Python 2 ``str`` already is bytes.
                cmd = cmd.encode('ascii')
            s.sendall(cmd)
            while True:
                try:
                    chunk = s.recv(4096)
                except socket.timeout:
                    if not chunks:
                        raise
                    break
                if not chunk:
                    break
                chunks.append(chunk)
        finally:
            # Always release the descriptor, even when connect/recv failed.
            s.close()
        data = b''.join(chunks)
        if not isinstance(data, str):
            # Python 3: turn the received bytes into text for the parsers.
            data = data.decode('utf-8', 'replace')
        return data

    def get_stats(self):
        """Collect the server stats, store them in ``_result`` and return them.

        The dictionary holds every ``mntr`` metric plus ``zk_server_ruok``.
        An empty reply to either command simply contributes no entries.
        Leader-only metrics that are missing are set to 0.
        """
        data_mntr = self._send_cmd('mntr')
        data_ruok = self._send_cmd('ruok')

        result = {}
        if data_mntr:
            result.update(self._parse(data_mntr))
        if data_ruok:
            # e.g. {'zk_server_ruok': 'imok'}
            result.update(self._parse_ruok(data_ruok))
        for name in self._LEADER_ONLY_METRICS:
            result.setdefault(name, 0)

        self._result = result
        return self._result

    def _parse(self, data):
        """Parse the output of the 'mntr' command into a dictionary.

        :param data: tab separated ``key<TAB>value`` lines, e.g.
            ``zk_outstanding_requests<TAB>0``
        :return: e.g. ``{'zk_outstanding_requests': 0}``; lines that do not
            match the expected format are skipped.
        """
        result = {}
        for line in data.splitlines():
            try:
                key, value = self._parse_line(line)
            except ValueError:
                continue  # ignore broken lines
            result[key] = value
        return result

    def _parse_ruok(self, data):
        """Parse the output of the 'ruok' command.

        :param data: the raw reply, normally ``imok``
        :return: ``{'zk_server_ruok': 'imok'}``, or ``{}`` for an empty reply
        """
        result = {}
        lines = data.splitlines()
        ruok = lines[0].strip() if lines else ''
        if ruok:
            result['zk_server_ruok'] = ruok
        return result

    def _parse_line(self, line):
        """Split one ``key<TAB>value`` line into a ``(key, value)`` tuple.

        The value is converted to ``int`` when possible and otherwise kept
        as a string.

        :raises ValueError: if the line does not have exactly two tab
            separated fields or if the key is empty.
        """
        # e.g. "zk_max_file_descriptor_count\t65535"
        fields = line.split('\t')
        if len(fields) != 2:
            raise ValueError('Found invalid line: %s' % line)
        key, value = fields[0].strip(), fields[1].strip()
        if not key:
            raise ValueError('The key is mandatory and should not be empty')
        try:
            value = int(value)
        except (TypeError, ValueError):
            pass
        return key, value

    def get_pid(self):
        """Return the pid of the running ZooKeeper java process as a string.

        Returns an empty string when no matching process is found.
        """
        # The command is a constant, so running it through the shell is safe.
        pidarg = '''ps -ef|grep java|grep zookeeper|grep -v grep|awk '{print $2}' '''
        proc = subprocess.Popen(pidarg, shell=True, stdout=subprocess.PIPE,
                                universal_newlines=True)
        # communicate() also waits for the child, so no zombie is left behind.
        out = proc.communicate()[0]
        lines = out.splitlines()
        return lines[0].strip() if lines else ''

    def send_to_zabbix(self, metric):
        """Send ``self._result[metric]`` to Zabbix via zabbix_sender.

        The item key is ``zookeeper.status[<metric>]``.  When the module
        level switch ``send_to_zabbix`` is not positive, the command is only
        printed instead of executed.
        """
        # e.g. key = zookeeper.status[zk_max_file_descriptor_count]
        key = "zookeeper.status[" + metric + "]"
        value = str(self._result[metric])

        if send_to_zabbix > 0:
            try:
                # Discard zabbix_sender's own output so that it does not mix
                # with what this script prints.
                with open(os.devnull, 'w') as devnull:
                    subprocess.call(
                        [zabbix_sender, "-c", zabbix_conf,
                         "-k", key, "-o", value],
                        stdout=devnull, stderr=devnull, shell=False)
            except OSError as detail:
                print("Something went wrong while exectuting zabbix_sender :  %s"
                      % detail)
        else:
            print("Simulation: the following command would be execucted :\n"
                  "%s -c %s -k %s -o %s \n"
                  % (zabbix_sender, zabbix_conf, key, value))


def usage():
    """Display program usage and exit with status 1."""
    print(" ".join(["\nUsage : ", sys.argv[0], " alive|all"]))
    print("Modes : \n\talive : Return pid of running zookeeper"
          "\n\tall : Send zookeeper stats as well")
    sys.exit(1)


def main():
    """Run the mode given on the command line (``alive`` or ``all``)."""
    if len(sys.argv) == 2 and sys.argv[1] in accepted_modes:
        mode = sys.argv[1]
    else:
        usage()  # exits

    zk = ZooKeeperServer()
    pid = zk.get_pid()

    if pid != "" and mode == 'all':
        zk.get_stats()
        print(zk._result)
        for key in zk._result:
            zk.send_to_zabbix(key)
        print(pid)
    elif pid != "" and mode == "alive":
        print(pid)
    else:
        # No ZooKeeper process found.
        print(0)


if __name__ == '__main__':
    main()
增加脚本可执行权限
chmod +x /etc/zabbix/scripts/zookeeper.py
zabbix配置文件
vim /etc/zabbix/zabbix_agentd.d/check_zookeeper.conf
UserParameter=zookeeper.status[*],/usr/bin/python /etc/zabbix/scripts/zookeeper.py $1
重新启动zabbix-agent服务
service zabbix-agent restart
制作Zabbix监控ZooKeeper的模板并设置报警阈值
zookeeper.xml(一定是zabbix采集器的方式)
<?xml version="1.0" encoding="UTF-8"?> <zabbix_export> <version>3.0</version> <date>2017-12-11T08:02:58Z</date> <groups> <group> <name>Zabbix servers</name> </group> </groups> <templates> <template> <template>Zookeeper</template> <name>Zookeeper</name> <description/> <groups> <group> <name>Zabbix servers</name> </group> </groups> <applications> <application> <name>ZooKeeper Status</name> </application> </applications> <items> <item> <name>zookeeper pid</name> <type>2</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>zookeeper.status[alive]</key> <delay>10</delay> <history>90</history> <trends>365</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>ZooKeeper Status</name> </application> </applications> <valuemap/> <logtimefmt/> </item> <item> <name>zookeeper approximate data size</name> <type>2</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>zookeeper.status[zk_approximate_data_size]</key> <delay>0</delay> <history>90</history> <trends>365</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> 
<privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>ZooKeeper Status</name> </application> </applications> <valuemap/> <logtimefmt/> </item> <item> <name>zookeeper average latency</name> <type>2</type> <snmp_community/> <multiplier>0</multiplier> <snmp_oid/> <key>zookeeper.status[zk_avg_latency]</key> <delay>0</delay> <history>90</history> <trends>365</trends> <status>0</status> <value_type>3</value_type> <allowed_hosts/> <units/> <delta>0</delta> <snmpv3_contextname/> <snmpv3_securityname/> <snmpv3_securitylevel>0</snmpv3_securitylevel> <snmpv3_authprotocol>0</snmpv3_authprotocol> <snmpv3_authpassphrase/> <snmpv3_privprotocol>0</snmpv3_privprotocol> <snmpv3_privpassphrase/> <formula>1</formula> <delay_flex/> <params/> <ipmi_sensor/> <data_type>0</data_type> <authtype>0</authtype> <username/> <password/> <publickey/> <privatekey/> <port/> <description/> <inventory_link>0</inventory_link> <applications> <application> <name>ZooKeeper Status</name> </以上是关于zabbix 监控zookeeper篇的主要内容,如果未能解决你的问题,请参考以下文章