zabbix短信报警统计以及报表展示
Posted MySQLDBA笔记
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了zabbix短信报警统计以及报表展示相关的知识,希望对你有一定的参考价值。
对于用过zabbix的用户来说,报警短信可能并不陌生,不知道大家有没有经历过这种场景:
当出现问题后,针对问题进行了临时处理,等到有时间了,却忘记了需要跟进问题最终解决,结果下次还会有报警;
或者等有空了,想针对问题进行研究根本解决时,却发现当初的报警信息已经被其他各种报警淹没,具体时间根本无法提供,相信不少人经历过某个问题导致短信轰炸的悲剧。
针对这些问题,将每周报警短信统计成报表,可以针对一些问题与业务跟进,明确后续的优化方向等。
一、实现
实现原理如下图:
其中核心部分zbx_statis,其实就是我编写的一个python脚本,它会从zabbix DB中查询过去一周的所有报警信息,按不同维度统计生成每周的报表并上传到公司的git上,同时将一条汇总记录插入到cmdb的库表中用于展示。
二、格式依赖
报表的分析统计可以分两个维度:
报警类型维度
业务维度
不管从哪个维度进行的统计,都需要一个前提:报警格式规范化。
针对报警内容的需求,我们对zabbix的trigger名称、主机名hostname等进行了规范化。
举例:
[17][15:31:04][productname-test-mysql-00][PROBLEM][005][cpu idle too low (<30%)][0.10 %][负责人:张学岩][15:31:07]
productname-test-mysql-00 是主机名,按业务等级进行命名,用于报警统计的业务维度统计;
cpu idle too low (<30%) 是报警的类型,可以据此项进行类型维度的统计;
通过维护一个主要业务列表,然后根据hostname匹配可以从业务维度进行统计; 通过将报警类型规范化,用固定的格式放在报警信息的固定位置,可以按类型进行统计。
三、报表展示
以下是截取的部分报表的展示。
按报警类型维度:
最下面的详细信息跳转即业务维度的统计。
按业务维度:
CMDB统计图表:
很直观地展示每周的报警数量,如果优化比较好的话,会看到整体应该是下降的趋势。
四、附件
上文提到的报警统计 python 脚本,写的时间比较久了,现在看内容还是比较杂乱,我也懒得改了,放出来供大家参考,内容如下:
#!/usr/bin/env python26
# encoding: utf-8
import MySQLdb
import traceback
import copy
import datetime
import time
import operator
import sys
reload(sys)
sys.setdefaultencoding( "utf-8" )
# Connection settings of the Zabbix database the alerts are read from.
HOST = 'zabbix_db_host'
DB = 'zabbix'
PORT = 3306
# Upper bound of the connection-attempt loop in Connection.get_connection.
RETRY_TIMES = 3
# Business groups. An alert belongs to the first group whose name occurs in
# its subject line (see alert_statistic).
GROUP_TYPE = [
    'A',
    'B',
    'C',
    '...',
    'X',
    'Y',
]
# Directory (relative to the working directory) the markdown reports go to.
BASE_DIR = 'alerts_statistic/'
# Reporting window: from last week's Monday (inclusive) to this week's Monday
# (exclusive), both formatted as YYYYMMDD.  The clock is read exactly once so
# that the two bounds can never disagree when the script runs across midnight.
_NOW = datetime.datetime.now()
_THIS_MONDAY = _NOW - datetime.timedelta(days=_NOW.weekday())
START_TIME = (_THIS_MONDAY - datetime.timedelta(days=7)).strftime("%Y%m%d")
END_TIME = _THIS_MONDAY.strftime("%Y%m%d")
# Alert totals; assigned by write_group and read when the CMDB row is built.
DAY_SUM = 0
NIGHT_SUM = 0
class Connection:
    """Hold MySQLdb connection arguments and open a connection on demand.

    Positional and keyword arguments are forwarded unchanged to
    ``MySQLdb.connect``, except that ``user``, ``passwd`` and
    ``connect_timeout`` are always overwritten and ``port`` / ``db`` get
    defaults when the caller does not supply them.
    """

    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
        # NOTE(review): credentials are hard-coded and override anything the
        # caller passes; consider loading them from configuration instead.
        self.kwargs['user'] = "user"
        self.kwargs['passwd'] = "password"
        self.kwargs.setdefault('port', 3306)
        self.kwargs.setdefault('db', "information_schema")
        self.kwargs['connect_timeout'] = 1

    def get_connection(self):
        """Try to connect, making up to RETRY_TIMES attempts.

        Returns:
            dict with keys ``errno`` (0 on success, -1 on failure),
            ``errmsg`` (host name followed by the last error text on failure)
            and ``value`` (the connection object, or None on failure).
        """
        ret = {"errno": 0, 'errmsg': "", 'value': None}
        last_error = None
        for _ in range(RETRY_TIMES):
            # The try sits inside the loop so that a failed attempt actually
            # leads to another attempt instead of aborting the whole loop.
            try:
                ret['value'] = MySQLdb.connect(*self.args, **self.kwargs)
                return ret
            except Exception as err:
                # Deliberately broad: connecting is best-effort and failures
                # are reported to the caller through ``errno``.
                last_error = err
                traceback.print_exc()
        ret['errno'] = -1
        # ``host`` may have been passed positionally, hence ``get``.
        ret['errmsg'] = str(self.kwargs.get('host', '')) + str(last_error)
        return ret
def create_connection(*args, **kwargs):
    """Open a MySQL connection through ``Connection``.

    All arguments are handed to ``Connection`` as they are.

    Returns:
        The ``value`` entry of the result produced by
        ``Connection.get_connection`` or None when that result carries a
        non-zero ``errno``.
    """
    outcome = Connection(*args, **kwargs).get_connection()
    return None if outcome['errno'] else outcome['value']
def get_alert():
    """Fetch the alerts sent between START_TIME and END_TIME from Zabbix.

    Only rows with ``alerttype=0`` are read, subjects containing ``test-`` or
    ``-test`` are excluded, and rows are grouped by subject and ordered by
    clock.

    Returns:
        The rows returned by ``cursor.fetchall()``, each made of
        ``(from_unixtime(clock), subject)``; None when no connection could be
        opened or the query failed (the traceback is printed in that case).
    """
    stamp_format = "%Y%m%d %H:%M:%S"
    start_timestamp = int(time.mktime(
        datetime.datetime.strptime(START_TIME + ' 00:00:00', stamp_format).timetuple()))
    end_timestamp = int(time.mktime(
        datetime.datetime.strptime(END_TIME + ' 00:00:00', stamp_format).timetuple()))
    conn = None
    try:
        conn = create_connection(host=HOST, db=DB, port=PORT, charset='utf8')
        if not conn:
            return None
        # Both placeholders are integers computed above, never external
        # input, so formatting them into the statement is safe here.
        sql = (
            "select from_unixtime(a.clock),a.subject "
            "from alerts a,events b left join triggers c on b.objectid=c.triggerid "
            "where a.eventid=b.eventid and a.alerttype=0 "
            "and a.subject not like '%test-%' and a.subject not like '%-test%' "
            "and a.clock>={start_time} and a.clock<{end_time} "
            "group by a.subject order by a.clock"
        ).format(start_time=start_timestamp, end_time=end_timestamp)
        print(sql)
        cursor = conn.cursor()
        try:
            cursor.execute(sql)
            return cursor.fetchall()
        finally:
            cursor.close()
    except Exception:
        # Still best-effort (the caller treats None as "no data"), but the
        # failure is no longer invisible.
        traceback.print_exc()
        return None
    finally:
        if conn:
            conn.close()
def alert_statistic(alert_list):
result = {}
alerts_list = []
if alert_list:
for alert in alert_list:
alerts_list.append(alert)
for group in GROUP_TYPE:
alerts = []
alerts_list2 = copy.copy(alerts_list)
for alert in alerts_list2:
if group in alert[1]:
alerts.append(alert)
alerts_list.remove(alert)
result[group] = alerts
result['other'] = alerts_list
result['status'] = 0
else:
result['status'] = 1
return result
def write_to_file(result, day):
    """Write the detailed per-group alert report as a markdown file.

    The file is ``BASE_DIR/detail/<START_TIME>-<END_TIME>_<day>.md``. It holds
    a heading, the total number of alerts, and one table per non-empty group
    listing subject and time of every alert.

    Args:
        result: dict mapping group name to a list of ``(time, subject)`` rows;
            a ``'status'`` key, if present, is ignored.
        day: label used in the file name and the heading.

    Nothing is written when ``result`` is empty or None.
    """
    if not result:
        return
    # 'status' is a flag, not a group: keep it out of both the total and the
    # per-group sections.
    groups = [name for name in result.keys() if name != 'status']
    alert_sum = sum(len(result[name]) for name in groups)
    file_name = BASE_DIR + 'detail/' + START_TIME + '-' + END_TIME + '_' + day + '.md'
    with open(file_name, 'w') as writer:
        writer.write('## ' + START_TIME + ' - ' + END_TIME + ': ' + day)
        writer.write('\n\n**短信总数:' + str(alert_sum) + '**')
        for group in groups:
            alerts = result[group]
            if not alerts:
                continue
            writer.write('\n\n### ' + group + '(' + str(len(alerts)) + ')' + '\n\n')
            writer.write('|报警内容|报警时间|\n|---|---|\n')
            for alert in alerts:
                writer.write('|' + str(alert[1]) + '|' + str(alert[0]) + '|\n')
def day_night_split(result, day='light'):
    """Keep only the daytime or only the night-time alerts of every group.

    An alert counts as daytime when the hour of its time (``alert[0]``) is 7
    or later, and as night-time when the hour is below 7.

    Args:
        result: dict mapping group name to a list of ``(time, subject)`` rows;
            the ``'status'`` key is skipped.
        day: ``'light'`` keeps daytime alerts, ``'night'`` keeps night-time
            alerts; any other value keeps every alert.

    Returns:
        A new dict with the same group keys (without ``'status'``) and
        filtered lists; ``{}`` when ``result`` is empty or None.
    """
    day_start_hour = 7
    results = {}
    if not result:
        return results
    for group in result.keys():
        if group == 'status':
            continue
        alerts = result[group]
        if day == 'light':
            kept = [alert for alert in alerts if alert[0].hour >= day_start_hour]
        elif day == 'night':
            kept = [alert for alert in alerts if alert[0].hour < day_start_hour]
        else:
            kept = list(alerts)
        results[group] = kept
    return results
def _parse_alert_subject(subject):
    """Return ``(alert_type, hostname)`` extracted from one alert subject.

    The subject is split on ``']['``. With exactly 9 fields (the layout that
    carries a trigger id) the type is field 5, otherwise field 4; only the
    text before the first comma is kept. The host name is field 2. A subject
    too short to hold the type yields ``'unknown'``; a missing host name
    yields None.
    """
    fields = subject.split('][')
    type_index = 5 if len(fields) == 9 else 4
    if len(fields) > type_index:
        alert_type = fields[type_index].split(',')[0]
    else:
        alert_type = 'unknown'
    hostname = fields[2] if len(fields) > 2 else None
    return alert_type, hostname


def alert_groupby(alert_list):
    """Aggregate alerts by alert type.

    Args:
        alert_list: dict mapping group name to a list of ``(time, subject)``
            rows; the ``'status'`` key is skipped.

    Returns:
        A list of dicts with keys ``'type'``, ``'hostlist'`` (comma-joined
        distinct host names in first-seen order) and ``'count'`` (a string),
        sorted by count in descending order.
    """
    counts = {}
    hosts = {}
    type_order = []
    for group in alert_list.keys():
        if group == 'status':
            continue
        for alert in alert_list[group]:
            # Every alert is classified exactly once, so the counts always
            # add up to the number of alerts.
            alert_type, hostname = _parse_alert_subject(alert[1])
            if alert_type not in counts:
                counts[alert_type] = 0
                hosts[alert_type] = []
                type_order.append(alert_type)
            counts[alert_type] += 1
            if hostname is not None and hostname not in hosts[alert_type]:
                hosts[alert_type].append(hostname)
    group_list = []
    for alert_type in type_order:
        group_list.append({
            'type': alert_type,
            'hostlist': ",".join(hosts[alert_type]),
            'count': str(counts[alert_type]),
        })
    group_list.sort(key=lambda entry: int(entry['count']), reverse=True)
    return group_list
def _write_group_section(writer, title, groups, detail_file):
    """Write one summary section (total, table, detail link); return the total."""
    alert_sum = sum(int(group['count']) for group in groups)
    writer.write('### ' + title + str(alert_sum) + '\n\n')
    writer.write("|报警类型|报警数量|报警主机|\n|---|---|---|\n")
    for group in groups:
        writer.write("|" + group['type'] + "|" + group['count'] + "|" + group['hostlist'] + "|\n")
    writer.write("\n[详细报警信息](detail/" + detail_file + ")\n\n")
    return alert_sum


def write_group(group_light, group_night):
    """Write the weekly summary report and record the day/night totals.

    The report goes to ``BASE_DIR/<START_TIME>-<END_TIME>.md`` and contains a
    daytime and a night-time section, each with a per-type table and a link
    to the matching detail file.

    Args:
        group_light: per-type entries for daytime, as built by alert_groupby.
        group_night: per-type entries for night-time, same shape.

    Side effects:
        Sets the module-level DAY_SUM and NIGHT_SUM to the section totals.
        Nothing is written or changed when both lists are empty.
    """
    global DAY_SUM, NIGHT_SUM
    # A week with alerts in only one half of the day must still be reported;
    # only skip when there is nothing at all.
    if not (group_light or group_night):
        return
    file_name = BASE_DIR + START_TIME + '-' + END_TIME + '.md'
    file_detail_light = START_TIME + '-' + END_TIME + '_白天' + '.md'
    file_detail_night = START_TIME + '-' + END_TIME + '_夜间' + '.md'
    with open(file_name, 'w') as writer:
        writer.write('## ' + START_TIME + '-' + END_TIME + '\n\n')
        DAY_SUM = _write_group_section(writer, '白天:', group_light, file_detail_light)
        NIGHT_SUM = _write_group_section(writer, '夜间:', group_night, file_detail_night)
def write_trend(sql, params=None):
    """Execute one statement against the CMDB database and commit it.

    Args:
        sql: the statement to run; it is printed before execution.
        params: optional values for the statement's placeholders, passed to
            ``cursor.execute`` so the driver does the quoting. Prefer this
            over building the statement by string concatenation.

    Returns:
        The result of ``cursor.fetchall()``; None when no connection could be
        opened or an error occurred (the error is printed).
    """
    host = 'cmdb_host'
    db = 'cmdb_db'
    port = 3306
    conn = None
    try:
        conn = create_connection(host=host, db=db, port=port, charset='utf8')
        if not conn:
            return None
        print(sql)
        cursor = conn.cursor()
        try:
            cursor.execute(sql, params)
            ret = cursor.fetchall()
        finally:
            cursor.close()
        conn.commit()
        return ret
    except Exception as e:
        print(e)
        return None
    finally:
        # Close the connection on every path, including failures.
        if conn:
            conn.close()
def git_push():
    """Commit and push the generated reports with the git command line.

    Runs a single shell command chain; when it exits with a non-zero status
    (which includes "nothing to commit") a message is printed instead of the
    failure being ignored.
    """
    import os
    # NOTE(review): the directory here is 'alerts_statis' while the reports
    # are written below BASE_DIR ('alerts_statistic/') — confirm the path.
    status = os.system("cd alerts_statis && git add alerts_statis && git commit -m 'update' && git push")
    if status != 0:
        print('git push failed, exit status: ' + str(status))
if __name__ == '__main__':
    # Load last week's alerts and bucket them by business group.
    result = alert_statistic(get_alert())
    if result['status'] != 0:
        print('There\'s no alert warning or something error.')
    else:
        # Daytime alerts: split, then aggregate by alert type.
        result_day = day_night_split(result, 'light')
        light_alert = alert_groupby(result_day)
        # Night-time alerts: split, then aggregate by alert type.
        result_night = day_night_split(result, 'night')
        night_alert = alert_groupby(result_night)
        # Summary report first (it also sets DAY_SUM / NIGHT_SUM), then the
        # two detail reports.
        write_group(light_alert, night_alert)
        write_to_file(result_day, '白天')
        write_to_file(result_night, '夜间')
        git_push()
        # One summary row per week for the CMDB trend chart.
        counts = ','.join([str(DAY_SUM + NIGHT_SUM), str(DAY_SUM), str(NIGHT_SUM)])
        sql = 'insert into alerts (start_time,end_time,all_count,day_count,night_count)values("' + START_TIME + '","' + END_TIME + '",' + counts + ');'
        write_trend(sql)
End
以上是关于zabbix短信报警统计以及报表展示的主要内容,如果未能解决你的问题,请参考以下文章