[时间维度]日志数据提取事件关键词，解析对应时间点计数，matplotlib绘制统计图，python

Posted 2022-09-15 zhangphil

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了[时间维度]日志数据提取事件关键词，解析对应时间点计数，matplotlib绘制统计图，python相关的知识，希望对你有一定的参考价值。

[时间维度]日志数据提取事件关键词，解析对应时间点计数，matplotlib绘制统计图，python（2）

[时间维度]日志数据提取事件关键词，解析对应时间点计数，matplotlib绘制统计图，python_zhangphil的博客-CSDN博客 （2）因为数据文件中的行数据是海量的，也意味着高价值事件很多，同样，这条事件对应的时间点也很多。需要把事件发生时间划入到不同的时间段（时间区间），比如（11，12）代表11点到12点，（1，2）代表凌晨1点到2点。（1）数据文件中，对每一行数据模糊查找是否包含了所要寻找的关键词，如果该行以一定的概率（>80%可能性）存在目标关键词，那么该行即认定为目标高价值数据事件行，然后再从该行中提取事件时间点。比如（9，10），计数5，表示发生在9点到10点（不包含10点）的事件为5件。............... https://blog.csdn.net/zhangphil/article/details/125923359

对上篇的技术实现加以改进：

（1）日期匹配把原先的中文字符年月日时替换掉，用标准的parse匹配。

（2）不再自己实现统计，而是用Python标准库的collections.Counter统计事件次数。

（3）关键词搜索支持一次性多个匹配查找。

import re

import matplotlib
import matplotlib.pyplot as plt

import dateutil.parser as ps

from collections import Counter

from pprint import pp

import pandas as pd
from fuzzywuzzy import fuzz

# Path of the data file that read_file() opens (UTF-8 text, read line by line).
# NOTE(review): the literal looks like a placeholder to be replaced by a real path.
FILE_PATH = r'数据文件路径'
# Keywords searched for in every line; read_file() stops at the first keyword
# whose fuzzy score reaches the threshold.
KEYS = [r'关键词1', r'关键词2', r'关键词3']
# Minimum fuzz.partial_ratio score for a line to be treated as a match.
threshold = 90


def change_str(date_s):
    """Rewrite a Chinese-style timestamp into a form a generic date parser accepts.

    The year and month markers become '-', the day marker becomes a space,
    the hour marker becomes ':' and the minute marker is dropped, e.g.
    '2022年9月15日10时30分' -> '2022-9-15 10:30'.

    :param date_s: timestamp text containing the 年/月/日/时/分 markers.
    :return: the text with every marker substituted.
    """
    # Every marker is a single character and no replacement text contains a
    # marker, so one translation pass equals the chain of replace() calls.
    marker_table = str.maketrans({
        '年': '-',
        '月': '-',
        '日': ' ',
        '时': ':',
        '分': '',
    })
    return date_s.translate(marker_table)


def read_file():
    """Scan FILE_PATH and collect the timestamps of lines that match a keyword.

    Each line is compared against every entry of KEYS with
    fuzz.partial_ratio; the first keyword scoring at least ``threshold``
    marks the line as a hit. For a hit, a timestamp of the form
    ``xxxx年xx月xx日xx时xx分`` is searched for in the line, normalised with
    change_str() and parsed with dateutil. A progress report is printed for
    every hit, whether or not a timestamp could be extracted.

    :return: list of datetime objects, one per hit whose timestamp was
        found and parsed successfully, in file order.
    """
    # Compiled once, outside the loop. Matches e.g. 2022年9月15日10时30分.
    time_pattern = re.compile(r'\d{4}年\d{1,2}月\d{1,2}日\d{1,2}时\d{1,2}分')
    report_template = '第{number}行,相似度{ratio},时间:{case_time} - {content}'

    all_case_time = []
    case_count = 0

    # The context manager closes the file even if a line raises unexpectedly.
    with open(FILE_PATH, 'r', encoding='UTF-8') as file:
        for cnt, line in enumerate(file, start=1):
            for k in KEYS:
                pr = fuzz.partial_ratio(line, k)
                if pr < threshold:
                    continue

                print('-----')
                print(f'第{case_count}件')
                case_count = case_count + 1

                mat = time_pattern.search(line)
                if mat is None:
                    # Hit line without a recognisable timestamp.
                    t_str = '-解析异常-'
                else:
                    t_str = change_str(mat.group())
                    try:
                        all_case_time.append(ps.parse(t_str))
                    except (ValueError, OverflowError):
                        # dateutil signals unparseable or out-of-range dates
                        # with these; the hit is reported but not counted.
                        print('解析日期失败')

                pp(report_template.format(number=cnt, ratio=pr,
                                          case_time=t_str, content=line))

                # One matching keyword is enough; do not count the line twice.
                break

    return all_case_time


def date_to_points():
    """Bucket every extracted event time into its one-hour interval and chart it.

    Reads the event datetimes via read_file(), maps each one to the tuple
    ``(hour, hour + 1)`` that contains its hour (upper bound exclusive), and
    hands the resulting list of interval tuples to to_chart().
    """
    # The 24 half-open hour intervals, keyed by their starting hour.
    interval_by_hour = {start: (start, start + 1) for start in range(24)}

    event_times = read_file()
    points = [
        interval_by_hour[moment.hour]
        for moment in event_times
        if moment.hour in interval_by_hour
    ]

    to_chart(points)


def draw_chart(df):
    """Draw the rows of *df* as a horizontal bar chart and show it.

    For every row, column 0 is used as the y-axis tick label and column 1
    (converted to int) as the bar length. The row's index value is written
    in white at the middle of the bar and the bar length in green at its
    right end.

    :param df: data frame whose first two columns are label and count.
    """
    label_font = matplotlib.font_manager.FontProperties(fname='C:\\Windows\\Fonts\\msyh.ttc')
    plt.rcParams['axes.unicode_minus'] = False  # so that minus signs render properly
    plt.rc('font', family='YaHei', weight='bold')

    rows = df.values
    order = [idx for idx in df.index]
    name = [row[0] for row in rows]
    mem = [int(row[1]) for row in rows]

    base_size = 12
    slots = range(len(name))

    _fig, ax = plt.subplots(figsize=(15, 13))

    bars = ax.barh(y=slots, width=mem, align='center', color='red')

    # Annotate each bar: its value at the right end, its rank in the middle.
    for rank, rect in zip(order, bars):
        bar_len = rect.get_width()
        mid_y = rect.get_y() + rect.get_height() / 2
        ax.text(x=bar_len, y=mid_y, s='%d' % (int(bar_len)),
                horizontalalignment='left', verticalalignment='center',
                fontproperties=label_font, fontsize=base_size - 2, color='green')
        ax.text(x=bar_len / 2, y=mid_y, s=str(rank),
                horizontalalignment='center', verticalalignment='center',
                fontproperties=label_font, fontsize=base_size - 3, color='white')

    ax.set_yticks(slots)
    ax.set_yticklabels(name, fontsize=base_size - 1, fontproperties=label_font)

    # First row at the top.
    ax.invert_yaxis()

    ax.set_xlabel('数据样本', fontsize=base_size + 2, fontproperties=label_font)
    ax.set_title('不同时间段的数据点总量排名', fontsize=base_size + 5, fontproperties=label_font)

    # No tick labels on the x axis.
    plt.xticks(())

    # Keep the y axis visible but remove the frame lines on all four sides.
    ax.get_yaxis().set_visible(True)
    for side in ("left", "top", "right", "bottom"):
        ax.spines[side].set_visible(False)

    plt.subplots_adjust(left=0.15)  # widen the left margin

    ax.set_aspect('auto')

    plt.show()


def to_chart(points):
    """Count how often each interval occurs in *points* and plot the ranking.

    The (up to) 24 most common intervals are tallied with Counter, printed
    together with their total, assembled into a two-column data frame sorted
    by count in descending order with a 1-based index, and passed to
    draw_chart(). The first 20 rows of the frame are also printed.

    :param points: iterable of hashable interval values, e.g. (hour, hour + 1).
    """
    ranked = Counter(points).most_common(24)
    print('max_count', ranked)

    # One [interval, count] row per ranked entry.
    pd_data = [[interval, hits] for interval, hits in ranked]
    total = sum(hits for _interval, hits in ranked)
    print('总计', total)

    col = ['时间段', '时间点次数']
    df = pd.DataFrame(data=pd_data, columns=col)
    df = df.sort_values(by=col[1], axis=0, ascending=False)  # descending by count

    # Renumber the rows so the ranking starts at 1 instead of 0.
    df = df.reset_index(drop=True)
    df.index = df.index + 1

    # Top 20 rows.
    pp(df.head(20))

    draw_chart(df)


# Script entry point: date_to_points() reads the file, buckets the event
# times by hour and draws the chart.
if __name__ == '__main__':
    date_to_points()

输出：

total 1376

...

以上是关于[时间维度]日志数据提取事件关键词，解析对应时间点计数，matplotlib绘制统计图，python的主要内容，如果未能解决你的问题，请参考以下文章

[时间维度]日志数据提取事件关键词，解析对应时间点计数，matplotlib绘制统计图，python

[星期维度]日志数据提取事件关键词，解析对应日期的星期计数，matplotlib绘制统计图，python

[月份维度]日志数据提取包含关键词的事件，解析落入的月份计数，matplotlib绘制统计图，python

数据仓库建模-维度建模