大数据(8r)分时段点击量统计待完成
Posted 小基基o_O
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了大数据(8r)分时段点击量统计待完成相关的知识,希望对你有一定的参考价值。
文章目录
1、实时计算:最近1小时每分钟点击量统计
1.1、Python实现
import datetime
from random import choice, randint
from time import sleep
from time import strftime
from collections import Counter
# In-memory stand-ins for the pieces of the streaming demo.
queue = list()  # message queue: producer appends, consumer pops from the front
db_pv = Counter()  # computed PV: minute label -> number of clicks
db_uv = {}  # computed UV: minute label -> set of visitor names
def last_hour(minute=60, now=None):
    """Yield the label of each of the last ``minute`` minutes, oldest first.

    Labels are formatted as ``'%Y-%m-%d %H:%M'``. The window ends at the
    minute before ``now``; the minute that contains ``now`` is not yielded.

    Args:
        minute: How many minutes to look back (60 covers the last hour).
        now: Reference ``datetime.datetime``. Defaults to the current local
            time, read when iteration starts.

    Yields:
        One minute label per step, in ascending time order.
    """
    if now is None:
        now = datetime.datetime.now()
    for offset in range(minute, 0, -1):
        yield (now - datetime.timedelta(minutes=offset)).strftime('%Y-%m-%d %H:%M')
def pv(name, page, t, db=None):
    """Count one page view in the per-minute PV store.

    Args:
        name: Visitor name (not used by this function).
        page: Clicked page (not used by this function).
        t: Click time as a ``'%Y-%m-%d %H:%M:%S'`` string.
        db: Counter-like mapping to update. Defaults to the module-level
            ``db_pv``.

    Raises:
        ValueError: If ``t`` does not match the expected format.
    """
    # An empty Counter is falsy, so compare against None explicitly.
    if db is None:
        db = db_pv
    clicked_at = datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
    # Dropping the seconds gives the minute label used as the bucket key.
    db[clicked_at.strftime('%Y-%m-%d %H:%M')] += 1
def uv(name, page, t, db=None):
    """Record one visitor in the per-minute UV store.

    Each minute label maps to the set of visitor names seen in that minute,
    so repeated clicks by the same visitor within a minute are stored once.

    Args:
        name: Visitor name to record.
        page: Clicked page (not used by this function).
        t: Click time as a ``'%Y-%m-%d %H:%M:%S'`` string.
        db: Dict mapping minute label to a set of names. Defaults to the
            module-level ``db_uv``.

    Raises:
        ValueError: If ``t`` does not match the expected format.
    """
    # An empty dict is falsy, so compare against None explicitly.
    if db is None:
        db = db_uv
    clicked_at = datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
    minute_label = clicked_at.strftime('%Y-%m-%d %H:%M')
    # setdefault creates the minute's set on first sight, then returns it.
    db.setdefault(minute_label, set()).add(name)
def last_hour_pv_print(minute=60):
    """Print the PV of each of the last ``minute`` minutes, once per minute.

    The table is printed when the wall clock's seconds field reads 0. This
    function loops forever and never returns.

    Args:
        minute: Number of trailing minutes to include in each printed table.
    """
    while True:
        now = datetime.datetime.now()
        if now.second != 0:
            # Sleep up to the next minute boundary instead of spinning on the
            # clock; the loop re-checks the time after waking.
            sleep(60 - now.second - now.microsecond / 1e6)
            continue
        # '\033[93m' ... '\033[0m' are ANSI escapes that colour the output.
        print('\033[93m' + 'PV'.center(50, '-') + '\033[0m')
        for hm in last_hour(minute):
            print('\033[93m', hm, db_pv.get(hm, 0), '\033[0m')
        # Skip the rest of this second-0 window so the table prints once.
        sleep(59)
def last_hour_uv_print(minute=60):
    """Print the UV of each of the last ``minute`` minutes, once per minute.

    The table is printed when the wall clock's seconds field reads 1. This
    function loops forever and never returns.

    Args:
        minute: Number of trailing minutes to include in each printed table.
    """
    while True:
        now = datetime.datetime.now()
        if now.second != 1:
            # Sleep up to the next time the seconds field reads 1 instead of
            # spinning on the clock; the loop re-checks the time after waking.
            sleep((1 - now.second) % 60 - now.microsecond / 1e6)
            continue
        # '\033[93m' ... '\033[0m' are ANSI escapes that colour the output.
        print('\033[93m' + 'UV'.center(50, '-') + '\033[0m')
        for hm in last_hour(minute):
            # UV for a minute is the number of distinct names stored for it.
            print('\033[93m', hm, len(db_uv.get(hm, set())), '\033[0m')
        # Skip the rest of this one-second window so the table prints once.
        sleep(59)
def producer(second=9):
    """Produce random click records forever; a smaller ``second`` is faster.

    Each record is a ``(name, page, timestamp)`` tuple of strings. It is
    appended to ``queue`` and echoed to stdout, after which the function
    sleeps a random whole number of seconds in ``[0, second]``.

    Args:
        second: Upper bound, in seconds, of the pause between two records.
    """
    # Repeated entries make some names and pages more likely to be drawn.
    names = ('剑圣', '剑圣', '剑圣', '先知', '先知', '巫妖')
    pages = ('page1', 'page1', 'page2')
    while True:
        who = choice(names)
        where = choice(pages)
        when = strftime('%Y-%m-%d %H:%M:%S')
        record = (who, where, when)
        queue.append(record)
        print('生产:' + ','.join(record))
        pause = randint(0, second)
        sleep(pause)
def consumer(second=0):
    """Consume queued click records forever; a smaller ``second`` is faster.

    Drains ``queue`` from the front, echoes each record to stdout and feeds
    it to ``pv`` and ``uv``, then sleeps ``second`` seconds before checking
    the queue again. Never returns.

    Args:
        second: Pause, in seconds, between two drains of the queue.
    """
    while True:
        while queue:
            name, page, t = queue.pop(0)
            # '\033[35m' ... '\033[0m' are ANSI escapes that colour the output.
            print('\033[35m消费', (name, page, t), '\033[0m')
            pv(name, page, t)
            uv(name, page, t)
        sleep(second)
if __name__ == '__main__':
    from threading import Thread

    # (worker, positional args) pairs, listed in the order they are started.
    workers = (
        (producer, (3,)),
        (consumer, (9,)),
        (last_hour_pv_print, (5,)),
        (last_hour_uv_print, (5,)),
    )
    for target, args in workers:
        Thread(target=target, args=args).start()
"""
待解决:
1、db_uv存的不是最终数值
2、计算值有延时
"""
1.2、Spark实现
2、离线计算:昨天每小时点击量
放个excel折线图
2.1、Python实现
import datetime
# Sample click log: one "name,page,timestamp" record per line.
# splitlines() breaks the text on real line endings, giving one list item
# per record.
lines = '''
剑圣,page1,2021-3-3 07:01:01
剑圣,page1,2021-3-3 07:01:01
剑圣,page2,2021-3-3 07:01:01
先知,page1,2021-3-3 08:01:01
巫妖,page1,2021-3-3 08:01:01
巫妖,page1,2021-3-3 08:01:01
剑圣,page1,2021-3-3 09:01:01
先知,page1,2021-3-3 11:01:01
剑圣,page2,2021-3-3 11:01:01
先知,page1,2021-3-3 13:01:01
守望者,page1,2021-3-4 01:01:01
'''.strip().splitlines()
# Hourly PV for yesterday: one zero-filled counter per hour of the day.
# NOTE: the sample rows are dated 2021-3-3 and 2021-3-4, so every bucket
# stays 0 unless the script runs on the day after one of those dates.
yesterday = datetime.date.today() - datetime.timedelta(days=1)
hours = dict.fromkeys(range(24), 0)
for line in lines:
    name, page, t = line.split(',')
    t = datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
    # Only clicks dated yesterday are counted.
    if t.date() == yesterday:
        hours[t.hour] += 1
print(hours)
# Hourly UV for yesterday: collect the distinct visitor names seen in each
# hour, then reduce every set to its size.
yesterday = datetime.date.today() - datetime.timedelta(days=1)
visitors = {h: set() for h in range(24)}
for line in lines:
    name, page, t = line.split(',')
    t = datetime.datetime.strptime(t, '%Y-%m-%d %H:%M:%S')
    # Only clicks dated yesterday are counted.
    if t.date() == yesterday:
        visitors[t.hour].add(name)
hours = {h: len(seen) for h, seen in visitors.items()}
print(hours)
2.2、Spark实现
2.3、HIVE实现
创建数据
每小时PV
每小时UV
3、补充:
3.1、PV和UV
概念 | 全称 | 译名 |
---|---|---|
PV | Page View | 页面浏览量 |
UV | Unique Visitor | 独立访客 |
3.2、日期维度SQL
HIVE创建数据
-- Drop and recreate the demo database.
-- CASCADE also drops the tables inside b1. Hive's default (RESTRICT) refuses
-- to drop a non-empty database, so without it a re-run of this script fails
-- once t1 exists.
DROP DATABASE IF EXISTS b1 CASCADE;
CREATE DATABASE b1;
USE b1;
-- Create the table: one row per page per click date.
CREATE TABLE t1(
page STRING COMMENT "点击页面",
clicks INT COMMENT "点击数",
click_date DATE COMMENT "点击日期");
-- Insert the sample rows.
INSERT INTO TABLE t1 VALUES
('Page1',12,'2018-6-11'),('Page2',13,'2018-6-11'),
('Page1',15,'2018-6-12'),('Page2',45,'2018-6-12'),
('Page1',11,'2018-6-13'),('Page2',3,'2018-6-13'),
('Page1',17,'2018-6-14'),('Page2',13,'2018-6-15'),
('Page1',19,'2018-6-20'),('Page2',14,'2018-6-22'),
('Page1',10,'2018-6-25'),('Page2',13,'2018-6-28'),
('Page1',22,'2018-7-1'),('Page2',3,'2018-7-1'),
('Page1',32,'2018-7-15'),('Page2',19,'2018-7-23'),
('Page1',33,'2018-8-1'),('Page2',55,'2018-8-1'),
('Page1',5,'2018-8-2'),('Page2',44,'2018-8-3'),
('Page1',12,'2019-11-27'),('Page2',64,'2019-11-28');
-- Show what was loaded.
SELECT * FROM t1;
按年汇总
-- Total clicks per calendar year.
SELECT
    YEAR(click_date),
    SUM(clicks)
FROM
    t1
GROUP BY
    YEAR(click_date)
;
按月汇总
-- Total clicks per (year, month).
SELECT YEAR(click_date), MONTH(click_date), SUM(clicks)
FROM t1
GROUP BY YEAR(click_date), MONTH(click_date);
按天汇总
-- Total clicks per (year, month, day).
SELECT YEAR(click_date), MONTH(click_date), DAY(click_date), SUM(clicks)
FROM t1
GROUP BY YEAR(click_date), MONTH(click_date), DAY(click_date);
按页面按月汇总
-- Total clicks per page and month; the month column is the year and the
-- month number joined with '-'.
SELECT page, CONCAT(YEAR(click_date),'-',MONTH(click_date)), SUM(clicks)
FROM t1
GROUP BY page, YEAR(click_date), MONTH(click_date);
以上是关于大数据(8r)分时段点击量统计待完成的主要内容,如果未能解决你的问题,请参考以下文章