Hive电子商务消费行为分析项目
Posted 数据攻城小狮子
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Hive电子商务消费行为分析项目相关的知识,希望对你有一定的参考价值。
文章目录
数据说明
某零售企业的门店最近一年收集的数据
customer_details.csv:客户信息
transaction_details.csv:交易信息
store_details.csv:门店信息
store_review.csv:评价信息
环境准备
CentOS 7 虚拟机,已安装 Hadoop、Hive 和 Zeppelin
启动Hadoop、Hive、Zeppelin
# Start the Hadoop daemons first; HiveServer2 below is launched after them.
./hadoop/sbin/start-all.sh
# Launch HiveServer2 as a background job, detached from the terminal via nohup.
nohup hive --service hiveserver2 &
# Start the Zeppelin daemon.
./zeppelin09/bin/zeppelin-daemon.sh start
打开Zeppelin页面(URL中的hive为虚拟机的主机名,也可替换为虚拟机的IP地址)
http://hive:8000/
项目代码
上传数据文件并创建数据表
进入到存放数据的虚拟机目录并查看文件信息
%sh
# Inspect the four raw CSV files: print every line count first,
# then the first two lines of each file.
cd /workspace/hive/store/
csv_files="customer_details.csv store_details.csv store_review.csv transaction_details.csv"
for csv in $csv_files; do
    wc -l "$csv"
done
for csv in $csv_files; do
    head -2 "$csv"
done
将数据上传至hdfs目录
%sh
# Rebuild /data/shopping on HDFS and upload each CSV into its own subdirectory.
cd /workspace/hive/store/
# Remove any previous upload (recursive, forced, bypassing the trash).
hdfs dfs -rm -r -f -skipTrash /data/shopping/
# Each entry is <hdfs subdirectory>:<local csv file>.
for entry in customer:customer_details.csv transaction:transaction_details.csv store:store_details.csv review:store_review.csv; do
    subdir=${entry%%:*}
    csv=${entry#*:}
    hdfs dfs -mkdir -p "/data/shopping/${subdir}/"
    hdfs dfs -put "$csv" "/data/shopping/${subdir}/"
done
# Recursive listing of the uploaded tree.
hdfs dfs -ls -R /data/shopping
创建hive数据表并载入数据
%hive
-- Create the shopping database if needed, switch to it, and declare an external
-- table over the customer CSV directory. Every column is a string and the first
-- line of each file is skipped as a header.
CREATE DATABASE IF NOT EXISTS shopping;
USE shopping;
CREATE EXTERNAL TABLE IF NOT EXISTS ext_customer_details (
    customer_id STRING,
    first_name  STRING,
    last_name   STRING,
    email       STRING,
    gender      STRING,
    address     STRING,
    country     STRING,
    language    STRING,
    job         STRING,
    credit_type STRING,
    credit_no   STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
LOCATION '/data/shopping/customer'
TBLPROPERTIES ("skip.header.line.count"="1")
%hive
-- External table over the raw transaction CSV directory. The first line of each
-- file is skipped as a header.
-- OpenCSVSerde exposes every column as STRING regardless of the declared type,
-- so price and purchase_date are declared as STRING here instead of DECIMAL and
-- DATE. Typed values are obtained when the rows are inserted into the typed
-- transaction_details table.
USE shopping;
CREATE EXTERNAL TABLE IF NOT EXISTS ext_transaction_details (
    transaction_id STRING,
    customer_id    STRING,
    store_id       STRING,
    price          STRING,
    product        STRING,
    purchase_date  STRING,
    purchase_time  STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
LOCATION '/data/shopping/transaction'
TBLPROPERTIES ("skip.header.line.count"="1")
%hive
-- External table over the store CSV directory. All columns are strings and the
-- first line of each file is skipped as a header.
USE shopping;
CREATE EXTERNAL TABLE IF NOT EXISTS ext_store_details (
    store_id        STRING,
    store_name      STRING,
    employee_number STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
LOCATION '/data/shopping/store'
TBLPROPERTIES ("skip.header.line.count"="1")
%hive
-- External table over the review CSV directory. All columns are strings and the
-- first line of each file is skipped as a header.
USE shopping;
CREATE EXTERNAL TABLE IF NOT EXISTS ext_store_review (
    transaction_id STRING,
    store_id       STRING,
    review_score   STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
LOCATION '/data/shopping/review'
TBLPROPERTIES ("skip.header.line.count"="1")
数据清洗
%hive
-- View over ext_customer_details. last_name, email and address are passed
-- through unbase64. credit_no is decoded, has the literal seed appended, and is
-- decoded again. The language column of the base table is not exposed.
-- NOTE(review): unbase64 returns BINARY in Hive, so these columns may need a
-- cast to STRING for display - confirm against the actual data.
CREATE VIEW IF NOT EXISTS vw_customer_details AS
SELECT
    customer_id,
    first_name,
    unbase64(last_name) AS last_name,
    unbase64(email) AS email,
    gender,
    unbase64(address) AS address,
    country,
    job,
    credit_type,
    unbase64(concat(unbase64(credit_no), 'seed')) AS credit_no
FROM ext_customer_details
%hive
-- Managed, typed copy of the transaction data, partitioned by purchase month.
CREATE TABLE IF NOT EXISTS transaction_details (
    transaction_id STRING,
    customer_id    STRING,
    store_id       STRING,
    price          DECIMAL(8,2),
    product        STRING,
    purchase_date  DATE,
    purchase_time  STRING
)
PARTITIONED BY (purchase_month STRING)
%hive
-- Load transaction_details from the raw external table, one partition per
-- purchase month (yyyy-MM derived from purchase_date).
-- The insert supplies no static partition value, so dynamic partitioning is
-- enabled and its mode set to nonstrict (the original value was misspelled).
-- Rows whose customer_id equals the literal header text are dropped.
-- Rows sharing a transaction_id are numbered by store_id. The first keeps its
-- id and later ones get the suffix _fix followed by their row number.
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
WITH base AS (
    SELECT
        transaction_id,
        customer_id,
        store_id,
        price,
        product,
        purchase_date,
        purchase_time,
        from_unixtime(unix_timestamp(purchase_date, 'yyyy-MM-dd'), 'yyyy-MM') AS purchase_month,
        row_number() OVER (PARTITION BY transaction_id ORDER BY store_id) AS rn
    FROM ext_transaction_details
    WHERE customer_id <> 'customer_id'
)
FROM base
INSERT OVERWRITE TABLE transaction_details PARTITION (purchase_month)
SELECT
    IF(rn = 1, transaction_id, concat(transaction_id, '_fix', rn)) AS transaction_id,
    customer_id,
    store_id,
    price,
    product,
    purchase_date,
    purchase_time,
    purchase_month;
-- List the rows whose transaction_id was rewritten by the load above.
SELECT
    transaction_id,
    customer_id,
    store_id,
    price,
    product,
    purchase_date,
    purchase_time,
    purchase_month
FROM transaction_details
WHERE transaction_id LIKE '%fix%';
%hive
-- Count non-empty reviews whose transaction_id and store_id pair also appears
-- in the raw transaction table.
SELECT count(*)
FROM ext_store_review AS r
INNER JOIN ext_transaction_details AS t
    ON r.transaction_id = t.transaction_id
    AND r.store_id = t.store_id
WHERE review_score <> ''
%hive
-- Count all reviews that carry a non-empty score.
SELECT count(*)
FROM ext_store_review
WHERE review_score <> '';
%hive
-- View exposing transaction_id and review_score for reviews with a non-empty score.
CREATE VIEW IF NOT EXISTS vw_store_review AS
SELECT
    transaction_id,
    review_score
FROM ext_store_review
WHERE review_score <> ''
数据可视化
客户分析
最受客户欢迎的信用卡
%hive
-- Most popular credit card types: number of distinct card numbers per
-- credit_type, highest first.
-- Grouping is by credit_type only, so each card type yields exactly one row.
-- The earlier version also grouped by country without selecting it, which
-- returned several unlabeled rows per card type.
SELECT
    credit_type,
    count(DISTINCT credit_no) AS credit_cnt
FROM vw_customer_details
GROUP BY credit_type
ORDER BY credit_cnt DESC;
排名前5的客户职业
%hive
-- Five most common customer jobs by number of customers.
SELECT
    job,
    count(*) AS pn
FROM vw_customer_details
GROUP BY job
ORDER BY pn DESC
LIMIT 5;
美国女性客户持有的排名前3的信用卡
%hive
-- Three most common credit card types among female customers in the United States.
SELECT
    credit_type,
    count(*) AS ct
FROM vw_customer_details
WHERE country = 'United States'
    AND gender = 'Female'
GROUP BY credit_type
ORDER BY ct DESC
LIMIT 3;
按国家和性别进行客户统计
%hive
-- Customer count for every country and gender combination.
SELECT
    count(*),
    country,
    gender
FROM vw_customer_details
GROUP BY country, gender;
交易分析
按月统计总收益
%hive
-- Total revenue per purchase month.
SELECT
    sum(price) AS revenue_mom,
    purchase_month
FROM transaction_details
GROUP BY purchase_month;
按季度统计总收益
%hive
-- Total revenue per year-quarter. The quarter key is the four-digit year, a
-- dash, and the quarter number computed as ceil(month / 3.0).
WITH base AS (
    SELECT
        price,
        concat_ws(
            '-',
            substr(purchase_date, 1, 4),
            cast(ceil(month(purchase_date) / 3.0) AS STRING)
        ) AS year_quarter
    FROM transaction_details
)
SELECT
    sum(price) AS revenue_qoq,
    year_quarter
FROM base
GROUP BY year_quarter;
按年统计总收益
%hive
-- Total revenue per year, where the year is the first four characters of
-- purchase_date.
-- The measure is named revenue_yoy to match the period suffix convention of the
-- monthly (mom), quarterly (qoq) and weekly (wow) queries. It was previously
-- mislabeled revenue_mom.
SELECT
    sum(price) AS revenue_yoy,
    substr(purchase_date, 1, 4) AS year
FROM transaction_details
GROUP BY substr(purchase_date, 1, 4);
统计每周各天的总收益
%hive
-- Total revenue per weekday, using the date_format pattern letter u as the
-- weekday key.
SELECT
    sum(price) AS revenue_wow,
    date_format(purchase_date, 'u') AS weekday
FROM transaction_details
GROUP BY date_format(purchase_date, 'u');
按时间段统计平均收益和总收益
%hive
-- Average spend and total revenue per time-of-day bucket.
-- base: rewrites purchase_time as an hour:minute string. Values ending in PM
--   get 12 added to the parsed hour, all others are reformatted as HH:mm.
-- timeformat: converts that string to fractional hours (hour + minute / 60).
-- timebucket: labels each row. Boundaries are exclusive on the lower side and
--   inclusive on the upper side, and anything outside 5 to 22 is night.
-- Revenue is divided by 1000 so the chart is easier to read.
WITH base AS (
    SELECT
        price,
        purchase_time,
        CASE
            WHEN purchase_time LIKE '%PM' THEN
                concat_ws(
                    ':',
                    string(hour(from_unixtime(unix_timestamp(purchase_time, 'hh:mm'))) + 12),
                    string(minute(from_unixtime(unix_timestamp(purchase_time, 'hh:mm'))))
                )
            ELSE from_unixtime(unix_timestamp(purchase_time, 'hh:mm'), 'HH:mm')
        END AS time_format
    FROM transaction_details
),
timeformat AS (
    SELECT
        purchase_time,
        price,
        (
            cast(split(time_format, ':')[0] AS DECIMAL(4,2))
            + cast(split(time_format, ':')[1] AS DECIMAL(4,2)) / 60
        ) AS purchase_time_in_hrs
    FROM base
),
timebucket AS (
    SELECT
        price,
        purchase_time,
        purchase_time_in_hrs,
        CASE
            WHEN purchase_time_in_hrs > 5 AND purchase_time_in_hrs <= 8 THEN 'early morning'
            WHEN purchase_time_in_hrs > 8 AND purchase_time_in_hrs <= 11 THEN 'morning'
            WHEN purchase_time_in_hrs > 11 AND purchase_time_in_hrs <= 13 THEN 'noon'
            WHEN purchase_time_in_hrs > 13 AND purchase_time_in_hrs <= 18 THEN 'afternoon'
            WHEN purchase_time_in_hrs > 18 AND purchase_time_in_hrs <= 22 THEN 'evening'
            ELSE 'night'
        END AS time_bucket
    FROM timeformat
)
SELECT
    time_bucket,
    avg(price) AS avg_spend,
    sum(price) / 1000 AS revenue_k
FROM timebucket
GROUP BY time_bucket
统计每周各天的平均收益
%hive
-- Average transaction price per weekday, skipping rows whose weekday key is NULL.
SELECT
    avg(price) AS avg_price,
    date_format(purchase_date, 'u') AS weekday
FROM transaction_details
WHERE date_format(purchase_date, 'u') IS NOT NULL
GROUP BY date_format(purchase_date, 'u');
按年度、季度、月度和星期统计总交易量
%hive
-- Number of distinct transactions for each combination of weekday, purchase
-- month, year-quarter and year, ordered by year then month.
WITH base AS (
    SELECT
        transaction_id,
        date_format(purchase_date, 'u') AS weekday,
        purchase_month,
        concat_ws(
            '-',
            substr(purchase_date, 1, 4),
            cast(ceil(month(purchase_date) / 3.0) AS STRING)
        ) AS year_quarter,
        substr(purchase_date, 1, 4) AS year
    FROM transaction_details
    WHERE purchase_month IS NOT NULL
)
SELECT
    count(DISTINCT transaction_id) AS total,
    weekday,
    purchase_month,
    year_quarter,
    year
FROM base
GROUP BY weekday, purchase_month, year_quarter, year
ORDER BY year, purchase_month
统计消费次数排行榜的前10位客户
%hive
-- Ten customers with the most distinct transactions, shown by first name.
-- base aggregates per customer and cust_detail attaches the first name from
-- the customer view.
WITH base AS (
    SELECT
        customer_id,
        count(DISTINCT transaction_id) AS trans_cnt,
        sum(price) AS spend_total
    FROM transaction_details
    WHERE purchase_month IS NOT NULL
    GROUP BY customer_id
),
cust_detail AS (
    SELECT
        td.customer_id,
        td.trans_cnt,
        td.spend_total,
        cd.first_name AS cust_name
    FROM base AS td
    INNER JOIN vw_customer_details AS cd
        ON td.customer_id = cd.customer_id
)
SELECT
    trans_cnt,
    cust_name AS top10_trans_cust
FROM cust_detail
ORDER BY trans_cnt DESC
LIMIT 10;
统计消费额排名前10的客户
%hive
-- Ten customers with the highest total spend, shown by first name.
-- base aggregates per customer and cust_detail attaches the first name from
-- the customer view.
WITH base AS (
    SELECT customer_id,
           count(DISTINCT transaction_id) AS trans_cnt,
           sum(price) AS spend_total
    FROM transaction_details
    WHERE purchase_month IS NOT NULL
    GROUP BY customer_id
),
cust_detail AS (
    SELECT td.customer_id,
           td.trans_cnt,
           td.spend_total,
           cd.first_name AS cust_name
    FROM base AS td
    INNER JOIN vw_customer_details AS cd
        ON td.customer_id = cd.customer_id
)
SELECT spend_total,
       cust_name AS top10_trans_cust
FROM cust_detail
ORDER BY spend_total DESC
LIMIT 10;
统计周期内消费次数最少的客户
%hive
-- Ten customers with the fewest distinct transactions.
WITH base AS (
    SELECT
        customer_id,
        count(DISTINCT transaction_id) AS trans_cnt
    FROM transaction_details
    WHERE purchase_month IS NOT NULL
    GROUP BY customer_id
)
SELECT
    customer_id,
    trans_cnt
FROM base
ORDER BY trans_cnt
LIMIT 10;
统计每年度-季度客户总数
%hive
-- Number of distinct customers per year-quarter, with the year alongside.
WITH base AS (
    SELECT
        customer_id,
        concat_ws(
            '-',
            substr(purchase_date, 1, 4),
            cast(ceil(month(purchase_date) / 3.0) AS STRING)
        ) AS year_quarter,
        substr(purchase_date, 1, 4) AS year
    FROM transaction_details
    WHERE purchase_month IS NOT NULL
)
SELECT
    count(DISTINCT customer_id) AS total,
    year_quarter,
    year
FROM base
GROUP BY year_quarter, year
ORDER BY year_quarter;
统计最大的客户平均消费额
%hive
-- Largest per-customer average transaction price.
-- base also computes the per-customer maximum price, which the final select
-- does not use.
WITH base AS (
    SELECT
        customer_id,
        avg(price) AS price_avg,
        max(price) AS price_max
    FROM transaction_details
    WHERE purchase_month IS NOT NULL
    GROUP BY customer_id
)
SELECT max(price_avg)
FROM base;
统计每月最高消费额与最常来的客户
%hive
-- For every purchase month, report the customers ranked first by total spend
-- (measure spend) and first by transaction count (measure visit).
-- rank() is used, so tied customers all appear.
WITH base AS (
    SELECT
        customer_id,
        purchase_month,
        sum(price) AS price_sum,
        count(transaction_id) AS trans_cnt
    FROM transaction_details
    WHERE purchase_month IS NOT NULL
    GROUP BY purchase_month, customer_id
),
rank_sum AS (
    SELECT
        rank() OVER (PARTITION BY purchase_month ORDER BY price_sum DESC) AS rn_sum,
        rank() OVER (PARTITION BY purchase_month ORDER BY trans_cnt DESC) AS rn_cnt,
        purchase_month,
        price_sum,
        trans_cnt,
        customer_id
    FROM base
)
SELECT
    purchase_month,
    'spend' AS measure_name,
    price_sum AS measure_value,
    customer_id
FROM rank_sum
WHERE rn_sum = 1
UNION ALL
SELECT
    purchase_month,
    'visit' AS measure_name,
    trans_cnt AS measure_value,
    customer_id
FROM rank_sum
WHERE rn_cnt = 1
ORDER BY measure_name, purchase_month;
基于消费额统计受欢迎程度排名前5的商品并进行验证
%hive
-- Five products with the highest total revenue.
SELECT
    product,
    sum(price) AS price_sum
FROM transaction_details
WHERE purchase_month IS NOT NULL
GROUP BY product
ORDER BY price_sum DESC
LIMIT 5;
基于购买频次统计受欢迎程度排名前5的商品并进行验证
%hive
-- Five products bought most often, counted by transaction rows.
SELECT
    product,
    count(transaction_id) AS freq_buy
FROM transaction_details
WHERE purchase_month IS NOT NULL
GROUP BY product
ORDER BY freq_buy DESC
LIMIT 5;
基于客户数量统计受欢迎程度排名前5的商品并进行验证
%hive
-- Five products bought by the largest number of distinct customers.
-- DISTINCT is required here: counting customer_id without it counts
-- transaction rows and merely repeats the purchase-frequency ranking.
SELECT
    product,
    count(DISTINCT customer_id) AS freq_cust
FROM transaction_details
WHERE purchase_month IS NOT NULL
GROUP BY product
ORDER BY freq_cust DESC
LIMIT 5;
门店分析
按客流量统计最受欢迎的门店
%hive
-- Five stores with the most distinct customers.
SELECT
    sd.store_name,
    count(DISTINCT customer_id) AS unique_visit
FROM transaction_details AS td
INNER JOIN ext_store_details AS sd
    ON td.store_id = sd.store_id
GROUP BY store_name
ORDER BY unique_visit DESC
LIMIT 5;
按客户消费额统计最受欢迎的门店
%hive
-- Five stores with the highest total revenue.
SELECT
    sd.store_name,
    sum(td.price) AS total_revnue
FROM transaction_details AS td
INNER JOIN ext_store_details AS sd
    ON td.store_id = sd.store_id
GROUP BY store_name
ORDER BY total_revnue DESC
LIMIT 5;
按交易频次统计最受欢迎的门店
%hive
-- Five stores with the most transaction rows.
SELECT
    sd.store_name,
    count(transaction_id) AS unique_purchase
FROM transaction_details AS td
INNER JOIN ext_store_details AS sd
    ON td.store_id = sd.store_id
GROUP BY store_name
ORDER BY unique_purchase DESC
LIMIT 5;
按客流量统计每个门店最受欢迎的商品
%hive
with base as (select store_id,product,count(distinct customer_id) as freq_cust
from transaction_details where purchase_month is not null group by store_id, product),
prod_rank as (select store_id,product,freq_cust,
rank(Hive实战 --- 电子商务消费行为分析