1.背景及问题
某IT产品销售公司拥有一定数量的小型企业客户。这些客户在做出购买决策之前,会接触到该公司的多个营销渠道。如何在不同渠道之间分配投入,以实现营销效益的最大化,是很多公司市场营销部门亟需解决的问题。
即:找出转化率最高的渠道路径或方式
2.思路步骤
- 线性模型分析
- 马尔科夫链分析
- 可视化马尔科夫链
转化率计算
- 第一次点击:用户访问路径上的第一个触点获得全部贡献值
- 最后一次点击:用户购买之前的最后一个触点获得全部贡献值
- 线性模型分析:用户访问路径上的所有触点平分贡献值
- 马尔科夫链:马尔科夫链的转移矩阵 -> 每个触点的移除效应 -> 触点贡献值
3.数据集介绍
Id: 某IT产品销售公司的客户,客户类型是小公司
Segment: 客户的画像
Channel:客户生命周期中触及过的渠道; DM(直邮),EM(电子邮件), PHONE(电话)和 WEB(产品销售官网浏览记录)
Date: 客户触及渠道的日期,触及时间长度为1年
Pur_flag: 等于1表示该客户在接触完相应渠道后,完成了IT产品的购买
4.代码及具体步骤
导入模块
import numpy as np
import pandas as pd
import networkx as nx
from pprint import pprint
import os
import matplotlib.pyplot as plt
# Switch to the project directory so the relative CSV path below resolves.
os.chdir(r\'C:/Users/pc/Desktop/数据分析项目/客户转化分析/\')
# Load the data, parsing the fourth column (index 3) as dates.
df = pd.read_csv(r\'./ChannelAttribute.csv\', parse_dates=[3])
# Preview the first rows.
df.head()
 | id | segment | channel | date | pur_flag |
---|---|---|---|---|---|
0 | 20398764672 | Tier 2 | DM | 2018-03-19 | 0 |
1 | 20408399343 | Tier 2 | WEB | 2017-09-27 | 0 |
2 | 20438922645 | Tier 2 | WEB | 2017-11-15 | 0 |
3 | 20225918468 | Tier 2 | DM | 2017-05-24 | 0 |
4 | 20278581048 | Tier 3 | DM | 2018-04-23 | 0 |
创建路径数据
def create_path_with_value(data, element):
    """Build the date-ordered touch-point path for one customer.

    Args:
        data: DataFrame with at least the columns ``id``, ``date``,
            ``channel`` and ``pur_flag``.
        element: Customer id whose rows are turned into a path.

    Returns:
        ``[path, conv, conv_null]`` where ``path`` is
        ``['start', <channels ordered by date>, 'conversion' | 'null']``,
        ``conv`` is 1 when any of the customer's rows has ``pur_flag == 1``
        (0 otherwise) and ``conv_null`` is its complement.
    """
    # Every selected row shares the same id, so ordering by date is enough;
    # a stable sort keeps same-day touches in their original row order.
    touches = data.loc[data['id'] == element].sort_values('date', kind='mergesort')
    path = ['start', *touches['channel'].tolist()]
    # ``.unique() == 1`` yields an array whose truth value is ambiguous as
    # soon as a customer has both 0 and 1 flags; test for any purchase instead.
    if (touches['pur_flag'] == 1).any():
        path.append('conversion')
        return [path, 1, 0]
    path.append('null')
    return [path, 0, 1]
# Build one [path, conv, conv_null] record per distinct customer id.
path_records = [
    create_path_with_value(df, customer_id)
    for customer_id in df['id'].unique()
]
final_path = [record[0] for record in path_records]
conv = [record[1] for record in path_records]
conv_null = [record[2] for record in path_records]
# Path data: one row per customer.
path_data = pd.DataFrame({'path': final_path, 'conv': conv, 'conv_nulls': conv_null})
path_data.head(10)
 | path | conv | conv_nulls |
---|---|---|---|
0 | [start, WEB, EM, DM, null] | 0 | 1 |
1 | [start, EM, WEB, DM, null] | 0 | 1 |
2 | [start, WEB, EM, DM, null] | 0 | 1 |
3 | [start, DM, EM, WEB, null] | 0 | 1 |
4 | [start, EM, WEB, DM, null] | 0 | 1 |
5 | [start, PHONE, EM, DM, WEB, null] | 0 | 1 |
6 | [start, PHONE, WEB, DM, null] | 0 | 1 |
7 | [start, DM, PHONE, WEB, null] | 0 | 1 |
8 | [start, WEB, EM, DM, conversion] | 1 | 0 |
9 | [start, PHONE, WEB, DM, null] | 0 | 1 |
归因分析(最后一次点击、第一次点击和线性模型)
def create_last_click_stats_pair(data):
    """Credit each path's conversion to its last touch point.

    Args:
        data: DataFrame with a ``path`` column (lists shaped like
            ``['start', ..., 'conversion' | 'null']``) and a ``conv`` column.

    Returns:
        DataFrame with one row per path: ``touch`` is the element right
        before the terminal state and ``Last_Conv`` is that path's ``conv``.
    """
    # Iterate the column directly instead of doing a positional ``iloc``
    # row lookup on every iteration.
    last_touches = [path[-2] for path in data['path']]
    return pd.DataFrame({'touch': last_touches, 'Last_Conv': data['conv'].tolist()})
def create_first_order_states_pair(data):
    """Credit each path's conversion to its first touch point.

    Args:
        data: DataFrame with a ``path`` column (lists shaped like
            ``['start', ..., 'conversion' | 'null']``) and a ``conv`` column.

    Returns:
        DataFrame with one row per path: ``touch`` is the element right
        after ``'start'`` and ``First_Conv`` is that path's ``conv``.
    """
    # Iterate the column directly instead of doing a positional ``iloc``
    # row lookup on every iteration.
    first_touches = [path[1] for path in data['path']]
    return pd.DataFrame({'touch': first_touches, 'First_Conv': data['conv'].tolist()})
def create_linear_click_stats_pair(data):
    """Split each path's conversion evenly across all of its touch points.

    The previous version only handled paths of length 3 to 6; any longer
    path fell into the ``else`` branch and gave its whole conversion to the
    first touch. This version handles any path length.

    Args:
        data: DataFrame with a ``path`` column (lists shaped like
            ``['start', ..., 'conversion' | 'null']``) and a ``conv`` column.

    Returns:
        DataFrame with one row per (path, touch point): ``touch`` is the
        channel and ``Linear_Conv`` is ``conv`` divided by the number of
        touch points in that path. Paths without any touch point add no rows.
    """
    touches, credits = [], []
    for path, conv in zip(data['path'], data['conv']):
        # Drop the leading 'start' and the trailing terminal state.
        channels = path[1:-1]
        if not channels:
            # Nothing to credit; this also avoids dividing by zero.
            continue
        share = conv / len(channels)
        touches.extend(channels)
        credits.extend([share] * len(channels))
    return pd.DataFrame({'touch': touches, 'Linear_Conv': credits})
# Total conversion credit per channel under each attribution rule.
last_touch = create_last_click_stats_pair(path_data).groupby('touch')['Last_Conv'].sum().reset_index()
linear_touch = create_linear_click_stats_pair(path_data).groupby('touch')['Linear_Conv'].sum().reset_index()
first_touch = create_first_order_states_pair(path_data).groupby('touch')['First_Conv'].sum().reset_index()
# Turn each total into a share of all conversions: dividing a frame by its
# column sums is what the former per-column ``apply`` did.
lst = last_touch.set_index('touch')
lst = lst / lst.sum()
li = linear_touch.set_index('touch')
li = li / li.sum()
fst = first_touch.set_index('touch')
fst = fst / fst.sum()
# Side-by-side comparison of the three rules, indexed by channel. The former
# ``dfs = [df for df in dfs]`` step was a no-op and has been dropped.
dfs = [fst, lst, li]
dfs[0].join(dfs[1:])
touch | First_Conv | Last_Conv | Linear_Conv |
---|---|---|---|
DM | 0.341152 | 0.744850 | 0.504964 |
EM | 0.278233 | 0.097915 | 0.198478 |
PHONE | 0.094440 | 0.014768 | 0.050488 |
WEB | 0.286175 | 0.142467 | 0.246070 |
**线性模型分析结论: 相比于其他渠道,DM(直邮)是转化率较优的渠道**
马尔科夫链
# Compute the state-transition matrix by hand.
def split_states(data):
    """Break every path into its consecutive (from_state, to_state) pairs.

    Args:
        data: DataFrame with a ``path`` column holding lists of states.

    Returns:
        A list with one ``[state_pairs, values]`` entry per path, where
        ``state_pairs`` lists the consecutive transitions of the path and
        ``values`` holds a 1 for each of them (a unit count per transition).
    """
    return [
        # Zipping the path with itself shifted by one yields the transitions.
        [list(zip(path, path[1:])), [1] * (len(path) - 1)]
        for path in data['path']
    ]
# Split every customer path into transition pairs and preview the first three.
temps = split_states(path_data)
temps[0:3]
[[[(\'start\', \'WEB\'), (\'WEB\', \'EM\'), (\'EM\', \'DM\'), (\'DM\', \'null\')],
[1, 1, 1, 1]],
[[(\'start\', \'EM\'), (\'EM\', \'WEB\'), (\'WEB\', \'DM\'), (\'DM\', \'null\')],
[1, 1, 1, 1]],
[[(\'start\', \'WEB\'), (\'WEB\', \'EM\'), (\'EM\', \'DM\'), (\'DM\', \'null\')],
[1, 1, 1, 1]]]
def transition_maxtrix(data):
    """Build the formatted transition-probability table from split paths.

    Args:
        data: Sequence of ``[state_pairs, values]`` entries, where
            ``state_pairs`` is a list of ``(start, end)`` tuples and
            ``values`` the matching counts.

    Returns:
        DataFrame indexed by ``end`` state with one column per ``start``
        state. Each column holds the share of that start state's transitions
        going to each end state, formatted as strings with two decimals.
    """
    starts, ends, weights = [], [], []
    for pairs, values in data:
        for (origin, destination), value in zip(pairs, values):
            starts.append(origin)
            ends.append(destination)
            weights.append(value)
    flat = pd.DataFrame({'start': starts, 'end': ends, 'values': weights})
    # crosstab sums the counts per (end, start) cell itself, so no separate
    # groupby is needed; normalising by column gives P(end | start).
    table = pd.crosstab(flat['end'], flat['start'], values=flat['values'],
                        aggfunc='sum', normalize='columns')
    # Format column by column with Series.map; DataFrame.applymap is
    # deprecated in recent pandas releases.
    return table.apply(lambda column: column.map('{:3.2f}'.format))
# Build the transition matrix, then transpose it so that rows are the
# origin states and columns are the destination states.
tmp = transition_maxtrix(temps)
tmp1 = tmp.transpose()
tmp1
end | DM | EM | PHONE | WEB | conversion | null |
---|---|---|---|---|---|---|
start | ||||||
DM | 0.00 | 0.09 | 0.01 | 0.12 | 0.21 | 0.57 |
EM | 0.43 | 0.00 | 0.02 | 0.41 | 0.04 | 0.09 |
PHONE | 0.18 | 0.11 | 0.00 | 0.69 | 0.01 | 0.01 |
WEB | 0.58 | 0.22 | 0.03 | 0.00 | 0.05 | 0.12 |
start | 0.14 | 0.29 | 0.32 | 0.25 | 0.00 | 0.00 |
计算渠道的移除效应
def channel_remove(data, channel_removed):
    """Count transitions and relabel one channel as ``'unknown'``.

    Args:
        data: Sequence of ``[state_pairs, values]`` entries, where
            ``state_pairs`` is a list of ``(start, end)`` tuples and
            ``values`` the matching counts.
        channel_removed: Channel name to replace in both the ``start`` and
            ``end`` columns.

    Returns:
        ``[removed, original]``: two DataFrames with the columns
        ``state_pairs``, ``values``, ``start`` and ``end`` (one row per
        distinct transition, ``values`` being its total count). In
        ``removed`` every occurrence of ``channel_removed`` in ``start`` or
        ``end`` is replaced by ``'unknown'``; ``original`` is the untouched
        copy taken before the replacement.
    """
    flat = [item for pairs, values in data for item in zip(pairs, values)]
    temp_df = pd.DataFrame({
        'state_pairs': [pair for pair, _ in flat],
        'values': [value for _, value in flat],
    })
    grp_df = temp_df.groupby('state_pairs')['values'].sum().reset_index()
    # Split the tuples with plain comprehensions; ``apply(pd.Series)`` builds
    # one Series per row and is much slower.
    grp_df['start'] = [pair[0] for pair in grp_df['state_pairs']]
    grp_df['end'] = [pair[1] for pair in grp_df['state_pairs']]
    # Snapshot before relabelling so callers also get the original states.
    untouched = grp_df.copy()
    for column in ('start', 'end'):
        grp_df[column] = grp_df[column].replace(channel_removed, 'unknown')
    return [grp_df, untouched]
# Keep only the paths that ended in a conversion.
path_data_pur = path_data[path_data['conv'] == 1]
temps = split_states(path_data_pur)
conversion = []
columns = ['start', 'end', 'values_x', 'values_y', 'perct']
# All channels.
channels_list = list(df['channel'].unique())
# Self-loop rows for the boundary states so they show up in the matrices;
# 'conversion' and 'null' stay put with probability 1.
df_dummy1 = pd.DataFrame({'start': ['start', 'conversion', 'null'],
                          'end': ['start', 'conversion', 'null'],
                          'values_x': [0, 0, 0],
                          'values_y': [0, 0, 0],
                          'perct': [0, 1, 1]})
dy_dummy = pd.DataFrame(df_dummy1, columns=columns)
df_dummy2 = pd.DataFrame({'start': ['start', 'conversion', 'null'],
                          'end': ['start', 'conversion', 'null']})
fill_columns = ['values_x', 'values_y', 'perct']
# Estimate the number of conversions with each channel removed in turn.
for chnl in channels_list:
    # One call returns both frames (it used to be called twice per channel).
    df_remove, df_noremove = channel_remove(temps, chnl)
    # Outgoing totals per start state, including transitions into 'unknown',
    # so the remaining probabilities are deliberately not renormalised.
    start_totals = df_remove.groupby('start')['values'].sum().reset_index()
    df_temp = pd.merge(df_remove, start_totals, on='start', how='left')
    df_temp['perct'] = df_temp['values_x'] / df_temp['values_y']
    df_temp = pd.concat([df_temp[columns], dy_dummy], axis=0)
    # The left frame keeps the original state names, so rows that involve the
    # removed channel find no match and end up as zeros below.
    df_temp2 = pd.concat([df_noremove[['start', 'end']], df_dummy2], axis=0)
    df_temp = pd.merge(df_temp2, df_temp, on=['start', 'end'], how='left')
    # Fill the NaNs produced by the left join with 0. Assign the result back:
    # a chained ``df[col].fillna(..., inplace=True)`` does not update the
    # frame under pandas Copy-on-Write.
    df_temp[fill_columns] = df_temp[fill_columns].fillna(0)
    # Transition probabilities (rows: start state, columns: end state).
    df_trans1 = pd.crosstab(df_temp['start'], df_temp['end'],
                            values=df_temp['perct'], aggfunc='sum').fillna(0)
    # Raw transition counts in the same layout.
    inist_n1 = pd.crosstab(df_temp['start'], df_temp['end'],
                           values=df_temp['values_x'], aggfunc='sum').fillna(0)
    # Propagate the counts leaving 'start' one step through the transition
    # matrix; labels replace the positional ``iloc[-1]`` / ``mat[0, 4]``.
    estimated = inist_n1.loc['start'].dot(df_trans1)
    # Estimated number of conversions without this channel.
    conversion.append(estimated['conversion'])
# Removal effect of every single channel.
chnl_conversion = pd.DataFrame({'channel': channels_list, 'conv': conversion})
# NOTE(review): this baseline is the total number of transitions across the
# converted paths (the sum of all pair counts), not the number of converting
# customers -- confirm that this is the intended denominator.
tot_conv = sum(sum(values) for _, values in temps)
chnl_conversion['impact'] = (tot_conv - chnl_conversion['conv']) / tot_conv
tot_impact = chnl_conversion['impact'].sum()
chnl_conversion['convet_rate'] = chnl_conversion['impact'] / tot_impact
chnl_conversion
 | channel | conv | impact | convet_rate |
---|---|---|---|---|
0 | DM | 1003.039274 | 0.961468 | 0.265241 |
1 | WEB | 2724.928034 | 0.895320 | 0.246993 |
2 | EM | 2856.954317 | 0.890248 | 0.245594 |
3 | PHONE | 3179.825240 | 0.877845 | 0.242172 |
马尔科夫链模型分析结论: 相比于其他渠道,DM(直邮)的转化贡献值最高,是转化率较优的渠道
可视化马尔科夫链
# Node names for the graph: the entry state, the four channels and the two
# terminal states.
states = [\'start\', \'DM\', \'EM\', \'PHONE\', \'WEB\', \'conversion\', \'null\']
def _get_markov_edges(Q):
    """Flatten a transition table into an edge dictionary.

    Args:
        Q: DataFrame whose index holds the origin states and whose columns
            hold the destination states.

    Returns:
        Dict mapping ``(index_label, column_label)`` to the matching cell of
        ``Q``, built column by column.
    """
    return {(idx, col): Q.loc[idx, col] for col in Q.columns for idx in Q.index}
# Flatten the transposed transition matrix into a {(row, column): value} dict.
edges_wts = _get_markov_edges(tmp1)
edges_wts
{(\'DM\', \'DM\'): \'0.00\',
(\'EM\', \'DM\'): \'0.43\',
(\'PHONE\', \'DM\'): \'0.18\',
(\'WEB\', \'DM\'): \'0.58\',
(\'start\', \'DM\'): \'0.14\',
(\'DM\', \'EM\'): \'0.09\',
(\'EM\', \'EM\'): \'0.00\',
(\'PHONE\', \'EM\'): \'0.11\',
(\'WEB\', \'EM\'): \'0.22\',
(\'start\', \'EM\'): \'0.29\',
(\'DM\', \'PHONE\'): \'0.01\',
(\'EM\', \'PHONE\'): \'0.02\',
(\'PHONE\', \'PHONE\'): \'0.00\',
(\'WEB\', \'PHONE\'): \'0.03\',
(\'start\', \'PHONE\'): \'0.32\',
(\'DM\', \'WEB\'): \'0.12\',
(\'EM\', \'WEB\'): \'0.41\',
(\'PHONE\', \'WEB\'): \'0.69\',
(\'WEB\', \'WEB\'): \'0.00\',
(\'start\', \'WEB\'): \'0.25\',
(\'DM\', \'conversion\'): \'0.21\',
(\'EM\', \'conversion\'): \'0.04\',
(\'PHONE\', \'conversion\'): \'0.01\',
(\'WEB\', \'conversion\'): \'0.05\',
(\'start\', \'conversion\'): \'0.00\',
(\'DM\', \'null\'): \'0.57\',
(\'EM\', \'null\'): \'0.09\',
(\'PHONE\', \'null\'): \'0.01\',
(\'WEB\', \'null\'): \'0.12\',
(\'start\', \'null\'): \'0.00\'}
# Drop the edges whose transition probability is zero.
zero_edges = [edge for edge, prob in edges_wts.items() if prob == '0.00']
for edge in zero_edges:
    del edges_wts[edge]
pprint(edges_wts)
{(\'DM\', \'EM\'): \'0.09\',
(\'DM\', \'PHONE\'): \'0.01\',
(\'DM\', \'WEB\'): \'0.12\',
(\'DM\', \'conversion\'): \'0.21\',
(\'DM\', \'null\'): \'0.57\',
(\'EM\', \'DM\'): \'0.43\',
(\'EM\', \'PHONE\'): \'0.02\',
(\'EM\', \'WEB\'): \'0.41\',
(\'EM\', \'conversion\'): \'0.04\',
(\'EM\', \'null\'): \'0.09\',
(\'PHONE\', \'DM\'): \'0.18\',
(\'PHONE\', \'EM\'): \'0.11\',
(\'PHONE\', \'WEB\'): \'0.69\',
(\'PHONE\', \'conversion\'): \'0.01\',
(\'PHONE\', \'null\'): \'0.01\',
(\'WEB\', \'DM\'): \'0.58\',
(\'WEB\', \'EM\'): \'0.22\',
(\'WEB\', \'PHONE\'): \'0.03\',
(\'WEB\', \'conversion\'): \'0.05\',
(\'WEB\', \'null\'): \'0.12\',
(\'start\', \'DM\'): \'0.14\',
(\'start\', \'EM\'): \'0.29\',
(\'start\', \'PHONE\'): \'0.32\',
(\'start\', \'WEB\'): \'0.25\'}
# Draw the Markov chain with networkx.
# Append the local Graphviz bin directory to PATH (presumably so the 'dot'
# layout program used below can be found -- confirm for your setup).
os.environ["PATH"] += os.pathsep + './graphviz-2.38/release/bin/'
G = nx.MultiDiGraph()
# Add the states as nodes.
G.add_nodes_from(states)
# The f prefix was missing, so '{G.nodes()}' used to be printed literally.
print(f'Nodes:\n{G.nodes()}\n')
# Edges carry the transition probabilities.
for (tmp_origin, tmp_destination), v in edges_wts.items():
    G.add_edge(tmp_origin, tmp_destination, weight=v, label=v)
print('Edges:')
pprint(G.edges(data=True))
pos = nx.drawing.nx_pydot.graphviz_layout(G, prog='dot')
nx.draw_networkx(G, pos)
# Build the edge labels.
edge_labels = {(n1, n2): d['label'] for n1, n2, d in G.edges(data=True)}
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
nx.drawing.nx_pydot.write_dot(G, 'customer_markov.dot')
Nodes:
{G.nodes()}
Edges:
OutMultiEdgeDataView([(\'DM\', \'WEB\', {\'label\': \'0.12\', \'weight\': \'0.12\'}), (\'DM\', \'EM\', {\'label\': \'0.09\', \'weight\': \'0.09\'}), (\'DM\', \'null\', {\'label\': \'0.57\', \'weight\': \'0.57\'}), (\'DM\', \'PHONE\', {\'label\': \'0.01\', \'weight\': \'0.01\'}), (\'DM\', \'conversion\', {\'label\': \'0.21\', \'weight\': \'0.21\'}), (\'start\', \'EM\', {\'label\': \'0.29\', \'weight\': \'0.29\'}), (\'start\', \'DM\', {\'label\': \'0.14\', \'weight\': \'0.14\'}), (\'start\', \'PHONE\', {\'label\': \'0.32\', \'weight\': \'0.32\'}), (\'start\', \'WEB\', {\'label\': \'0.25\', \'weight\': \'0.25\'}), (\'EM\', \'DM\', {\'label\': \'0.43\', \'weight\': \'0.43\'}), (\'EM\', \'WEB\', {\'label\': \'0.41\', \'weight\': \'0.41\'}), (\'EM\', \'null\', {\'label\': \'0.09\', \'weight\': \'0.09\'}), (\'EM\', \'PHONE\', {\'label\': \'0.02\', \'weight\': \'0.02\'}), (\'EM\', \'conversion\', {\'label\': \'0.04\', \'weight\': \'0.04\'}), (\'WEB\', \'DM\', {\'label\': \'0.58\', \'weight\': \'0.58\'}), (\'WEB\', \'EM\', {\'label\': \'0.22\', \'weight\': \'0.22\'}), (\'WEB\', \'null\', {\'label\': \'0.12\', \'weight\': \'0.12\'}), (\'WEB\', \'PHONE\', {\'label\': \'0.03\', \'weight\': \'0.03\'}), (\'WEB\', \'conversion\', {\'label\': \'0.05\', \'weight\': \'0.05\'}), (\'PHONE\', \'WEB\', {\'label\': \'0.69\', \'weight\': \'0.69\'}), (\'PHONE\', \'DM\', {\'label\': \'0.18\', \'weight\': \'0.18\'}), (\'PHONE\', \'null\', {\'label\': \'0.01\', \'weight\': \'0.01\'}), (\'PHONE\', \'EM\', {\'label\': \'0.11\', \'weight\': \'0.11\'}), (\'PHONE\', \'conversion\', {\'label\': \'0.01\', \'weight\': \'0.01\'})])