🎮 电子游戏销售分析 📊
推荐专栏:《深度学习100例》
推荐专栏:《小白入门深度学习》
目录
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from pandas_profiling import ProfileReport
import pandas.util.testing as tm
import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Input CSV locations: the sales dataset and the developer lookup table.
data_file_path, companie_region_path = "vgsales.csv", "video-games-developers.csv"
# Name of the derived column that stores the total sales figure.
total_sales_column = "Total_Sales"

# Colour palette constants.
primary_blue, primary_blue2, primary_blue3 = "#496595", "#85a1c1", "#3f4d63"
primary_grey, primary_black, primary_bgcolor = "#c6ccd8", "#202022", "#f4f0ea"
primary_green = px.colors.qualitative.Plotly[2]

# Use the background colour as the face colour of matplotlib axes.
plt.rcParams['axes.facecolor'] = primary_bgcolor

colors = [
    primary_blue,
    primary_blue2,
    primary_blue3,
    primary_grey,
    primary_black,
    primary_bgcolor,
    primary_green,
]
# Render the palette as a strip of swatches.
sns.palplot(sns.color_palette(colors))
一、加载数据 📚
# Read the sales CSV and preview its first five rows.
data_df = pd.read_csv(filepath_or_buffer=data_file_path)
data_df.head(n=5)
| Rank | Name | Platform | Year | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales |
---|
0 | 1 | Wii Sports | Wii | 2006.0 | Sports | Nintendo | 41.49 | 29.02 | 3.77 | 8.46 | 82.74 |
---|
1 | 2 | Super Mario Bros. | NES | 1985.0 | Platform | Nintendo | 29.08 | 3.58 | 6.81 | 0.77 | 40.24 |
---|
2 | 3 | Mario Kart Wii | Wii | 2008.0 | Racing | Nintendo | 15.85 | 12.88 | 3.79 | 3.31 | 35.82 |
---|
3 | 4 | Wii Sports Resort | Wii | 2009.0 | Sports | Nintendo | 15.75 | 11.01 | 3.28 | 2.96 | 33.00 |
---|
4 | 5 | Pokemon Red/Pokemon Blue | GB | 1996.0 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 |
---|
# Read the developer lookup CSV and preview its first five rows.
region_df = pd.read_csv(filepath_or_buffer=companie_region_path)
region_df.head(n=5)
| Developer | City | Administrative division | Country | Est. | Notable games, series or franchises | Notes |
---|
0 | 0verflow | Tokyo | NaN | Japan | 1997 | School DaysSummer DaysCross Days | Visual Novel brand (both developer and publisher) |
---|
1 | 11 bit studios | Warsaw | Masovian Voivodeship | Poland | 2010 | Frostpunk | Indie developer/publisher |
---|
2 | 1C Company | Moscow | NaN | Russia | 1991 | King's Bounty: Warriors of the North | Game localization. The game development subsid... |
---|
3 | 1-Up Studio | Tokyo | NaN | Japan | 2000 | Mother 3 | Subsidiary of Nintendo. Formed by former emplo... |
---|
4 | 2K Czech | Brno | NaN | Czech Republic | 1997 | MafiaMafia II | Former subsidiary of 2K Games; previously know... |
---|
# Left-join the developer's country onto each game by matching the game's
# Publisher against the lookup table's Developer name; games without a match
# keep NaN in the added columns.
# NOTE(review): if a Developer name appears more than once in region_df, the
# join will duplicate the matching game rows — confirm the lookup is unique.
df = data_df.merge(
    right=region_df[['Developer', 'Country']],
    how='left',
    left_on='Publisher',
    right_on='Developer',
)
df.head(n=5)
| Rank | Name | Platform | Year | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Developer | Country |
---|
0 | 1 | Wii Sports | Wii | 2006.0 | Sports | Nintendo | 41.49 | 29.02 | 3.77 | 8.46 | 82.74 | Nintendo | Japan |
---|
1 | 2 | Super Mario Bros. | NES | 1985.0 | Platform | Nintendo | 29.08 | 3.58 | 6.81 | 0.77 | 40.24 | Nintendo | Japan |
---|
2 | 3 | Mario Kart Wii | Wii | 2008.0 | Racing | Nintendo | 15.85 | 12.88 | 3.79 | 3.31 | 35.82 | Nintendo | Japan |
---|
3 | 4 | Wii Sports Resort | Wii | 2009.0 | Sports | Nintendo | 15.75 | 11.01 | 3.28 | 2.96 | 33.00 | Nintendo | Japan |
---|
4 | 5 | Pokemon Red/Pokemon Blue | GB | 1996.0 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 | Nintendo | Japan |
---|
# Display the column labels of the merged frame.
df.columns
Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Developer',
'Country'],
dtype='object')
二、Pandas 分析 🔎
# Build a pandas-profiling report object for the merged frame. The report is
# only assigned here; nothing in this cell renders or exports it.
vgames_profile = ProfileReport(df, title='Video Games Profile Report')
三、发行分析 📝
# Create the unified total-sales column.
# When the dataset has a 'Total_Shipped' column, total = shipped + global sales
# (missing values counted as 0). Otherwise the total is simply 'Global_Sales',
# and the region names / column suffix used by later cells are set here.
if 'Total_Shipped' in df.columns:
    df[total_sales_column] = df['Total_Shipped'].fillna(0) + df['Global_Sales'].fillna(0)
else:
    regions = ['NA', 'JP', 'EU', 'Other']
    region_sales_sufix = '_Sales'
    # Must stay inside the else branch: at top level it would overwrite the
    # shipped + global sum computed above.
    df[total_sales_column] = df['Global_Sales']
# Keep only rows with a known release year, ordered chronologically.
# (The former `tdf = df.copy()` was overwritten on the next line and is dropped.)
tdf = df[df['Year'].notna()]
tdf = tdf.sort_values('Year', ascending=True)

# Number of rows per (platform, year); 'count' counts non-null total-sales values.
top_tdf = tdf.groupby(['Platform', 'Year']).agg({total_sales_column: 'count'}).reset_index()
top_tdf.columns = ['Platform', 'Year', 'Count']
top_tdf = top_tdf[top_tdf['Year'].isin([2016, 2017, 2018, 2019])]
# Drop platform/year pairs holding no more than 1% of the releases in that
# window. .copy() makes the result an independent frame so the column
# assignment below does not raise SettingWithCopyWarning.
top_tdf = top_tdf[top_tdf['Count'] > top_tdf['Count'].sum() * 0.01].copy()
# Year as a string so plotly treats it as a discrete colour group.
top_tdf['Year'] = top_tdf['Year'].astype(str)

fig = px.bar(
    top_tdf,
    x='Platform',
    y='Count',
    color='Year',
    barmode="group",
)
fig.update_layout(title="按平台发布的电子游戏总数")
fig.update_xaxes(type='category')
fig.update_xaxes(categoryorder='category ascending')
fig.show()

platform_tops = ['PS4', 'PSV', 'XOne', 'PC']
四、售卖分析 💵
# Total sales per (platform, year), ordered by year.
platform_tdf = (
    tdf.groupby(['Platform', 'Year'])
    .agg({total_sales_column: 'sum'})
    .reset_index()
    .sort_values(by='Year', ascending=True)
)
platform_tdf.head(n=5)
| Platform | Year | Total_Sales |
---|
0 | 2600 | 1980.0 | 11.38 |
---|
1 | 2600 | 1981.0 | 35.77 |
---|
2 | 2600 | 1982.0 | 28.86 |
---|
3 | 2600 | 1983.0 | 5.83 |
---|
79 | NES | 1983.0 | 10.96 |
---|
# Restrict to 2016-2019, then drop platform/year pairs whose sales are no more
# than 0.5% of the sales in that window.
platform_top_tdf = platform_tdf[platform_tdf['Year'].isin([2016, 2017, 2018, 2019])]
# .copy() makes the filtered result an independent frame so the column
# assignment below does not raise SettingWithCopyWarning.
platform_top_tdf = platform_top_tdf[
    platform_top_tdf[total_sales_column] > platform_top_tdf[total_sales_column].sum() * 0.005
].copy()
# Year as a string so plotly treats it as a discrete colour group.
platform_top_tdf['Year'] = platform_top_tdf['Year'].astype(str)

fig = px.bar(
    platform_top_tdf,
    x='Platform',
    y=total_sales_column,
    color='Year',
    barmode="group",
)
fig.update_layout(title="Total sales by platforms (Millions)")
fig.update_xaxes(type='category')
fig.update_xaxes(categoryorder='category ascending')
fig.show()
汇总销售分析
# Sum sales over all years for each platform.
platform_sum_tdf = (
    platform_tdf.groupby(['Platform'])
    .agg({total_sales_column: 'sum'})
    .reset_index()
)
# Keep only platforms holding more than 3% of the summed sales.
platform_sum_tdf = platform_sum_tdf.loc[
    platform_sum_tdf[total_sales_column] > 0.03 * platform_sum_tdf[total_sales_column].sum()
]

fig = px.bar(data_frame=platform_sum_tdf, x='Platform', y=total_sales_column)
fig.update_layout(title="Total sales of all time in the most important platforms (Millions)")
fig.update_xaxes(type='category')
fig.update_xaxes(categoryorder='category ascending')
fig.show()

# Per (platform, year) sum and row count of total sales.
platform_tmp_tdf = tdf.groupby(['Platform', 'Year']).agg(
    {total_sales_column: ['sum', 'count']}
)
五、销售分布 📈
# Preview the merged frame, which now includes the total-sales column.
df.head()
| Rank | Name | Platform | Year | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | Developer | Country | Total_Sales |
---|
0 | 1 | Wii Sports | Wii | 2006.0 | Sports | Nintendo | 41.49 | 29.02 | 3.77 | 8.46 | 82.74 | Nintendo | Japan | 82.74 |
---|
1 | 2 | Super Mario Bros. | NES | 1985.0 | Platform | Nintendo | 29.08 | 3.58 | 6.81 | 0.77 | 40.24 | Nintendo | Japan | 40.24 |
---|
2 | 3 | Mario Kart Wii | Wii | 2008.0 | Racing | Nintendo | 15.85 | 12.88 | 3.79 | 3.31 | 35.82 | Nintendo | Japan | 35.82 |
---|
3 | 4 | Wii Sports Resort | Wii | 2009.0 | Sports | Nintendo | 15.75 | 11.01 | 3.28 | 2.96 | 33.00 | Nintendo | Japan | 33.00 |
---|
4 | 5 | Pokemon Red/Pokemon Blue | GB | 1996.0 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 | Nintendo | Japan | 31.37 |
---|
# Region prefixes of the per-region sales columns; datasets that carry
# 'Total_Shipped' use 'PAL' where the other layout uses 'EU'.
if 'Total_Shipped' in df.columns:
    regions = ['NA', 'JP', 'PAL', 'Other']
else:
    regions = ['NA', 'JP', 'EU', 'Other']
region_sales_sufix = '_Sales'

# Aggregation spec: sum every regional sales column...
regions_agg = {}
for region in regions:
    regions_agg[region + region_sales_sufix] = 'sum'
# ...and, after the loop, the total-sales column so it is the last key.
regions_agg[total_sales_column] = 'sum'
regions_agg
{'NA_Sales': 'sum',
'JP_Sales': 'sum',
'EU_Sales': 'sum',
'Other_Sales': 'sum',
'Total_Sales': 'sum'}
# Yearly sums of every regional sales column and the total, ordered by year.
geo_tdf = (
    tdf.groupby(['Year'])
    .agg(regions_agg)
    .reset_index()
    .sort_values(by='Year', ascending=True)
)
geo_tdf.head(n=10)
| Year | NA_Sales | JP_Sales | EU_Sales | Other_Sales | Total_Sales |
---|
0 | 1980.0 | 10.59 | 0.00 | 0.67 | 0.12 | 11.38 |
---|
1 | 1981.0 | 33.40 | 0.00 | 1.96 | 0.32 | 35.77 |
---|
2 | 1982.0 | 26.92 | 0.00 | 1.65 | 0.31 | 28.86 |
---|
3 | 1983.0 | 7.76 | 8.10 | 0.80 | 0.14 | 16.79 |
---|
4 | 1984.0 | 33.28 | 14.27 | 2.10 | 0.70 | 50.36 |
---|
5 | 1985.0 | 33.73 | 14.56 | 4.74 | 0.92 | 53.94 |
---|
6 | 1986.0 | 12.50 | 19.81 | 2.84 | 1.93 | 37.07 |
---|
7 | 1987.0 | 8.46 | 11.63 | 1.41 | 0.20 | 21.74 |
---|
8 | 1988.0 | 23.87 | 15.76 | 6.59 | 0.99 | 47.22 |
---|
9 | 1989.0 | 45.15 | 18.36 | 8.44 | 1.50 | 73.45 |
---|
# Line chart with one trace per region: yearly sales for that region.
fig = go.Figure()
for region in regions:
    fig.add_trace(go.Scatter(
        x=geo_tdf['Year'],
        y=geo_tdf[region + region_sales_sufix],
        mode='lines',
        name=region,
    ))
fig.update_layout(title="Total sales per year by region (Millions)")
fig.update_xaxes(type='category')
fig.show()
六、销售额分布 🙅
# Regional and total sales summed per genre, largest total first.
genre_tdf = (
    tdf.groupby(['Genre'])
    .agg(regions_agg)
    .sort_values(by=total_sales_column, ascending=False)
)
genre_tdf.head(n=5)
| NA_Sales | JP_Sales | EU_Sales | Other_Sales | Total_Sales |
---|
Genre | | | | | |
---|
Action | 861.80 | 158.66 | 516.48 | 184.92 | 1722.88 |
---|
Sports | 670.09 | 134.76 | 371.34 | 132.65 | 1309.24 |
---|
Shooter | 575.16 | 38.18 | 310.45 | 101.90 | 1026.20 |
---|
Role-Playing | 326.50 | 350.29 | 187.58 | 59.38 | 923.84 |
---|
Platform | 445.99 | 130.65 | 200.67 | 51.51 | 829.15 |
---|
按类型和地区划分的销售额分布
# Heatmap of regional sales per genre: the total column is dropped and the
# frame transposed so regions are rows and genres are columns.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.0 no longer accepts.
fig = px.imshow(genre_tdf.drop(columns=total_sales_column).T)
fig.update_layout(title="Sales distribution by genre and region (Millions)")
fig.show()
近四年
# Same per-genre aggregation, restricted to games released in 2016-2019.
genre_last_tdf = (
    tdf.loc[tdf['Year'].isin([2016, 2017, 2018, 2019])]
    .groupby(['Genre'])
    .agg(regions_agg)
    .sort_values(by=total_sales_column, ascending=False)
)
genre_last_tdf.head(n=5)
| NA_Sales | JP_Sales | EU_Sales | Other_Sales | Total_Sales |
---|
Genre | | | | | |
---|
Action | 5.87 | 5.80 | 6.36 | 1.83 | 19.92 |
---|
Shooter | 7.44 | 0.61 | 7.70 | 2.42 | 18.22 |
---|
Sports | 4.57 | 0.78 | 7.36 | 1.92 | 14.60 |
---|
Role-Playing | 1.39 | 3.67 | 1.29 | 0.44 | 6.80 |
---|
Fighting | 1.60 | 0.64 | 1.15 | 0.46 | 3.86 |
---|
# Heatmap of regional sales per genre for 2016-2019: the total column is
# dropped and the frame transposed so regions are rows and genres are columns.
# `columns=` replaces the positional axis argument (`drop(label, 1)`), which
# pandas 2.0 no longer accepts.
fig = px.imshow(genre_last_tdf.drop(columns=total_sales_column).T)
fig.update_layout(title="Sales distribution by genre and region (Millions) last four years")
fig.show()
# Flatten the per-genre totals (Genre becomes a column), largest total first.
genre_total_tdf = genre_tdf.reset_index().sort_values(by=total_sales_column, ascending=False)

# Line-and-marker chart of total sales per genre.
fig = go.Figure(
    data=go.Scatter(
        x=genre_total_tdf['Genre'],
        y=genre_total_tdf[total_sales_column],
        mode='lines+markers',
    )
)
fig.update_layout(title="Total sales by genre (Millions)")
fig.update_xaxes(type='category')
fig.show()

# Genres whose total exceeds 3% of the summed total, in descending order.
genre_tops = genre_total_tdf.loc[
    genre_total_tdf[total_sales_column] > 0.03 * genre_total_tdf[total_sales_column].sum(),
    'Genre',
].tolist()
genre_tops
['Action',
'Sports',
'Shooter',
'Role-Playing',
'Platform',
'Misc',
'Racing',
'Fighting',
'Simulation']
# Rows belonging to the top genres only.
genre_tops_df = tdf[tdf['Genre'].isin(genre_tops)]

# Donut chart of total sales split by genre.
# The title and the `labels` key were left over from the Plotly documentation
# example ('Population of European continent', 'lifeExp'); they now describe
# this data and relabel a column that actually exists.
fig = px.pie(
    genre_tops_df,
    values=total_sales_column,
    names='Genre',
    title='Percent of sales by Genre',
    hover_data=['Genre'],
    labels={'Genre': 'Video Games Genres'},
    hole=0.3,
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()
# Pie chart of total sales by genre, built from row-level labels and values,
# with some slices pulled out from the centre.
# NOTE(review): `pull` has one entry per top genre, but labels/values are
# passed per row — confirm the offsets land on the intended slices.
fig = go.Figure(
    data=go.Pie(
        labels=genre_tops_df['Genre'],
        values=genre_tops_df[total_sales_column],
        pull=[0, 0, 0.1, 0.05, 0, 0, 0.05, 0, 0.05],
    )
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Percent of sales by Genre")
fig.show()
# Flatten the 2016-2019 per-genre totals (Genre becomes a column), largest first.
genre_last_total_tdf = genre_last_tdf.reset_index().sort_values(total_sales_column, ascending=False)

# Line-and-marker chart of total sales per genre for 2016-2019.
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=genre_last_total_tdf['Genre'],
    y=genre_last_total_tdf[total_sales_column],
    mode='lines+markers',
))
# The title now names the period, so this chart is distinguishable from the
# all-time "Total sales by genre (Millions)" chart.
fig.update_layout(title="Total sales by genre (Millions) last four years")
fig.update_xaxes(type='category')
fig.show()
七、ESRB 评级的销售分布 🔞
# Total sales per ESRB rating; skipped when the dataset has no rating column.
if 'ESRB_Rating' in df.columns:
    esrb_tdf = tdf.groupby('ESRB_Rating').agg({total_sales_column: 'sum'}).reset_index()
    esrb_tdf.head(10)

# Bar chart of the per-rating totals, guarded the same way because esrb_tdf
# only exists when the rating column does.
if 'ESRB_Rating' in df.columns:
    fig = px.bar(esrb_tdf, x='ESRB_Rating', y=total_sales_column)
    fig.show()
if 'ESRB_Rating' in df.columns
以上是关于电子游戏销售分析的主要内容,如果未能解决你的问题,请参考以下文章
‘kaggle视频游戏销售数据的可视化和分析’项目实现
数据分析实战 | 探寻销售额下降的原因
数据分析实战 | 探寻销售额下降的原因
数据分析实战 | 探寻销售额下降的原因
PS4游戏销售榜公布 《GTA5》排名第一
双11,用Python爬取4000多条笔记本电脑的销售数据并分析