电子游戏销售分析

Posted K同学啊

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了电子游戏销售分析相关的知识,希望对你有一定的参考价值。

🎮 电子游戏销售分析 📊

推荐专栏:《深度学习100例》

推荐专栏:《小白入门深度学习》

目录

import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from pandas_profiling import ProfileReport
import pandas.util.testing as tm
import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

# CSV inputs used by the notebook, and the name of the column that will hold
# the sales total for each row.
data_file_path = "vgsales.csv"
companie_region_path = "video-games-developers.csv"
total_sales_column = "Total_Sales"
# Colour palette shared by the charts in this notebook.
primary_blue = "#496595"
primary_blue2 = "#85a1c1"
primary_blue3 = "#3f4d63"
primary_grey = "#c6ccd8"
primary_black = "#202022"
primary_bgcolor = "#f4f0ea"

# Third entry of Plotly's qualitative "Plotly" colour sequence.
primary_green = px.colors.qualitative.Plotly[2]

# Use the palette background as the face colour of matplotlib axes.
plt.rcParams['axes.facecolor'] = primary_bgcolor

# Preview the whole palette as a strip of swatches.
colors = [
    primary_blue,
    primary_blue2,
    primary_blue3,
    primary_grey,
    primary_black,
    primary_bgcolor,
    primary_green,
]
palette = sns.color_palette(colors)
sns.palplot(palette)

一、加载数据 📚

# Read the core sales CSV and preview its first five rows.
data_df = pd.read_csv(data_file_path)
data_df.head(n=5)
RankNamePlatformYearGenrePublisherNA_SalesEU_SalesJP_SalesOther_SalesGlobal_Sales
01Wii SportsWii2006.0SportsNintendo41.4929.023.778.4682.74
12Super Mario Bros.NES1985.0PlatformNintendo29.083.586.810.7740.24
23Mario Kart WiiWii2008.0RacingNintendo15.8512.883.793.3135.82
34Wii Sports ResortWii2009.0SportsNintendo15.7511.013.282.9633.00
45Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo11.278.8910.221.0031.37
# Read the company/region CSV and preview its first five rows.
region_df = pd.read_csv(companie_region_path)
region_df.head(n=5)
DeveloperCityAdministrative divisionCountryEst.Notable games, series or franchisesNotes
00verflowTokyoNaNJapan1997School DaysSummer DaysCross DaysVisual Novel brand (both developer and publisher)
111 bit studiosWarsawMasovian VoivodeshipPoland2010FrostpunkIndie developer/publisher
21C CompanyMoscowNaNRussia1991King's Bounty: Warriors of the NorthGame localization. The game development subsid...
31-Up StudioTokyoNaNJapan2000Mother 3Subsidiary of Nintendo. Formed by former emplo...
42K CzechBrnoNaNCzech Republic1997MafiaMafia IIFormer subsidiary of 2K Games; previously know...
# Left-join the sales rows with the developer table, matching each row's
# Publisher against the Developer name, to attach the Country column.
# Rows whose publisher has no match keep NaN in Developer/Country.
# NOTE(review): if a Developer name appears more than once in region_df this
# join duplicates the matching sales rows - confirm the names are unique.
developer_country = region_df[['Developer', 'Country']]
df = data_df.merge(
    developer_country,
    how='left',
    left_on='Publisher',
    right_on='Developer',
)
df.head()
RankNamePlatformYearGenrePublisherNA_SalesEU_SalesJP_SalesOther_SalesGlobal_SalesDeveloperCountry
01Wii SportsWii2006.0SportsNintendo41.4929.023.778.4682.74NintendoJapan
12Super Mario Bros.NES1985.0PlatformNintendo29.083.586.810.7740.24NintendoJapan
23Mario Kart WiiWii2008.0RacingNintendo15.8512.883.793.3135.82NintendoJapan
34Wii Sports ResortWii2009.0SportsNintendo15.7511.013.282.9633.00NintendoJapan
45Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo11.278.8910.221.0031.37NintendoJapan
# Show the column names of the merged frame.
df.columns
Index(['Rank', 'Name', 'Platform', 'Year', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Developer',
       'Country'],
      dtype='object')

二、Pandas 分析 🔎

# Build a profiling report object for the merged frame. Displaying it is left
# commented out below.
vgames_profile = ProfileReport(df, title='Video Games Profile Report')
# vgames_profile

三、发行分析 📝

# Build the total-sales column. When the dataset has a Total_Shipped column,
# the total is Total_Shipped + Global_Sales with missing values counted as 0;
# otherwise Global_Sales is used as-is.
has_total_shipped = 'Total_Shipped' in df.columns
if not has_total_shipped:
    regions = ['NA', 'JP', 'EU', 'Other']
    region_sales_sufix = '_Sales'

    df[total_sales_column] = df['Global_Sales']
else:
    shipped_units = df['Total_Shipped'].fillna(0)
    sold_units = df['Global_Sales'].fillna(0)
    df[total_sales_column] = shipped_units + sold_units
# Working frame for the time-based analysis: only rows with a known release
# year, ordered chronologically.
# Careful: rows with a missing Year are dropped here. The alternative that was
# considered was to fill them with the mean year:
#     df['Year'].fillna(df['Year'].mean())
# The previous `df.copy()` was removed because its result was overwritten by
# the very next assignment; `sort_values` already returns a new DataFrame.
tdf = df[df['Year'].notna()].sort_values('Year', ascending=True)
# fig = px.histogram(
#     tdf,
#     x='Platform',
#     animation_frame='Year',
#     range_y=[0, 550],
# )
# fig.update_xaxes(type='category')
# fig.update_xaxes(categoryorder='category ascending')
# # fig.show()
# Number of entries per platform and year ('count' counts the non-null values
# of the total-sales column in each group).
top_tdf = tdf.groupby(['Platform', 'Year']).agg({total_sales_column: 'count'}).reset_index()
top_tdf.columns = ['Platform', 'Year', 'Count']

# Keep the years 2016-2019 only, then drop platform/year pairs that account
# for 1% or less of the entries in that window.
top_tdf = top_tdf[top_tdf['Year'].isin([2016, 2017, 2018, 2019])]
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice (avoids pandas' SettingWithCopyWarning).
top_tdf = top_tdf[top_tdf['Count'] > top_tdf['Count'].sum() * 0.01].copy()
# Year is stored as float (e.g. 2016.0); cast through int so the legend reads
# "2016" instead of "2016.0". Safe here: the isin filter leaves no NaN years.
top_tdf['Year'] = top_tdf['Year'].astype(int).astype(str)

fig = px.bar(
    top_tdf,
    x='Platform',
    y='Count',
    color='Year',
    barmode="group"
)
fig.update_layout(title="按平台发布的电子游戏总数")
# Treat platforms as categories and order them alphabetically.
fig.update_xaxes(type='category')
fig.update_xaxes(categoryorder='category ascending')
fig.show()

# Hand-picked list of platforms.
# NOTE(review): not referenced again in the visible part of this file.
platform_tops = ['PS4', 'PSV', 'XOne', 'PC']

四、售卖分析 💵

# Total sales per platform and year, ordered chronologically.
platform_tdf = (
    tdf.groupby(['Platform', 'Year'])
    .agg({total_sales_column: 'sum'})
    .reset_index()
    .sort_values('Year', ascending=True)
)
platform_tdf.head()
PlatformYearTotal_Sales
026001980.011.38
126001981.035.77
226001982.028.86
326001983.05.83
79NES1983.010.96
# fig = px.bar(
#     platform_tdf,
#     x='Platform',
#     y=total_sales_column,
#     animation_frame='Year',
#     range_y=[0, 150],
# )
# fig.update_xaxes(type='category')
# fig.update_xaxes(categoryorder='category ascending')
# fig.show()
# Sales per platform for the years 2016-2019, keeping only platform/year pairs
# that contribute more than 0.5% of the sales in that window.
platform_top_tdf = platform_tdf[platform_tdf['Year'].isin([2016, 2017, 2018, 2019])]
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice (avoids pandas' SettingWithCopyWarning).
platform_top_tdf = platform_top_tdf[
    platform_top_tdf[total_sales_column] > platform_top_tdf[total_sales_column].sum() * 0.005
].copy()
# Year is stored as float (e.g. 2016.0); cast through int so the legend reads
# "2016" instead of "2016.0". Safe here: the isin filter leaves no NaN years.
platform_top_tdf['Year'] = platform_top_tdf['Year'].astype(int).astype(str)

fig = px.bar(
    platform_top_tdf,
    x='Platform',
    y=total_sales_column,
    color='Year',
    barmode="group"
)
fig.update_layout(title="Total sales by platforms (Millions)")
# Treat platforms as categories and order them alphabetically.
fig.update_xaxes(type='category')
fig.update_xaxes(categoryorder='category ascending')
fig.show()

汇总销售分析

# All-time sales per platform, keeping only platforms that hold more than 3%
# of the overall total.
platform_sum_tdf = (
    platform_tdf.groupby(['Platform'])
    .agg({total_sales_column: 'sum'})
    .reset_index()
)
platform_min_total = platform_sum_tdf[total_sales_column].sum() * 0.03
platform_sum_tdf = platform_sum_tdf[platform_sum_tdf[total_sales_column] > platform_min_total]

fig = px.bar(platform_sum_tdf, x='Platform', y=total_sales_column)
fig.update_layout(title="Total sales of all time in the most important platforms (Millions)")
# Treat platforms as categories and order them alphabetically.
fig.update_xaxes(type='category', categoryorder='category ascending')
fig.show()

# Per platform and year: the summed sales and the number of entries.
# NOTE(review): this frame is not used again in the visible part of the file.
platform_tmp_tdf = (
    tdf.groupby(['Platform', 'Year'])
    .agg({total_sales_column: ['sum', 'count']})
)
# Here we can check (the original note was left unfinished)

五、销售分布 📈

# Preview the merged frame, which now also carries the total-sales column.
df.head()
RankNamePlatformYearGenrePublisherNA_SalesEU_SalesJP_SalesOther_SalesGlobal_SalesDeveloperCountryTotal_Sales
01Wii SportsWii2006.0SportsNintendo41.4929.023.778.4682.74NintendoJapan82.74
12Super Mario Bros.NES1985.0PlatformNintendo29.083.586.810.7740.24NintendoJapan40.24
23Mario Kart WiiWii2008.0RacingNintendo15.8512.883.793.3135.82NintendoJapan35.82
34Wii Sports ResortWii2009.0SportsNintendo15.7511.013.282.9633.00NintendoJapan33.00
45Pokemon Red/Pokemon BlueGB1996.0Role-PlayingNintendo11.278.8910.221.0031.37NintendoJapan31.37
# Region prefixes of the per-region sales columns. Datasets with a
# Total_Shipped column use 'PAL' where the others use 'EU'.
third_region = 'PAL' if 'Total_Shipped' in df.columns else 'EU'
regions = ['NA', 'JP', third_region, 'Other']

region_sales_sufix = '_Sales'

# Aggregation spec: sum every regional sales column plus the total column.
regions_agg = {prefix + region_sales_sufix: 'sum' for prefix in regions}
regions_agg[total_sales_column] = 'sum'
regions_agg
{'NA_Sales': 'sum',
 'JP_Sales': 'sum',
 'EU_Sales': 'sum',
 'Other_Sales': 'sum',
 'Total_Sales': 'sum'}
# Yearly sums of every regional sales column and the total, in year order.
geo_tdf = (
    tdf.groupby(['Year'])
    .agg(regions_agg)
    .reset_index()
    .sort_values('Year', ascending=True)
)
geo_tdf.head(10)
YearNA_SalesJP_SalesEU_SalesOther_SalesTotal_Sales
01980.010.590.000.670.1211.38
11981.033.400.001.960.3235.77
21982.026.920.001.650.3128.86
31983.07.768.100.800.1416.79
41984.033.2814.272.100.7050.36
51985.033.7314.564.740.9253.94
61986.012.5019.812.841.9337.07
71987.08.4611.631.410.2021.74
81988.023.8715.766.590.9947.22
91989.045.1518.368.441.5073.45
# One line per region showing its summed sales for each year.
region_traces = [
    go.Scatter(
        x=geo_tdf['Year'],
        y=geo_tdf[region_name + region_sales_sufix],
        mode='lines',
        name=region_name,
    )
    for region_name in regions
]
fig = go.Figure(data=region_traces)
fig.update_layout(title="Total sales per year by region (Millions)")
# Years are plotted as discrete categories rather than a numeric axis.
fig.update_xaxes(type='category')
fig.show()

# # Thanks to @amritachatterjee09 for this cool function

# year_geo_df = tdf[["Year",'NA_Sales','EU_Sales','JP_Sales','Other_Sales']]

# year_geo_df[['NA_mean','EU_mean','JP_mean','Other_mean']] = year_geo_df.groupby('Year')[['NA_Sales','EU_Sales','JP_Sales','Other_Sales']].transform('sum')
# year_geo_df = year_geo_df.drop(['NA_Sales','EU_Sales','JP_Sales','Other_Sales'], axis=1)
# year_geo_df = year_geo_df.drop_duplicates()
# year_geo_df = year_geo_df.sort_values("Year")

# temp_df1 = pd.DataFrame({'Place': ['NA_Sales']*year_geo_df.shape[0], 'Year':year_geo_df['Year'], 'Sales': year_geo_df['NA_mean']})
# temp_df2 = pd.DataFrame({'Place': ['EU_Sales']*year_geo_df.shape[0], 'Year': year_geo_df['Year'], 'Sales': year_geo_df['EU_mean']})
# temp_df3 = pd.DataFrame({'Place': ['JP_Sales']*year_geo_df.shape[0], 'Year': year_geo_df['Year'], 'Sales': year_geo_df['JP_mean']})
# temp_df4 = pd.DataFrame({'Place': ['Other_Sales']*year_geo_df.shape[0], 'Year': year_geo_df['Year'], 'Sales': year_geo_df['Other_mean']})

# final = pd.concat([temp_df1,temp_df2,temp_df3,temp_df4], axis=0)
# final = final.sort_values("Year")

# fig=px.bar(
#     final,
#     x='Place', 
#     y="Sales", 
#     animation_frame="Year",
#     animation_group="Place", 
#     color="Place", 
#     hover_name="Place",
#     range_y=[0, 200]
# )
# fig.update_layout(title="Year sales distribution by region",title_x=0.5)

# fig.show()

六、销售额分布 🙅

# Regional and total sales summed per genre, best-selling genre first.
genre_tdf = (
    tdf.groupby(['Genre'])
    .agg(regions_agg)
    .sort_values(total_sales_column, ascending=False)
)
genre_tdf.head()
NA_SalesJP_SalesEU_SalesOther_SalesTotal_Sales
Genre
Action861.80158.66516.48184.921722.88
Sports670.09134.76371.34132.651309.24
Shooter575.1638.18310.45101.901026.20
Role-Playing326.50350.29187.5859.38923.84
Platform445.99130.65200.6751.51829.15

按类型和地区划分的销售额分布

# Heatmap of sales with regions as rows and genres as columns. The total
# column is dropped so only the regional columns are shown.
# `columns=` replaces the positional axis argument (`drop(col, 1)`), which
# pandas 2.0 no longer accepts.
fig = px.imshow(genre_tdf.drop(columns=total_sales_column).T)
fig.update_layout(title="Sales distribution by genre and region (Millions)")
fig.show()

近四年

# Same per-genre aggregation, restricted to releases from 2016 to 2019.
recent_years = [2016, 2017, 2018, 2019]
genre_last_tdf = (
    tdf[tdf['Year'].isin(recent_years)]
    .groupby(['Genre'])
    .agg(regions_agg)
    .sort_values(total_sales_column, ascending=False)
)
genre_last_tdf.head()
NA_SalesJP_SalesEU_SalesOther_SalesTotal_Sales
Genre
Action5.875.806.361.8319.92
Shooter7.440.617.702.4218.22
Sports4.570.787.361.9214.60
Role-Playing1.393.671.290.446.80
Fighting1.600.641.150.463.86
# Heatmap of 2016-2019 sales with regions as rows and genres as columns. The
# total column is dropped so only the regional columns are shown.
# `columns=` replaces the positional axis argument (`drop(col, 1)`), which
# pandas 2.0 no longer accepts.
fig = px.imshow(genre_last_tdf.drop(columns=total_sales_column).T)
fig.update_layout(title="Sales distribution by genre and region (Millions) last four years")
fig.show()

# Reorder the per-genre frame by total sales (descending) for the line plot.
genre_total_tdf = genre_tdf.reset_index().sort_values(total_sales_column, ascending=False)

genre_total_line = go.Scatter(
    x=genre_total_tdf['Genre'],
    y=genre_total_tdf[total_sales_column],
    mode='lines+markers',
)
fig = go.Figure(data=[genre_total_line])
fig.update_layout(title="Total sales by genre (Millions)")
fig.update_xaxes(type='category')
# fig.update_xaxes(categoryorder='total descending')
fig.show()

# Genres whose total sales exceed 3% of the sales across all genres.
genre_min_total = genre_total_tdf[total_sales_column].sum() * 0.03
is_top_genre = genre_total_tdf[total_sales_column] > genre_min_total
genre_tops = genre_total_tdf.loc[is_top_genre, 'Genre'].tolist()
genre_tops
['Action',
 'Sports',
 'Shooter',
 'Role-Playing',
 'Platform',
 'Misc',
 'Racing',
 'Fighting',
 'Simulation']
# Rows belonging to the top genres only.
genre_tops_df = tdf[tdf['Genre'].isin(genre_tops)]

# Donut chart of each top genre's share of total sales.
# The title and label mapping previously came from an unrelated example
# ('Population of European continent', 'lifeExp'); they now describe this data.
fig = px.pie(genre_tops_df,
             values=total_sales_column,
             names='Genre',
             title='Share of total sales by genre (top genres)',
             hover_data=['Genre'],
             labels={'Genre': 'Video Games Genres'},
             hole=0.3,
            )
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

# Aggregate to one row per genre before plotting. `pull` is matched to the
# labels by position, so passing one label per game row (as before) left the
# nine pull offsets tied to the first nine rows rather than the nine genres.
genre_share = (
    genre_tops_df.groupby('Genre')[total_sales_column]
    .sum()
    .sort_values(ascending=False)
)

fig = go.Figure()
fig.add_trace(go.Pie(
    labels=genre_share.index,
    values=genre_share.values,
    # Offsets follow the descending-sales order of genre_share.
    # NOTE(review): the list has nine entries - confirm it still matches the
    # number of top genres if the 3% threshold or the dataset changes.
    pull=[0, 0, 0.1, 0.05, 0, 0, 0.05, 0, 0.05],
))
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Percent of sales by Genre")
fig.show()

# Reorder the 2016-2019 per-genre frame by total sales (descending) for the
# line plot.
genre_last_total_tdf = genre_last_tdf.reset_index().sort_values(total_sales_column, ascending=False)
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=genre_last_total_tdf['Genre'],
    y=genre_last_total_tdf[total_sales_column],
    mode='lines+markers',
))
# The title now names the time window; it was previously identical to the
# all-time chart's title, which made the two figures indistinguishable.
fig.update_layout(title="Total sales by genre (Millions) last four years")
fig.update_xaxes(type='category')
# fig.update_xaxes(categoryorder='total descending')
fig.show()

七、ESRB 评级的销售分布 🔞

# Only datasets that carry an ESRB_Rating column get this section: total sales
# per rating, shown as a bar chart.
if 'ESRB_Rating' in df.columns:
    esrb_tdf = (
        tdf.groupby('ESRB_Rating')
        .agg({total_sales_column: 'sum'})
        .reset_index()
    )
    esrb_tdf.head(10)

    fig = px.bar(esrb_tdf, x='ESRB_Rating', y=total_sales_column)
    fig.show()
if 'ESRB_Rating' in df.columns

以上是关于电子游戏销售分析的主要内容,如果未能解决你的问题,请参考以下文章:

‘kaggle视频游戏销售数据的可视化和分析’项目实现

数据分析实战 | 探寻销售额下降的原因

数据分析实战 | 探寻销售额下降的原因

数据分析实战 | 探寻销售额下降的原因

PS4游戏销售榜公布 《GTA5》排名第一

双11,用Python爬取4000多条笔记本电脑的销售数据并分析