在拿到数据进行建模预测之前,先对数据之间的隐藏关系进行窥探,俗称EDA
import pandas as pd import numpy as np import matplotlib.pyplot as plt import re train = pd.read_csv(‘D:/train_1.csv‘).fillna(0) # print(train.head()) for col in train.columns[1:]: train[col] = pd.to_numeric(train[col],downcast=‘integer‘) # print(train) def get_language(page): res = re.search(‘[a-z][a-z].wikipedia.org‘,page) #print (res.group()[0:2]) if res: return res.group()[0:2] return ‘na‘ train[‘lang‘] = train.Page.map(get_language) # print(train.head()) from collections import Counter print(Counter(train.lang)) lang_sets = {} lang_sets[‘en‘] = train[train.lang==‘en‘].iloc[:,0:-1] lang_sets[‘ja‘] = train[train.lang==‘ja‘].iloc[:,0:-1] lang_sets[‘de‘] = train[train.lang==‘de‘].iloc[:,0:-1] lang_sets[‘na‘] = train[train.lang==‘na‘].iloc[:,0:-1] lang_sets[‘fr‘] = train[train.lang==‘fr‘].iloc[:,0:-1] lang_sets[‘zh‘] = train[train.lang==‘zh‘].iloc[:,0:-1] lang_sets[‘ru‘] = train[train.lang==‘ru‘].iloc[:,0:-1] lang_sets[‘es‘] = train[train.lang==‘es‘].iloc[:,0:-1] sums = {} for key in lang_sets: #将每列的点击数相加,求平均数 sums[key] = lang_sets[key].iloc[:, 1:].sum(axis=0) / lang_sets[key].shape[0] days = [r for r in range(sums[‘en‘].shape[0])] fig = plt.figure(1, figsize=[10, 10]) plt.ylabel(‘Views per Page‘) plt.xlabel(‘Day‘) plt.title(‘Pages in Different Languages‘) labels = {‘en‘: ‘English‘, ‘ja‘: ‘Japanese‘, ‘de‘: ‘German‘, ‘na‘: ‘Media‘, ‘fr‘: ‘French‘, ‘zh‘: ‘Chinese‘, ‘ru‘: ‘Russian‘, ‘es‘: ‘Spanish‘ } #描绘每种语言在同样的时间段中每天的点击量 for key in sums: plt.plot(days, sums[key], label=labels[key]) plt.legend() plt.show() #画每个词条这段时间的点击量 def plot_entry(key, idx): data = lang_sets[key].iloc[idx, 1:] fig = plt.figure(1, figsize=(10, 5)) plt.plot(days, data) plt.xlabel(‘day‘) plt.ylabel(‘views‘) plt.title(train.iloc[lang_sets[key].index[idx], 0]) plt.show() idx = [1, 5, 10, 50, 100, 250,500, 750,1000,1500,2000,3000,4000,5000] for i in idx: plot_entry(‘en‘,i) npages = 5 top_pages = {} for key in lang_sets: print(key) sum_set = pd.DataFrame(lang_sets[key][[‘Page‘]]) sum_set[‘total‘] = lang_sets[key].sum(axis=1) sum_set = sum_set.sort_values(‘total‘,ascending=False) print(sum_set.head(10)) top_pages[key] = sum_set.index[0] print(‘\n\n‘) #每种语言前5名页面的点击量 for key in top_pages: fig = plt.figure(1,figsize=(10,5)) cols = train.columns cols = cols[1:-1] data = train.loc[top_pages[key],cols] plt.plot(days,data) plt.xlabel(‘Days‘) plt.ylabel(‘Views‘) plt.title(train.loc[top_pages[key],‘Page‘]) plt.show()