天池二手车_特征工程

Posted 2020-12-16 cgmcoding

tags:

篇首语：本文由小常识网(cha138.com)小编为大家整理，主要介绍了天池二手车_特征工程相关的知识，希望对你有一定的参考价值。

前面已经完成了类别特征和连续特征的分析，本文接着进行特征工程。

导入数据

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load the training and test sets (the files are space-separated).
# Raw strings keep the Windows backslashes literal; in a normal string the
# "\u" in "\used_car..." is an invalid unicode escape and a syntax error.
train_data = pd.read_csv(r'F:\python\天池_二手车交易价格预测\used_car_train_20200313.csv', sep=' ')
test_data = pd.read_csv(r'F:\python\天池_二手车交易价格预测\used_car_testB_20200421.csv', sep=' ')

删除异常值

#异常值处理
def out_proc(data, col_name, scale=3):
    """Drop box-plot outliers of one column and plot the before/after result.

    A value counts as an outlier when it lies more than ``scale`` times the
    interquartile range below the 0.25 quantile or above the 0.75 quantile.
    Missing values never satisfy either comparison, so they are kept.

    Args:
        data: DataFrame to clean; it is not modified.
        col_name: Name of the column to inspect.
        scale: Multiplier applied to the interquartile range (default 3).

    Returns:
        A new DataFrame without the outlier rows, re-indexed from 0.

    Side effects:
        Prints the number of removed rows, the number of remaining rows and
        ``describe()`` summaries of the low and high outliers, then draws two
        box plots (original column on the left, cleaned column on the right).
    """

    def box_plot_out(data_ser, box_scale):
        """Return ((low_mask, high_mask), (low_bound, high_bound)) for a pd.Series."""
        q1 = data_ser.quantile(0.25)
        q3 = data_ser.quantile(0.75)
        whisker = box_scale * (q3 - q1)  # scaled interquartile range
        val_low = q1 - whisker
        val_up = q3 + whisker
        # First pair: boolean Series flagging outliers; second pair: the bounds.
        return (data_ser < val_low, data_ser > val_up), (val_low, val_up)

    data_series = data[col_name]
    (rule_low, rule_up), _bounds = box_plot_out(data_series, box_scale=scale)

    # Use positional boolean arrays rather than positions passed to
    # DataFrame.drop(): drop() is label-based, so it removes the wrong rows
    # (or raises KeyError) when the index is not a default 0..n-1 range.
    low_mask = rule_low.to_numpy()
    up_mask = rule_up.to_numpy()
    outlier_mask = low_mask | up_mask

    print("Delete number is: {}".format(int(outlier_mask.sum())))
    # Boolean indexing returns a new frame, so the caller's data is untouched.
    data_n = data[~outlier_mask].reset_index(drop=True)
    # NOTE: despite the wording, shape[0] is the number of remaining rows.
    print("Now column number is: {}".format(data_n.shape[0]))

    print("Description of data less than the lower bound is:")
    print(data_series[low_mask].describe())
    print("Description of data larger than the upper bound is:")
    print(data_series[up_mask].describe())

    _fig, axes = plt.subplots(1, 2, figsize=(10, 7))
    # Left: the original column; right: the column after outlier removal.
    sns.boxplot(y=data[col_name], data=data, palette="Set1", ax=axes[0])
    sns.boxplot(y=data_n[col_name], data=data_n, palette="Set1", ax=axes[1])
    return data_n

train_data根据power删除一些异常值

# Whether to drop these rows at all is a judgment call.
# The test data must never be trimmed, though; that would only be fooling ourselves.
train_data = out_proc(train_data, 'power', scale=3)

train_data.shape

训练集和测试集放在一起，方便构造特征

# Use a flag column to tell training rows (1) from test rows (0)
train_data['train'] = 1
test_data['train'] = 0
data = pd.concat([train_data, test_data], ignore_index=True)

创建汽车使用时间（data['creatDate'] - data['regDate']）

# Some dates in the data are malformed, so errors='coerce' is needed:
# unparseable values become NaT instead of raising.
data['used_time'] = (
    pd.to_datetime(data['creatDate'], format='%Y%m%d', errors='coerce')
    - pd.to_datetime(data['regDate'], format='%Y%m%d', errors='coerce')
).dt.days

由于有些样本有问题，导致使用时间为空，我们计算一下空值的个数

data['used_time'].isnull().sum()  # 15054 in the original run

以上是关于天池二手车_特征工程的主要内容，如果未能解决你的问题，请参考以下文章