练习1-车费预测

Posted lecoww disappear forever

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了练习1-车费预测相关的知识,希望对你有一定的参考价值。


源代码:

# %%
'''
Steps:
1. Load the dataset and clean the fare and latitude/longitude columns
   (scatter plots drawn with plt are omitted).
2. Predict with sklearn.
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn


# Load only the first 1,000,000 rows of the training set.
train = pd.read_csv(r"C:\\Users\\Administrator\\纽约出租车车费预测\\train.csv", nrows=1000000)


train.head()


# Observed here: the minimum fare is negative, and the maxima of the
# longitude, latitude and passenger-count columns are far too large.
train.describe()


train.shape  # size of the raw dataset


# Drop every row that contains at least one NaN.  The earlier spelling,
# ``train.isna().any(1)``, passed ``axis`` positionally, which pandas 2.0
# rejects; ``dropna`` expresses the same filter directly.
train.dropna(axis=0, how="any", inplace=True)


train.shape  # size after the NaN rows are removed


# # Clean the passenger count


train["passenger_count"].describe()


# Tally rows per passenger count to see how many abnormal values there are.
train["passenger_count"].value_counts().sort_values(ascending=True)


# Drop rows whose passenger count is abnormal: zero, or more than six.
bad_passenger_count = (train['passenger_count'] > 6) | (train['passenger_count'] == 0)
train.drop(train[bad_passenger_count].index, inplace=True, axis=0)

# Re-check the tally after the abnormal rows are gone.
train["passenger_count"].value_counts().sort_values(ascending=True)


# # Clean latitude / longitude
eps = 1e-7

# Inspect the trips whose pickup and drop-off coordinates differ by less than
# eps in both longitude and latitude: for many rows the start and end
# positions are unchanged to roughly six decimal places.
lon_delta = train["pickup_longitude"] - train["dropoff_longitude"]
lat_delta = train["pickup_latitude"] - train["dropoff_latitude"]
train[(lon_delta < eps) & (lon_delta > -eps) & (lat_delta < eps) & (lat_delta > -eps)]


# Compared with the describe() output, some coordinates lie far from the
# mean; drop every row that is more than 10 away from its column mean.
# NOTE(review): assumes columns 3..6 are the four coordinate columns — confirm
# against the CSV header.
for name in train.columns[3:7]:
    # The mean is taken once per column, before any row of this pass is dropped.
    center = train[name].mean()
    far_from_mean = (train[name] < center - 10) | (train[name] > center + 10)
    train.drop(train[far_from_mean].index, axis=0, inplace=True)


train.describe()

# %% [markdown]
# # Clean the fare


# Tally each distinct fare, then order the tally by fare value.
fare_tally = train["fare_amount"].value_counts()
fare_tally.sort_index(ascending=True)


# Remove the rows whose fare is below eps.
fare_too_small = train["fare_amount"] < eps
train.drop(train.loc[fare_too_small].index, axis=0, inplace=True)


train["fare_amount"].describe()  # any fare above zero is treated as valid


# Apart from the fare, the remaining columns show little variance, which
# suggests the outliers have mostly been removed.
train.describe()



# # Load the test set and add time-derived columns to both train and test


test = pd.read_csv(r"C:\\Users\\Administrator\\Desktop\\纽约出租车车费预测\\test.csv")


# Convert the timestamp-like columns to datetime in both frames.
for frame in (train, test):
    frame['key'] = pd.to_datetime(frame['key'])
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])


train.dtypes

# Add calendar columns derived from the pickup time.  Both frames get the
# same columns in the same order so their feature layouts stay aligned.
for frame in (train, test):
    pickup = frame['pickup_datetime'].dt
    frame['year'] = pickup.year
    frame['month'] = pickup.month
    frame['day'] = pickup.day
    frame['hour'] = pickup.hour
    frame['day of week'] = pickup.dayofweek


train.dtypes

test.dtypes


# # 计算路程以及每mile的车费(预测时没用到,因为是预测test的车费)




def distance(lat1, long1, lat2, long2, frames=None):
    """Add a haversine ``H_Distance`` column (kilometres) to each frame.

    The coordinate columns are converted with ``np.radians``, i.e. they are
    treated as degrees.

    Args:
        lat1: Name of the column holding the start latitude.
        long1: Name of the column holding the start longitude.
        lat2: Name of the column holding the end latitude.
        long2: Name of the column holding the end longitude.
        frames: Iterable of DataFrames to update in place.  When omitted,
            the module-level ``train`` and ``test`` frames are used.

    Returns:
        The distance Series computed for the last frame processed, or
        ``None`` when ``frames`` is empty.
    """
    if frames is None:
        frames = [train, test]

    R = 6371  # Earth radius in kilometres
    d = None
    for frame in frames:
        phi1 = np.radians(frame[lat1])
        phi2 = np.radians(frame[lat2])

        delta_phi = np.radians(frame[lat2] - frame[lat1])
        delta_lambda = np.radians(frame[long2] - frame[long1])

        # a = sin²((φB - φA)/2) + cos φA . cos φB . sin²((λB - λA)/2)
        a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2

        # c = 2 * atan2( √a, √(1−a) )
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

        # d = R * c, in kilometres
        d = R * c
        frame['H_Distance'] = d
    return d

distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')

# Remove trips that did not move.  (An earlier variant compared the raw
# pickup/drop-off coordinates against eps; filtering on the computed
# distance replaces it.)
eps = 1e-7
train.drop(train[(train['H_Distance'] < eps) & (train['H_Distance'] > -eps)].index, inplace=True, axis=0)

# Fare per unit of distance.  NOTE: despite the column name, H_Distance is
# computed with an Earth radius in kilometres, so this is a fare per km.
train["fare_pre_mile"] = train.fare_amount / train.H_Distance


train


train["fare_pre_mile"].describe()

# Only a small share of rows lies above the mean, which suggests that a few
# extreme values are inflating the mean.
mean_rate = train["fare_pre_mile"].mean()  # unchanged inside the loop: no rows are dropped
for i in range(0, 20):
    print(train[train["fare_pre_mile"] > mean_rate + i]["fare_pre_mile"].count())


# First remove the most absurd rates, then tighten the margin step by step.
# The mean is recomputed before every pass because each drop lowers it.
for margin in (1000, 100, 100, 50):
    train.drop(train[train['fare_pre_mile'] > train["fare_pre_mile"].mean() + margin].index, inplace=True, axis=0)
    print(train["fare_pre_mile"].describe())

# The mean is now roughly stable and the rate is close to common sense.

for i in range(0, 20):  # count of rows whose rate is greater than i
    print(i, " : ", train[train["fare_pre_mile"] > i]["fare_pre_mile"].count())

# Cut off the upper tail.
train.drop(train[(train['fare_pre_mile'] > 8)].index, inplace=True, axis=0)

# Then remove the rates below 1.
train.drop(train[(train['fare_pre_mile'] < 1)].index, inplace=True, axis=0)

train['fare_pre_mile'].describe()


# 预测

from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler # 标准化

# Training features: everything except the identifier, the raw timestamp,
# the target, and the rate derived from the target.  ``columns=`` is used
# because pandas 2.0 no longer accepts the axis as a positional argument.
x_train = train.drop(columns=["key", "pickup_datetime", "fare_amount", "fare_pre_mile"])
y_train = train["fare_amount"]  # training target
x_test = test.drop(columns=["key", "pickup_datetime"])


std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
# Scale the test set with the statistics learned from the training set.
# Re-fitting on the test set would put the two sets on different scales.
x_test = std_x.transform(x_test)

std_y = StandardScaler()
y_train = std_y.fit_transform(np.array(y_train).reshape(-1, 1))


x_train.shape

y_train.shape

x_test.shape

# Predict with stochastic gradient descent.

sgd = SGDRegressor()

y_train = y_train.ravel()  # the regressor expects a 1-D target
sgd.fit(x_train, y_train)

y_sgd_predict = sgd.predict(x_test)
# inverse_transform needs a 2-D array; flatten back to 1-D afterwards so the
# result can be assigned as a column.
y_sgd_predict = std_y.inverse_transform(y_sgd_predict.reshape(-1, 1)).ravel()

y_sgd_predict

test["fare_amount"] = y_sgd_predict

train

以上是关于练习1-车费预测的主要内容,如果未能解决你的问题,请参考以下文章

校招真题练习019 毕业旅行问题(头条)

PTA练习题之7-1 出租车计价(15 分)

C语言顺序结构和分支结构总结

spring练习,在Eclipse搭建的Spring开发环境中,使用set注入方式,实现对象的依赖关系,通过ClassPathXmlApplicationContext实体类获取Bean对象(代码片段)

条件语句练习-比分预测

Python练习册 第 0013 题: 用 Python 写一个爬图片的程序,爬 这个链接里的日本妹子图片 :-),(http://tieba.baidu.com/p/2166231880)(代码片段)