练习1-车费预测
Posted lecoww disappear forever
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了练习1-车费预测相关的知识,希望对你有一定的参考价值。
源代码:
# %%
'''
步骤:
1、读入数据集,将车费、经纬度进行清洗
(使用plt画散点图(省略))
2、用sklearn进行预测
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
# Load the first one million rows of the training set.
# NOTE(review): test.csv is read from a directory that contains an extra
# "Desktop" component, while this path does not -- confirm which one is right.
train = pd.read_csv(r"C:\\Users\\Administrator\\纽约出租车车费预测\\train.csv", nrows=1000000)
train.head()
train.describe()  # fare min is negative; longitude/latitude and passenger-count maxima are far too large
train.shape  # size of the raw dataset
# Drop every row that contains at least one NaN.  dropna() replaces the
# original `train.isna().any(1)` mask, whose positional axis argument is
# rejected by pandas 2.x.
train.dropna(axis=0, how="any", inplace=True)
train.shape  # size after the NaN rows are gone
# # Clean the passenger count
train["passenger_count"].describe()
# How many rows carry each passenger count (used to spot abnormal values).
train["passenger_count"].value_counts().sort_values(ascending=True)
# Drop rows with an abnormal passenger count: more than 6, or exactly 0.
train.drop(
    train[(train['passenger_count'] > 6) | (train['passenger_count'] == 0)].index,
    inplace=True,
    axis=0,
)
train["passenger_count"].value_counts().sort_values(ascending=True)
# # Clean the latitude/longitude columns
eps = 1e-7
# Rows whose pickup and dropoff coordinates agree to within eps on both axes:
# for many trips the start and end positions are practically unchanged.
# (Bracket continuation is used, so no backslash line joins are needed.)
train[
    (train["pickup_longitude"] - train["dropoff_longitude"] < eps)
    & (train["pickup_longitude"] - train["dropoff_longitude"] > -eps)
    & (train["pickup_latitude"] - train["dropoff_latitude"] < eps)
    & (train["pickup_latitude"] - train["dropoff_latitude"] > -eps)
]
# Compared with the describe() output, rows lying far from the column mean
# have to be removed: anything more than 10 away from the mean is dropped.
# NOTE(review): columns[3:7] is presumably the four pickup/dropoff coordinate
# columns -- confirm against the CSV header.
for name in train.columns[3:7]:
    # The mean is re-evaluated for every column on the already-filtered frame.
    train.drop(
        train[(train[name] < train[name].mean() - 10) | (train[name] > train[name].mean() + 10)].index,
        axis=0,
        inplace=True,
    )
train.describe()
# %% [markdown]
# # Clean the fare
# Frequency of every fare value, listed from the lowest fare upwards.
train["fare_amount"].value_counts().sort_index(ascending=True)
# Remove the rows whose fare is below eps; any fare above zero is treated
# as legitimate data.
train.drop(index=train.loc[train["fare_amount"] < eps].index, inplace=True)
train["fare_amount"].describe()
# Apart from the fare, the remaining columns now show little variance, which
# suggests the outliers are essentially gone.
train.describe()
# # Load the test set and add time-derived columns to both train and test
test = pd.read_csv(r"C:\\Users\\Administrator\\Desktop\\纽约出租车车费预测\\test.csv")
# Convert the textual timestamp columns to datetime in both frames.
for frame in (train, test):
    frame['key'] = pd.to_datetime(frame['key'])
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])
train.dtypes
# Add calendar features derived from the pickup time; the same columns are
# created, in the same order, on train and on test.
for frame in (train, test):
    frame['year'] = frame['pickup_datetime'].dt.year
    frame['month'] = frame['pickup_datetime'].dt.month
    frame['day'] = frame['pickup_datetime'].dt.day
    frame['hour'] = frame['pickup_datetime'].dt.hour
    frame['day of week'] = frame['pickup_datetime'].dt.dayofweek
train.dtypes
test.dtypes
# # Compute the trip distance and the fare per unit distance (not used by the
# # prediction itself, because the goal is to predict the fares of the test set)
def distance(lat1, long1, lat2, long2, frames=None):
    """Add a haversine ``H_Distance`` column, in kilometres, to each frame.

    Args:
        lat1: Name of the column holding the start latitude in degrees.
        long1: Name of the column holding the start longitude in degrees.
        lat2: Name of the column holding the end latitude in degrees.
        long2: Name of the column holding the end longitude in degrees.
        frames: Optional iterable of DataFrames to update in place.  When
            omitted, the module-level ``train`` and ``test`` frames are used,
            which is what the original four-argument call does.

    Returns:
        The distances computed for the last frame processed, or ``None`` when
        ``frames`` is empty.
    """
    if frames is None:
        frames = [train, test]
    earth_radius_km = 6371  # Earth radius in kilometres
    d = None
    for frame in frames:
        phi1 = np.radians(frame[lat1])
        phi2 = np.radians(frame[lat2])
        delta_phi = np.radians(frame[lat2] - frame[lat1])
        delta_lambda = np.radians(frame[long2] - frame[long1])
        # a = sin^2((phiB - phiA) / 2) + cos(phiA) * cos(phiB) * sin^2((lambdaB - lambdaA) / 2)
        a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
        # Rounding can push `a` marginally outside [0, 1]; clamp it so that the
        # square roots below never yield NaN.
        a = a.clip(0.0, 1.0)
        # c = 2 * atan2(sqrt(a), sqrt(1 - a))
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
        # d = R * c, in kilometres
        d = earth_radius_km * c
        frame['H_Distance'] = d
    return d
# Add the H_Distance column to both train and test.
distance('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')
# An earlier, disabled variant removed stationary trips by comparing the pickup
# and dropoff coordinates against eps; the distance-based filter below does
# that job instead.
eps = 1e-7
# Drop trips that did not move (|H_Distance| < eps); this also keeps the
# division below away from a zero denominator.
train.drop(train[(train['H_Distance'] < eps) & (train['H_Distance'] > -eps)].index, inplace=True, axis=0)
# Fare per unit distance.  distance() uses an Earth radius in kilometres, so
# despite the column name this is the fare per kilometre; the name is kept
# because later code refers to it.
train["fare_pre_mile"] = train.fare_amount / train.H_Distance
train
train["fare_pre_mile"].describe()
# Very few rows lie above the mean, which suggests that a handful of outliers
# inflates the mean.
for i in range(0, 20):
    print(train[train["fare_pre_mile"] > train["fare_pre_mile"].mean() + i]["fare_pre_mile"].count())
# First remove the absurdly high per-distance fares, tightening the margin
# above the (re-computed) mean step by step.
for margin in (1000, 100, 100, 50):
    train.drop(train[(train['fare_pre_mile'] > train["fare_pre_mile"].mean() + margin)].index, inplace=True, axis=0)
    train["fare_pre_mile"].describe()
# The mean is now basically stable and the per-distance fare looks plausible.
for i in range(0, 20):  # number of rows whose per-distance fare exceeds i
    print(i," : ",train[train["fare_pre_mile"] > i]["fare_pre_mile"].count())
# Cut off the remaining upper tail
train.drop(train[(train['fare_pre_mile'] > 8)].index, inplace=True, axis=0)
# and then everything below 1.
train.drop(train[(train['fare_pre_mile'] < 1)].index, inplace=True, axis=0)
train['fare_pre_mile'].describe()
# Prediction
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # standardisation

# Training features: everything except the identifiers, the target and the
# target-derived per-distance fare.  `columns=` replaces the positional axis
# argument, which pandas 2.x rejects.
x_train = train.drop(columns=["key", "pickup_datetime", "fare_amount", "fare_pre_mile"])
y_train = train["fare_amount"]  # training target
x_test = test.drop(columns=["key", "pickup_datetime"])

std_x = StandardScaler()
x_train = std_x.fit_transform(x_train)
# Scale the test features with the statistics learned from the training set;
# re-fitting on the test set would put the two sets on different scales.
x_test = std_x.transform(x_test)
std_y = StandardScaler()
y_train = std_y.fit_transform(np.array(y_train).reshape(-1, 1))
x_train.shape
y_train.shape
x_test.shape

# Predict with stochastic gradient descent
sgd = SGDRegressor()
y_train = y_train.ravel()
sgd.fit(x_train, y_train)
y_sgd_predict = sgd.predict(x_test)
# inverse_transform expects a 2-D array: reshape the 1-D predictions to a
# column, undo the target scaling, then flatten again.
y_sgd_predict = std_y.inverse_transform(y_sgd_predict.reshape(-1, 1)).ravel()
y_sgd_predict
test["fare_amount"] = y_sgd_predict
train
以上是关于练习1-车费预测的主要内容,如果未能解决你的问题,请参考以下文章
spring练习,在Eclipse搭建的Spring开发环境中,使用set注入方式,实现对象的依赖关系,通过ClassPathXmlApplicationContext实体类获取Bean对象(代码片段
Python练习册 第 0013 题: 用 Python 写一个爬图片的程序,爬 这个链接里的日本妹子图片 :-),(http://tieba.baidu.com/p/2166231880)(代码片段