项目实战Airbnb爱彼迎-数据分析与建模

Posted ZSYL

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了项目实战Airbnb爱彼迎-数据分析与建模相关的知识,希望对你有一定的参考价值。

导入科学计算库

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import mpl
# Use the SimHei font so Chinese text is displayed in figures
mpl.rcParams["font.sans-serif"] = ['SimHei']
# Make the minus sign display correctly on axes
mpl.rcParams["axes.unicode_minus"] = False
import warnings
# Suppress every warning for the rest of the session
warnings.filterwarnings('ignore')

Calendar数据集分析

# Load the calendar CSV and preview the first rows
calendar = pd.read_csv("./data/madrid-airbnb-data/calendar.csv")
calendar.head()
listing_iddateavailablepriceadjusted_priceminimum_nightsmaximum_nights
03368342019-09-19f$63.00$63.005250
163692019-09-19f$70.00$70.001365
263692019-09-20f$75.00$75.001365
363692019-09-21f$75.00$75.001365
463692019-09-22t$70.00$70.001365

对price价格数据进行处理

使用正则表达式去除$符号和千位分隔逗号,并将类型转换为np.float32

# Both price columns get the same cleanup: strip "$" and "," and store the result as float32
for price_column in ('price', 'adjusted_price'):
    cleaned = calendar[price_column].str.replace(r'[$,]', '', regex=True)
    calendar[price_column] = cleaned.astype(np.float32)
calendar.head()
listing_iddateavailablepriceadjusted_priceminimum_nightsmaximum_nights
03368342019-09-19f63.063.05250
163692019-09-19f70.070.01365
263692019-09-20f75.075.01365
363692019-09-21f75.075.01365
463692019-09-22t70.070.01365

处理时间序列

date字符串类型转化为datetime类型

# Convert the date column with pd.to_datetime and preview the result
calendar['date'] = pd.to_datetime(calendar['date'])
calendar['date'].head()
0   2019-09-19
1   2019-09-19
2   2019-09-20
3   2019-09-21
4   2019-09-22
Name: date, dtype: datetime64[ns]
calendar['date'][0]
Timestamp('2019-09-19 00:00:00')
calendar['date'][0].day
19
# help(calendar['date'].dt)

新增weekday&month列

# Derive the weekday and month columns from the same datetime accessor
date_parts = calendar['date'].dt
calendar['weekday'] = date_parts.weekday
calendar['month'] = date_parts.month
calendar['month'].head()
0    9
1    9
2    9
3    9
4    9
Name: month, dtype: int64

可视化分析

分析爱彼迎的每月房价,分组聚合计算平均值

from matplotlib import font_manager

# Mean price for each calendar month, with labels such as "9月" for the x axis
month_price = calendar.groupby("month")['price'].mean()
month_labels = [f"{month}月" for month in month_price.index]

plt.figure(figsize=(10, 5))
# NOTE(review): this font file path only exists on Windows; point fname at an
# installed CJK font when running elsewhere
font = font_manager.FontProperties(fname="C:\\Windows\\Fonts\\msyh.ttc", size=15)
plt.title("爱彼迎每月平均房价",fontproperties=font)
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts data vectors positionally
sns.barplot(x=month_labels, y=month_price.values)
plt.show()

分析爱彼迎一周中各天(weekday)的房价,分组聚合计算平均值

# Mean price for each day of the week (values of the weekday column)
weekday_price = calendar.groupby("weekday")['price'].mean()

plt.figure(figsize=(10, 5))
# NOTE(review): this font file path only exists on Windows; point fname at an
# installed CJK font when running elsewhere
font = font_manager.FontProperties(fname="C:\\Windows\\Fonts\\msyh.ttc", size=15)
plt.title("爱彼迎每周平均房价",fontproperties=font)
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts data vectors positionally
sns.barplot(x=weekday_price.index, y=weekday_price.values)
plt.show()

分析价格的分布直方图

# Distribution of prices below 300.
# sns.distplot is deprecated and removed from current seaborn releases; histplot
# with kde=True and stat="density" draws the same histogram-plus-KDE view
# (the default bin count may differ slightly from distplot's).
plt.figure(figsize=(10, 5))
prices_below_300 = calendar.loc[calendar['price'] < 300, 'price']
sns.histplot(prices_below_300, kde=True, stat="density")
plt.show()

listings数据集分析

# Load the detailed Airbnb listing information
listings_detailed = pd.read_csv("./data/madrid-airbnb-data/listings_detailed.csv")
listings_detailed.head()
idlisting_urlscrape_idlast_scrapednamesummaryspacedescriptionexperiences_offeredneighborhood_overview...instant_bookableis_business_travel_readycancellation_policyrequire_guest_profile_picturerequire_guest_phone_verificationcalculated_host_listings_countcalculated_host_listings_count_entire_homescalculated_host_listings_count_private_roomscalculated_host_listings_count_shared_roomsreviews_per_month
06369https://www.airbnb.com/rooms/6369201909190502152019-09-19Rooftop terrace room with ensuite bathroom, Airc.Atico en la calle Principe de Vergara, con bañ...BETTER THAN A HOTEL.Upscale neighboorhood (Met...BETTER THAN A HOTEL.Upscale neighboorhood (Met...noneNice and definitely non touristic neighborhoo......ffflexibleff10100.56
121853https://www.airbnb.com/rooms/21853201909190502152019-09-19Bright and airy roomWe have a quiet and sunny room with a good vie...I am living in a nice flat near the centre of ...We have a quiet and sunny room with a good vie...noneWe live in a leafy neighbourhood with plenty o......ffstrict_14_with_grace_periodff20200.55
224805https://www.airbnb.com/rooms/24805201909190502152019-09-19Gran Via Studio MadridStudio located 50 meters from Gran Via, next t...LOCATION. Brand new flats near the Atlantic H...Studio located 50 meters from Gran Via, next t...noneThe area is next to the Gran Via, so people li......ffmoderateff11000.03
324836https://www.airbnb.com/rooms/24836201909190502152019-09-19Select the Madrid more "cool".Apartamento céntrico junto a Gran Vía (VT-296)...Select the Madrid more "cool" : 70 m, 2 bedroo...Select the Madrid more "cool" : 70 m, 2 bedroo...noneCosmopolita, lleno de locales de moda, restaur......ffmoderateff11000.63
426825https://www.airbnb.com/rooms/26825201909190502152019-09-19Single Room whith private BathroomNice and cozy roon for one person with a priva...Nice and cozy roon for one person with a priva...Nice and cozy roon for one person with a priva...noneEs un barrio muy tranquilo, en una zona de Mad......ffstrict_14_with_grace_periodff10101.19

5 rows × 106 columns

# List the name of every column in the dataset
listings_detailed.columns.values.tolist()
['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'minimum_minimum_nights',
 'maximum_minimum_nights',
 'minimum_maximum_nights',
 'maximum_maximum_nights',
 'minimum_nights_avg_ntm',
 'maximum_nights_avg_ntm',
 'calendar_updated',
 'has_availability',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'calendar_last_scraped',
 'number_of_reviews',
 'number_of_reviews_ltm',
 'first_review',
 'last_review',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'requires_license',
 'license',
 'jurisdiction_names',
 'instant_bookable',
 'is_business_travel_ready',
 'cancellation_policy',
 'require_guest_profile_picture',
 'require_guest_phone_verification',
 'calculated_host_listings_count',
 'calculated_host_listings_count_entire_homes',
 'calculated_host_listings_count_private_rooms',
 'calculated_host_listings_count_shared_rooms',
 'reviews_per_month']
listings_detailed['price'].head()
0     $70.00
1     $17.00
2     $80.00
3    $115.00
4     $25.00
Name: price, dtype: object

对价格数据进行预处理

# Strip "$" and "," from the price strings and store them as float32
listings_detailed['price'] = listings_detailed['price'].str.replace(r"[$,]","",regex=True).astype(np.float32)
# Same cleanup for the cleaning fee
listings_detailed['cleaning_fee'] = listings_detailed['cleaning_fee'].str.replace(r"[$,]","",regex=True).astype(np.float32)
# Check whether cleaning_fee contains missing values (the recorded output is True)
np.any(np.isnan(listings_detailed['cleaning_fee']))
True

处理价格缺失值

# Treat a missing cleaning fee as 0.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: that form is chained assignment, which pandas deprecates and which
# leaves the DataFrame unchanged once Copy-on-Write is active.
listings_detailed['cleaning_fee'] = listings_detailed['cleaning_fee'].fillna(0)
listings_detailed['cleaning_fee'].head()
0     5.0
1     0.0
2    30.0
3     0.0
4    15.0
Name: cleaning_fee, dtype: float32
# Confirm that no missing values remain (the recorded output is False)
np.any(np.isnan(listings_detailed['cleaning_fee']))
False

新增字段

# New column minimum_cost = (price + cleaning fee) * minimum number of nights
# NOTE(review): the cleaning fee is multiplied by the number of nights here — confirm this is intended
price_plus_fee = listings_detailed['price'] + listings_detailed['cleaning_fee']
listings_detailed['minimum_cost'] = price_plus_fee * listings_detailed['minimum_nights']
listings_detailed['minimum_cost'].head()
0     75.0
1     68.0
2    550.0
3    345.0
4     80.0
Name: minimum_cost, dtype: float32
# Inspect the amenities column
listings_detailed['amenities']
0        Wifi,"Air conditioning",Kitchen,Elevator,Heat...
1        TV,Internet,Wifi,"Air conditioning",Kitchen,"...
2        TV,Internet,Wifi,"Air conditioning",Kitchen,E...
3        TV,"Cable TV",Internet,Wifi,"Air conditioning...
4        Wifi,"Wheelchair accessible",Doorman,Elevator...
                               ...                        
20832    TV,Wifi,Kitchen,Heating,Washer,"First aid kit...
20833    Wifi,Kitchen,Heating,Washer,Essentials,Shampo...
20834    TV,Wifi,"Air conditioning",Kitchen,Heating,Es...
20835    Wifi,Kitchen,"Smoking allowed","Pets allowed"...
20836    TV,Wifi,"Air conditioning",Kitchen,"Pets allo...
Name: amenities, Length: 20837, dtype: object
# Number of amenities per listing: drop the first and last character, split on commas, count the pieces
# NOTE(review): the [1:-1] slice presumably strips enclosing braces from the raw value — confirm against the CSV.
# A value with nothing between them still counts as 1, because "".split(",") returns [""]
listings_detailed['n_amenities'] = listings_detailed['amenities'].str[1:-1].str.split(",").apply(len)
# Preview the capacity buckets; right=False makes the bins left-closed: [1,2), [2,3), [3,5), [5,100)
pd.cut(listings_detailed['accommodates'], bins=[1,2,3,5,100], include_lowest=True, right=False, labels=['Single','Couple','Family','Group'])
0        Couple
1        Single
2        Family
3        Family
4        Single
          ...  
20832    Family
20833    Couple
20834    Family
20835    Couple
20836    Couple
Name: accommodates, Length: 20837, dtype: category
Categories (4, object): ['Single' < 'Couple' < 'Family' < 'Group']
# Add a category column derived from the number of guests a listing accommodates.
# With right=False the bins are left-closed, so the labels map to:
# Single = 1, Couple = 2, Family = 3-4, Group = 5 and above (below 100)
listings_detailed['accommodates_type'] = pd.cut(listings_detailed['accommodates'], bins=[1,2,3,5,100], include_lowest=True, right=False, labels=['Single','Couple','Family','Group'])
# Neighbourhood group (geographic district); prices may be higher in central districts
listings_detailed['neighbourhood_group_cleansed'].head()
0     Chamartín
1        Latina
2        Centro
3        Centro
4    Arganzuela
Name: neighbourhood_group_cleansed, dtype: object
# Review score rating
listings_detailed['review_scores_rating'].head()
0     98.0
1     92.0
2    100.0
3     98.0
4     94.0
Name: review_scores_rating, dtype: float64

获取详细数据

取出一些可能影响房价的字段

# Columns kept for the pricing analysis: identifiers, location, price parts and the derived features
selected_columns = [
    'id', 'host_id', 'listing_url', 'room_type',
    'neighbourhood_group_cleansed', 'price', 'cleaning_fee',
    'amenities', 'n_amenities',
    'accommodates', 'accommodates_type',
    'minimum_nights', 'minimum_cost',
]
listings_detailed_df = listings_detailed[selected_columns]
listings_detailed_df.head()
idhost_idlisting_urlroom_typeneighbourhood_group_cleansedpricecleaning_feeamenitiesn_amenitiesaccommodatesaccommodates_typeminimum_nightsminimum_cost
0636913660https://www.airbnb.com/rooms/6369Private roomChamartín70.05.0Wifi,"Air conditioning",Kitchen,Elevator,Heat...162Couple175.0
12185383531https://www.airbnb.com/rooms/21853Private roomLatina17.00.0TV,Internet,Wifi,"Air conditioning",Kitchen,"...311Single468.0
224805101471https://www.airbnb.com/rooms/24805Entire home/aptCentro80.030.0TV,Internet,Wifi,"Air conditioning",Kitchen,E...323Family5550.0
324836101653https://www.airbnb.com/rooms/24836Entire home/aptCentro115.00.0TV,"Cable TV",Internet,Wifi,"Air conditioning...184Family3345.0
426825114340https://www.airbnb.com/rooms/26825Private roomArganzuela25.015.0Wifi,"Wheelchair accessible",Doorman,Elevator...161Single280.0

数据可视化

# Room type breakdown: pie chart on the left axes, bar chart on the right axes
room_type_counts = listings_detailed_df['room_type'].value_counts()
fig, axes = plt.subplots(1, 2, figsize=(10, 5), dpi=100)
axes[0].pie(room_type_counts.values, autopct="%.2f%%", labels=room_type_counts.index)
# Pass x/y by keyword (seaborn >= 0.12 rejects positional data vectors) and
# name the target axes explicitly instead of relying on the implicit current axes
sns.barplot(x=room_type_counts.index, y=room_type_counts.values, ax=axes[1])
plt.tight_layout()

# Number of listings per district, drawn as horizontal bars
neighbourhood_counts = listings_detailed_df['neighbourhood_group_cleansed'].value_counts()
plt.figure(figsize=(10, 5), dpi=100)
sns.barplot(
    x=neighbourhood_counts.values,
    y=neighbourhood_counts.index,
    orient='h',
)
plt.show()

# Room type mix within each district:
# count listings per (district, room type), spread room_type into columns,
# fill absent combinations with 0, turn each row into proportions, and sort
# by the share of entire homes/apartments.
# The chain is wrapped in parentheses; the doubled-backslash line
# continuations in the original text are a syntax error.
neighbour_room_type = (
    listings_detailed_df
    .groupby(['neighbourhood_group_cleansed', 'room_type'])
    .size()
    .unstack('room_type')
    .fillna(0)
    .apply(lambda row: row / row.sum(), axis=1)
    .sort_values("Entire home/apt", ascending=True)
)
neighbour_room_type
room_typeEntire home/aptHotel roomPrivate roomShared room
neighbourhood_group_cleansed
Villaverde0.1686750.0060240.7530120.072289
Vicálvaro0.1846150.0000000.7846150.030769
Moratalaz0.2440940.0157480.7322830.007874
Barajas0.2606060.0000000.7272730.012121
San Blas - Canillejas0.2958020.0000000.6946560.009542
Villa de Vallecas0.3300970.0000000.6213590.048544
Ciudad Lineal0.3605220.0032630.6182710.017945
Usera0.3822710.0360110.5512470.030471
Carabanchel0.3834810.0014750.5943950.020649
Latina0.3863640.0016230.5892860.022727
Fuencarral - El Pardo0.4000000.0000000.5733330.026667
Puente de Vallecas0.4145520.0101520.5566840.018613
Hortaleza0.4307230.0662650.4969880.006024
Moncloa - Aravaca0.5000000.0110700.4833950.005535
Arganzuela0.5396970.0080290.4442460.008029
Chamberí0.5679940.0097670.4034560.018783
Tetuán0.5775660.0071600.4116950.003580
Chamartín0.6134300.0127040.3539020.019964
Retiro0.6255640.0030080.3684210.003008
Salamanca0.6400000.0450910.3083640.006545
Centro0.7331420.0331530.2220400.011665
# Room type mix per district as stacked horizontal bars.
# The loop draws one segment per column, so it works for any number of
# room types instead of assuming exactly four.
columns = neighbour_room_type.columns
plt.figure(figsize=(10, 5), dpi=100)
index = neighbour_room_type.index
# Running left edge of the next segment; every district starts at 0
left = pd.Series(0.0, index=index)
for column in columns:
    plt.barh(index, neighbour_room_type[column], left=left)
    left = left + neighbour_room_type[column]
plt.legend(columns)
plt.show()

# The same stacked chart produced by the pandas plotting API
fig, ax = plt.subplots(figsize=(10, 5), dpi=100)
neighbour_room_type.plot(
    kind="barh",
    stacked=True,
    width=0.75,
    ax=ax,
)
plt.show()

# Distribution of the number of listings per host (hosts with fewer than 10).
# sns.distplot is deprecated and removed from current seaborn releases; histplot
# with kde=True and stat="density" draws the same histogram-plus-KDE view.
plt.figure(figsize=(6, 3), dpi=100)
host_number = listings_detailed_df.groupby('host_id').size()
sns.histplot(host_number[host_number < 10], kde=True, stat="density")
plt.show()

# Pie chart of hosts grouped by how many listings they have
# Groups: 1, 2, 3-4, 5+
# right=False makes the bins left-closed: [1,2), [2,3), [3,5), [5,1000)
plt.figure(figsize=(8, 5), dpi=100)
host_number_bins = pd.cut(host_number,bins=[1,2,3,5,1000],include_lowest=True,right=False,labels=['1','2','3-4','5+']).value_counts()
plt.pie(host_number_bins,autopct="%.2f%%",labels=host_number_bins.index)
plt.show()

Reviews数据集分析

reviews = pd.read_csv("./data/madrid-airbnb-data/reviews_detailed.csv", parse_dates=['date'])  # parse_dates parses the date strings while reading
reviews.head()
listing_ididdatereviewer_idreviewer_namecomments
06369294282010-03-1484790NancySimon and Arturo have the ultimate location in...
16369310182010-03-2384338DavidMyself and Kristy originally planned on stayin...
26369346942010-04-1098655MarionWe had a great time at Arturo and Simon's ! A ...
36369371462010-04-21109871KurtI very much enjoyed the stay. \\r\\nIt's a wond...
46369381682010-04-2698901DennisArturo and Simon are polite and friendly hosts...
# Add year and month columns taken from the review date
reviews['year'] = reviews['date'].dt.year
reviews['month'] = reviews['date'].dt.month
# Number of reviews per year
plt.figure(figsize=(8, 5), dpi=100)
n_reviews_year = reviews.groupby("year").size()
# Pass x/y by keyword: seaborn >= 0.12 no longer accepts data vectors positionally
sns.barplot(x=n_reviews_year.index, y=n_reviews_year.values)
plt.show()

# 月评论数
plt.figure(figsize=(8, 5), dpi=100)
n_reviews_month = reviews.groupby("month"

第1590期GraphQL和Thrift在Airbnb架构中的配合使用

Airbnb 在搜索中应用深度学习的经验

3星|《爱彼迎传》:公关稿,大事记

苹果或推出多屏幕iPhone;​爱彼迎CEO:办公室时代已过去;Apache Flink 1.15 发布|极客头条

Python高级应用程序设计任务要求

LeetCode(Shell)- 195. 第十行