pandas与matplotlib综合案例
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了pandas与matplotlib综合案例相关的知识,希望对你有一定的参考价值。
参考技术A需求:分析Xiaoqu_NJ.csv文件,分析南京各小区分布数量、及小区建成情况等。
1、打开Xiaoqu_NJ.csv文件,由于爬取的文件没有表头,所以需要自定义表头。
2、城市定位与街道定位去除‘小区’
3、删除多余的列
4、将热度等字段只保留数字,其余内容删除。用到extract()文本提取函数,它与正则表达式配合使用,例如\d+:其中\d代表一个数字,+代表匹配前面的数字一次或多次(即连续的数字)。
5、物业费。保留单价数字,用到
data['物业费'] = data['物业费'].str.extract('(.*?)元.*?')
(.*?)用于提取“元”字之前、括号()内匹配到的数据
6、永久删除缺失值
7、增加一列使用年限
8、保存清洗后的数据
1、导入库
2、读取修改后的数据
3、绘制南京各区小区分布数量图
数组转列表,.tolist()
运行效果:
4、南京小区建成数量与年份的关系图表
运行效果:
运行效果:
5、各区域建成数量与年份关系图
运行效果:
6、南京下辖各区小区均价情况
运行效果
pandas案例分析,附加numpy matplotlib
import pandas as pd
df=pd.read_csv(,sep=;)
如果数据中出现“;”,说明文件是用分号“;”做分隔符,而不是默认的逗号“,”,因此读取时需要指定 sep 参数。
import pandas as pd
red_df = pd.read_csv(winequality-red.csv, sep=;)
white_df = pd.read_csv(winequality-white.csv, sep=;)
red_df.head()
white_df.head()
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | |
0 | 7.0 | 0.27 | 0.36 | 20.7 | 0.045 | 45.0 | 170.0 | 1.0010 | 3.00 | 0.45 | 8.8 | 6 |
1 | 6.3 | 0.30 | 0.34 | 1.6 | 0.049 | 14.0 | 132.0 | 0.9940 | 3.30 | 0.49 | 9.5 | 6 |
2 | 8.1 | 0.28 | 0.40 | 6.9 | 0.050 | 30.0 | 97.0 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
3 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
4 | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47.0 | 186.0 | 0.9956 | 3.19 | 0.40 | 9.9 | 6 |
print(red_df.shape)
(1599, 12)
print(white_df.shape)
(4898, 12)
red_df.isnull().sum()
fixed_acidity 0 volatile_acidity 0 citric_acid 0 residual_sugar 0 chlorides 0 free_sulfur_dioxide 0 total_sulfur-dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
white_df.isnull().sum()
fixed_acidity 0 volatile_acidity 0 citric_acid 0 residual_sugar 0 chlorides 0 free_sulfur_dioxide 0 total_sulfur_dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64
white_df.duplicated().sum() 重复值统计
937 但是重复行是不可以删除的
红葡萄酒数据集中有多少唯一的质量值?
red_df.quality.nunique()
6
红葡萄酒数据集中的平均密度是多少?
red_df.density.mean()
0.996746679174484
import numpy as np
a=np.random.random(1000)
生成包含1000个随机数的一维数组
np.mean(a)
求a的平均值
# 导入 numpy 和 pandas
import numpy as np
import pandas as pd
# 加载红葡萄酒和白葡萄酒数据集
red_df = pd.read_csv('winequality-red.csv', sep=';')
white_df = pd.read_csv('winequality-white.csv', sep=';')
# 为红葡萄酒数据框创建颜色数组
color_red = np.repeat(red, red_df.shape[0])
# 为白葡萄酒数据框创建颜色数组
color_white = np.repeat(white, white_df.shape[0])
red_df[color] = color_red
red_df.head()
fixed_acidity | volatile_acidity | citric_acid | residual_sugar | chlorides | free_sulfur_dioxide | total_sulfur-dioxide | density | pH | sulphates | alcohol | quality | color | |
0 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | red |
1 | 7.8 | 0.88 | 0.00 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.20 | 0.68 | 9.8 | 5 | red |
2 | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.9970 | 3.26 | 0.65 | 9.8 | 5 | red |
3 | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.9980 | 3.16 | 0.58 | 9.8 | 6 | red |
4 | 7.4 | 0.70 | 0.00 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 | red |
# 附加数据框
wine_df = red_df.append(white_df)
# 查看数据框,检查是否成功
wine_df.head()
alcohol | chlorides | citric_acid | color | density | fixed_acidity | free_sulfur_dioxide | pH | quality | residual_sugar | sulphates | total_sulfur-dioxide | total_sulfur_dioxide | volatile_acidity | |
0 | 9.4 | 0.076 | 0.00 | red | 0.9978 | 7.4 | 11.0 | 3.51 | 5 | 1.9 | 0.56 | 34.0 | NaN | 0.70 |
1 | 9.8 | 0.098 | 0.00 | red | 0.9968 | 7.8 | 25.0 | 3.20 | 5 | 2.6 | 0.68 | 67.0 | NaN | 0.88 |
2 | 9.8 | 0.092 | 0.04 | red | 0.9970 | 7.8 | 15.0 | 3.26 | 5 | 2.3 | 0.65 | 54.0 | NaN | 0.76 |
3 | 9.8 | 0.075 | 0.56 | red | 0.9980 | 11.2 | 17.0 | 3.16 | 6 | 1.9 | 0.58 | 60.0 | NaN | 0.28 |
4 | 9.4 | 0.076 | 0.00 | red | 0.9978 | 7.4 | 11.0 | 3.51 | 5 | 1.9 | 0.56 | 34.0 | NaN | 0.70 |
将新组合的数据框保存为 winequality_edited.csv。务必设置 index=False,以避免保存未命名列!
wine_df.to_csv(winequality_edited.csv, index=False)
new_labels=list(red_df.columns)
new_labels[6]=total_sulfur_dioxide
red_df.columns=new_labels
groupby函数
red_df.groupby(quality).mean()
求出平均值
red_df.groupby([quality,color]).mean()
red_df.groupby([quality,color],as_index=False).mean()
设置 as_index=False 后,不再用质量和颜色做索引,它们会作为普通列保留
只对某一列做平均值
red_df.groupby([quality,color],as_index=False)[ph].mean()
df.groupby(color).mean().quality
df.describe().pH
bin_edges = [2.72, 3.11, 3.21, 3.32, 4.01]
bin_names = [high, mod_high, medium, low]
df[acidity_levels] = pd.cut(df[pH], bin_edges, labels=bin_names)
df.head()
df.groupby(acidity_levels).mean().quality
df.to_csv(winequality_edited.csv, index=False)
等效语句
# selecting malignant records in cancer data
df_m = df[df[diagnosis] == M]
df_m = df.query(diagnosis == "M")
# selecting records of people making over $50K
df_a = df[df[income] == >50K]
df_a = df.query(income == " >50K")
# get the median amount of alcohol content
# 获取酒精含量的中位数
df.alcohol.median()
# 选择酒精含量小于中位数的样本
low_alcohol =df[df.alcohol < 10.3]
# 选择酒精含量大于等于中位数的样本
high_alcohol =df[df.alcohol >= 10.3]
# 确保这些查询中的每个样本只出现一次
num_samples = df.shape[0]
num_samples == low_alcohol[quality].count() + high_alcohol[quality].count() # 应为真
# 获取低酒精含量组和高酒精含量组的平均质量评分
low_alcohol.quality.mean(), high_alcohol.quality.mean()
# 获取残留糖分的中位数
df.residual_sugar.median()
# 选择残留糖分小于中位数的样本
low_sugar =df[df.residual_sugar < 3]
# 选择残留糖分大于等于中位数的样本
high_sugar =df[df.residual_sugar >= 3]
# 确保这些查询中的每个样本只出现一次
num_samples == low_sugar[quality].count() + high_sugar[quality].count() # 应为真
# 获取低糖分组和高糖分组的平均质量评分
low_sugar.quality.mean(), high_sugar.quality.mean()
colors=[red,white]
wine_df.groupby(color)[quality].mean().plot(kind=bar,title=ceshi1,colors=[red,white],alpha=.7)
引入sns matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
...
...
...
colors=[red,white]
color_means=wine_df.groupby(color)[quality].mean()
color_means.plot(kind=bar,title=ceshi1,colors=colors,alpha=.7)
plt.xlabel("colors",fontsize=18)
plt.ylabel("colors",fontsize=18)
counts= wine_df.groupby([quality,color]).count()[pH]
counts
totals=wine_df.groupby(color).count()[pH]
proportions = counts /totals
proportions.plot(kind=bar,title=ceshi1,colors=colors,alpha=.7)
import matplotlib.pyplot as plt
% matplotlib inline
plt.bar([1, 2, 3], [224, 620, 425]);
# 绘制条柱
plt.bar([1, 2, 3], [224, 620, 425])
# 为 x 轴指定刻度标签及其标签
plt.xticks([1, 2, 3], [a, b, c]);
# 用 x 轴的刻度标签绘制条柱
plt.bar([1, 2, 3], [224, 620, 425], tick_label=[a, b, c]);
plt.bar([1, 2, 3], [224, 620, 425], tick_label=[a, b, c])
plt.title(Some Title)
plt.xlabel(Some X Label)
plt.ylabel(Some Y Label);
# 用查询功能选择每个组,并获取其平均质量
median = df[alcohol].median()
low = df.query('alcohol < {}'.format(median))
high = df.query('alcohol >= {}'.format(median))
mean_quality_low = low[quality].mean()
mean_quality_high = high[quality].mean()
# 用合适的标签创建柱状图
locations = [1, 2]
heights = [mean_quality_low, mean_quality_high]
labels = [Low, High]
plt.bar(locations, heights, tick_label=labels)
plt.title(Average Quality Ratings by Alcohol Content)
plt.xlabel(Alcohol Content)
plt.ylabel(Average Quality Rating);
用 Matplotlib 绘制酒的类型和质量视图
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
sns.set_style(darkgrid)
wine_df = pd.read_csv(winequality_edited.csv)
# 获取每个等级和颜色的数量
color_counts = wine_df.groupby([color, quality]).count()[pH]
color_counts
# 获取每个颜色的总数
color_totals = wine_df.groupby(color).count()[pH]
color_totals
# 将红葡萄酒等级数量除以红葡萄酒样本总数,获取比例
red_proportions = color_counts[red] / color_totals[red]
red_proportions
# 将白葡萄酒等级数量除以白葡萄酒样本总数,获取比例
white_proportions = color_counts[white] / color_totals[white]
white_proportions
ind = np.arange(len(red_proportions)) # 组的 x 坐标位置
width = 0.35 # 条柱的宽度
# 绘制条柱
red_bars = plt.bar(ind, red_proportions, width, color=r, alpha=.7, label=Red Wine)
white_bars = plt.bar(ind + width, white_proportions, width, color=w, alpha=.7, label=White Wine)
# 标题和标签
plt.ylabel(Proportion)
plt.xlabel(Quality)
plt.title(Proportion by Wine Color and Quality)
locations = ind + width / 2 # x 坐标刻度位置
labels = [3, 4, 5, 6, 7, 8, 9] # x 坐标刻度标签
plt.xticks(locations, labels)
# 图例
plt.legend()
red_proportions[9] = 0
red_proportions
以上是关于pandas与matplotlib综合案例的主要内容,如果未能解决你的问题,请参考以下文章
高端实战 Python数据分析与机器学习实战 Numpy/Pandas/Matplotlib等常用库