pandas案例分析,附加numpy matplotlib

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了pandas案例分析,附加numpy matplotlib相关的知识,希望对你有一定的参考价值。


import pandas as pd

df = pd.read_csv('文件名.csv', sep=';')

如果数据文件中各字段之间出现的是 `;`,说明文件用分号 `;` 做分隔符,而不是默认的逗号 `,`,此时需要在 read_csv 中指定 sep=';'。

 

import pandas as pd
red_df = pd.read_csv('winequality-red.csv', sep=';')
white_df = pd.read_csv('winequality-white.csv', sep=';')
red_df.head()
white_df.head()

fixed_acidity

volatile_acidity

citric_acid

residual_sugar

chlorides

free_sulfur_dioxide

total_sulfur_dioxide

density

pH

sulphates

alcohol

quality

0

7.0

0.27

0.36

20.7

0.045

45.0

170.0

1.0010

3.00

0.45

8.8

6

1

6.3

0.30

0.34

1.6

0.049

14.0

132.0

0.9940

3.30

0.49

9.5

6

2

8.1

0.28

0.40

6.9

0.050

30.0

97.0

0.9951

3.26

0.44

10.1

6

3

7.2

0.23

0.32

8.5

0.058

47.0

186.0

0.9956

3.19

0.40

9.9

6

4

7.2

0.23

0.32

8.5

0.058

47.0

186.0

0.9956

3.19

0.40

9.9

6

 

print(red_df.shape)


(1599, 12)


print(white_df.shape)


(4898, 12)




red_df.isnull().sum()


fixed_acidity 0 volatile_acidity 0 citric_acid 0 residual_sugar 0 chlorides 0 free_sulfur_dioxide 0 total_sulfur-dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64


white_df.isnull().sum()


fixed_acidity 0 volatile_acidity 0 citric_acid 0 residual_sugar 0 chlorides 0 free_sulfur_dioxide 0 total_sulfur_dioxide 0 density 0 pH 0 sulphates 0 alcohol 0 quality 0 dtype: int64


white_df.duplicated().sum()  # 统计重复行的数量


937(白葡萄酒数据集中共有 937 行重复。但在本案例中,这些重复行是不可以删除的。)


红葡萄酒数据集中有多少唯一的质量值?

red_df.quality.nunique()


6


红葡萄酒数据集中的平均密度是多少?

red_df.density.mean()


0.996746679174484


 

import numpy as np

a=np.random.random(1000)

生成一个包含 1000 个随机数的一维数组(每个随机数都在 [0, 1) 区间内)

np.mean(a)

求 a 的平均值
 

 


# 导入 numpy 和 pandas
import numpy as np
import pandas as pd

# 加载红葡萄酒和白葡萄酒数据集
red_df = pd.read_csv('winequality-red.csv', sep=';')
white_df = pd.read_csv('winequality-white.csv', sep=';')


# 为红葡萄酒数据框创建颜色数组
color_red =  np.repeat(red, red_df.shape[0])
# 为白葡萄酒数据框创建颜色数组
color_white = np.repeat(white, white_df.shape[0])

red_df[color] = color_red
red_df.head()

fixed_acidity

volatile_acidity

citric_acid

residual_sugar

chlorides

free_sulfur_dioxide

total_sulfur-dioxide

density

pH

sulphates

alcohol

quality

color

0

7.4

0.70

0.00

1.9

0.076

11.0

34.0

0.9978

3.51

0.56

9.4

5

red

1

7.8

0.88

0.00

2.6

0.098

25.0

67.0

0.9968

3.20

0.68

9.8

5

red

2

7.8

0.76

0.04

2.3

0.092

15.0

54.0

0.9970

3.26

0.65

9.8

5

red

3

11.2

0.28

0.56

1.9

0.075

17.0

60.0

0.9980

3.16

0.58

9.8

6

red

4

7.4

0.70

0.00

1.9

0.076

11.0

34.0

0.9978

3.51

0.56

9.4

5

red

 

# 附加数据框
wine_df = red_df.append(white_df)

# 查看数据框,检查是否成功
wine_df.head()

alcohol

chlorides

citric_acid

color

density

fixed_acidity

free_sulfur_dioxide

pH

quality

residual_sugar

sulphates

total_sulfur-dioxide

total_sulfur_dioxide

volatile_acidity

0

9.4

0.076

0.00

red

0.9978

7.4

11.0

3.51

5

1.9

0.56

34.0

NaN

0.70

1

9.8

0.098

0.00

red

0.9968

7.8

25.0

3.20

5

2.6

0.68

67.0

NaN

0.88

2

9.8

0.092

0.04

red

0.9970

7.8

15.0

3.26

5

2.3

0.65

54.0

NaN

0.76

3

9.8

0.075

0.56

red

0.9980

11.2

17.0

3.16

6

1.9

0.58

60.0

NaN

0.28

4

9.4

0.076

0.00

red

0.9978

7.4

11.0

3.51

5

1.9

0.56

34.0

NaN

0.70

 

 

将新组合的数据框保存为 winequality_edited.csv。务必设置 index=False,以避免把行索引保存成一个未命名的列!


wine_df.to_csv(winequality_edited.csv, index=False)


 

new_labels=list(red_df.columns)

new_labels[6]=total_sulfur_dioxide

red_df.columns=new_labels

 

groupby函数

 

red_df.groupby(quality).mean()

按质量(quality)分组,求出每组中各列的平均值。

red_df.groupby([quality,color]).mean()

red_df.groupby([quality,color],as_index=False).mean()

设置 as_index=False 后,质量和颜色不再作为结果的索引,而是作为普通列保留在结果中。

如果只想对某一列求平均值,可以在分组之后用方括号选取该列:

red_df.groupby([quality,color],as_index=False)[ph].mean()

 


df.groupby(color).mean().quality


df.describe().pH


bin_edges = [2.72, 3.11, 3.21, 3.32, 4.01]


bin_names = [high, mod_high, medium, low]


df[acidity_levels] = pd.cut(df[pH], bin_edges, labels=bin_names)


df.head()


df.groupby(acidity_levels).mean().quality


df.to_csv(winequality_edited.csv, index=False)


等效语句(下面每组中的两行写法等价:前一行使用布尔索引,后一行使用 query 方法)

# selecting malignant records in cancer data
df_m = df[df['diagnosis'] == 'M']
df_m = df.query('diagnosis == "M"')

# selecting records of people making over $50K
df_a = df[df['income'] == ' >50K']
df_a = df.query('income == " >50K"')


# get the median amount of alcohol content


# 获取酒精含量的中位数
df.alcohol.median()


# 选择酒精含量小于中位数的样本
low_alcohol =df[df.alcohol < 10.3]

# 选择酒精含量大于等于中位数的样本
high_alcohol =df[df.alcohol >= 10.3]

# 确保这些查询中的每个样本只出现一次
num_samples = df.shape[0]
num_samples == low_alcohol[quality].count() + high_alcohol[quality].count() # 应为真

# 获取低酒精含量组和高酒精含量组的平均质量评分

low_alcohol.quality.mean(), high_alcohol.quality.mean()

# 获取残留糖分的中位数


df.residual_sugar.median()


# 选择残留糖分小于中位数的样本
low_sugar =df[df.residual_sugar < 3]

# 选择残留糖分大于等于中位数的样本
high_sugar =df[df.residual_sugar >= 3]

# 确保这些查询中的每个样本只出现一次
num_samples == low_sugar[quality].count() + high_sugar[quality].count() # 应为真

 

# 获取低糖分组和高糖分组的平均质量评分
low_sugar.quality.mean(), high_sugar.quality.mean()

 

colors=[red,white]

wine_df.groupby(color)[quality].mean().plot(kkind=bar,title=ceshi1,colors=[red,white],alpha=.7)

 

 

 

引入 seaborn(简写为 sns)和 matplotlib

import pandas as pd

import matplotlib.pyplot  as  plt

import seaborn as sns

%matplotlib inline

...

...

...

colors=[red,white]

color_means=wine_df.groupby(color)[quality].mean()

color_means.plot(kkind=bar,title=ceshi1,colors=colors,alpha=.7)

plt.xlabel("colors",fontsize=18)

plt.ylabel("colors",fontsize=18)

 

 

 

counts= wine_df.groupby([quality,color]).count()[pH]

counts

totals=wine_df.groupby(color).count()[pH]

proportions = counts /totals

proportions.plot(kind=bar,title=ceshi1,colors=colors,alpha=.7)

 

 

 

import matplotlib.pyplot as plt
% matplotlib inline

plt.bar([1, 2, 3], [224, 620, 425]);

pandas案例分析,附加numpy

 

# 绘制条柱
plt.bar([1, 2, 3], [224, 620, 425])

# 为 x 轴指定刻度标签及其标签
plt.xticks([1, 2, 3], [a, b, c]);

pandas案例分析,附加numpy

# 用 x 轴的刻度标签绘制条柱
plt.bar([1, 2, 3], [224, 620, 425], tick_label=[a, b, c]);

 

plt.bar([1, 2, 3], [224, 620, 425], tick_label=[a, b, c])
plt.title(Some Title)
plt.xlabel(Some X Label)
plt.ylabel(Some Y Label);

pandas案例分析,附加numpy

 

# 用查询功能选择每个组,并获取其平均质量
median = df[alcohol].median()
low = df.query('alcohol < {}'.format(median))
high = df.query('alcohol >= {}'.format(median))

mean_quality_low = low[quality].mean()
mean_quality_high = high[quality].mean()

 

# 用合适的标签创建柱状图
locations = [1, 2]
heights = [mean_quality_low, mean_quality_high]
labels = [Low, High]
plt.bar(locations, heights, tick_label=labels)
plt.title(Average Quality Ratings by Alcohol Content)
plt.xlabel(Alcohol Content)
plt.ylabel(Average Quality Rating);

 

 

用 Matplotlib 绘制酒的类型和质量视图

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
import seaborn as sns
sns.set_style(darkgrid)

wine_df = pd.read_csv(winequality_edited.csv)

# 获取每个等级和颜色的数量
color_counts = wine_df.groupby([color, quality]).count()[pH]
color_counts

# 获取每个颜色的总数
color_totals = wine_df.groupby(color).count()[pH]
color_totals

# 将红葡萄酒等级数量除以红葡萄酒样本总数,获取比例
red_proportions = color_counts[red] / color_totals[red]
red_proportions

# 将白葡萄酒等级数量除以白葡萄酒样本总数,获取比例
white_proportions = color_counts[white] / color_totals[white]
white_proportions

ind = np.arange(len(red_proportions))  # 组的 x 坐标位置
width = 0.35       # 条柱的宽度

# 绘制条柱
red_bars = plt.bar(ind, red_proportions, width, color=r, alpha=.7, label=Red Wine)
white_bars = plt.bar(ind + width, white_proportions, width, color=w, alpha=.7, label=White Wine)

# 标题和标签
plt.ylabel(Proportion)
plt.xlabel(Quality)
plt.title(Proportion by Wine Color and Quality)
locations = ind + width / 2  # x 坐标刻度位置
labels = [3, 4, 5, 6, 7, 8, 9]  # x 坐标刻度标签
plt.xticks(locations, labels)

# 图例
plt.legend()

 

red_proportions[9] = 0
red_proportions

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

以上是关于pandas案例分析,附加numpy matplotlib的主要内容,如果未能解决你的问题,请参考以下文章

用于附加和创建pandas数据帧的快速numpy数组结构

无法在嵌套循环中使用 pandas 附加更大的数据帧。如何更改为 numpy 向量化?

为啥使用numpy和pandas来进行数据处理?

数据分析模块Numpy Pandas

Python数据分析-Pandas

numpy pandas1