pandas基础
Posted 未来可期-2018
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了pandas基础相关的知识,希望对你有一定的参考价值。
文章目录
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
1. pandas简介
# A Series built from a plain list; np.nan marks the missing entry.
s = pd.Series([1, 3, 6, np.nan, 44, 1])

# Six consecutive daily timestamps starting at 2016-01-01, used as row labels.
dates = pd.date_range(start="20160101", periods=6)
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=list("abcd"),
)

# Row labels of the frame.
df.index
# Column labels of the frame.
df.columns
# Underlying data as a NumPy array.
df.values
# Summary statistics for each column.
df.describe()
# Transposed frame (rows and columns swapped).
df.T
# Columns ordered by label, descending.
df.sort_index(axis="columns", ascending=False)
# Rows ordered by index label, descending.
df.sort_index(axis="index", ascending=False)
# Rows ordered by the values of one or more columns, given as a list.
df.sort_values(["d", "a"])
2. 选择数据
# 6x4 frame holding 0..23, indexed by the days 2016-01-01 .. 2016-01-06.
dates = pd.date_range(start="20160101", end="20160106")
df = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list("ABCD"),
)

# Select a single column, by key or by attribute access.
df["A"]
df.A
# Slice the first three rows.
df[0:3]

# Select by label with .loc: one row, then two columns.
df.loc["20160101", :]
df.loc[:, ["A", "B"]]

# Select by integer position with .iloc: rows 1, 3, 5 and columns 1..2.
df.iloc[[1, 3, 5], 1:3]

# Boolean mask: keep the rows where column A is greater than 8.
df[df["A"] > 8]
3. 设置值
# 6x4 frame holding 0..23, indexed by the days 2016-01-01 .. 2016-01-06.
dates = pd.date_range("20160101", "20160106")
df = pd.DataFrame(
    data=np.arange(24).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# Set a single cell by integer position.
df.iloc[1, 1] = 111
# Set a single cell by row/column label.
df.loc["20160101", "A"] = 222

# Conditionally overwrite values in one column.
# Use a single .loc call: the chained form `df.A[df.A > 4] = 0` assigns
# through an intermediate object and may leave `df` itself unchanged.
df.loc[df["A"] > 4, "A"] = 0

# Add a column filled with a scalar (here: all NaN).
df["E"] = np.nan
# Add a column from a Series; values are aligned on the index.
df["F"] = pd.Series(data=[1, 2, 3, 4, 5, 6], index=dates)
4. 处理丢失数据
# 6x4 frame holding 0..23, indexed by the days 2016-01-01 .. 2016-01-06.
# The data is created as float so that NaN can be stored without relying on
# an implicit int -> float upcast when the cells are assigned below.
dates = pd.date_range("20160101", "20160106")
df = pd.DataFrame(
    data=np.arange(24, dtype=float).reshape((6, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)

# Punch two holes into the data.
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan

# Drop every row that contains at least one NaN.
df.dropna(axis=0, how="any")
# Drop every column that contains at least one NaN.
df.dropna(axis=1, how="any")
# Replace every NaN cell with the given value.
df.fillna(value=0)
# Count the missing cells in each column.
df.isnull().sum()
5. 导入导出
# read_csv options of interest:
#   sep        field delimiter
#   encoding   text encoding of the file, e.g. "utf-8"
#   index_col  column of the source file to use as the index (not used below)
df = pd.read_csv(filepath_or_buffer="student.csv", sep=",", encoding="utf-8")

# Assign column names. The new list must have exactly one name per existing
# column (an empty list raises a length-mismatch error), so this placeholder
# keeps the current names; replace it with your own list.
df.columns = list(df.columns)

# Save to a file. The keyword for DataFrame.to_csv is `path_or_buf`
# (`filepath_or_buffer` belongs to read_csv only). The output name is a
# placeholder; change it to the desired destination.
df.to_csv(path_or_buf="student_out.csv", encoding="utf-8")
6. df合并
# Three 3x4 frames with identical columns, filled with 0, 1 and 2.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
# Stack the frames row-wise and renumber the index 0..n-1.
pd.concat((df1, df2, df3), axis=0, ignore_index=True)

# Two frames whose column sets only partially overlap (b, c, d are shared).
df1 = pd.DataFrame(data=np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(data=np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'])
# Inner join: keep only the columns present in both frames.
pd.concat((df1, df2), join='inner', ignore_index=True)
# Outer join: keep the union of the columns; missing cells become NaN.
pd.concat((df1, df2), join='outer', ignore_index=True)
# Append df2's rows below df1. DataFrame.append was removed in pandas 2.0;
# pd.concat with the default outer join is the equivalent call.
pd.concat((df1, df2), ignore_index=True)
# Two frames sharing a "key" column. The column data must be passed as a
# dict, so the literal needs its surrounding braces.
left = pd.DataFrame({
    "key": ["K0", "K1", "K2", "K3"],
    "A": ["A0", "A1", "A2", "A3"],
    "B": ["B0", "B1", "B2", "B3"],
})
right = pd.DataFrame({
    "key": ["K0", "K1", "K2", "K3"],
    "C": ["C0", "C1", "C2", "C3"],
    "D": ["D0", "D1", "D2", "D3"],
})

# pd.merge joins the two frames on matching key values, placing the columns
# of both frames side by side.
pd.merge(left=left, right=right, on="key")

# Same join, naming the key column on each side explicitly.
pd.merge(left=left,
         right=right,
         left_on="key",
         right_on="key",
         how="inner")

# Join on the row index of both frames instead of a column.
pd.merge(left=left,
         right=right,
         left_index=True,
         right_index=True)
7. plot画图
# 1000 rows of random samples in four columns.
data = pd.DataFrame(np.random.randn(1000, 4), columns=list("ABCD"))

# Line plot of a single column (Series.plot).
data["A"].plot()
plt.show()

# Scatter plot of column A against column B (DataFrame.plot.scatter).
data.plot.scatter(x="A", y="B")
plt.show()
以上是关于pandas基础的主要内容,如果未能解决你的问题,请参考以下文章。