利用python 进行数据清洗
Posted 11-21
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了利用python 进行数据清洗相关的知识,希望对你有一定的参考价值。
import pandas as pd
data.to_csv("路径",encoding="utf-8") 保存文件(注:encoding 参数须为字符串)
data.head() 查看前几行 data.tail() 查看后几行 data.shape 几行几列 data.index 查看索引 data.columns 查看标题 data.values 查看值 data.info 查看整体结构 data.describe() 对数值型数据进行描述统计 data.value_counts()对值计数 data.sort_index(axis=1/0) 对索引进行排序 参数ascending=False 降序排序 data.sort_values(by="columns") 按照某一列的值进行排序 inplace=True 修改原始数据
选取数据 data.columns data["columns"] data.loc[] 显式索引 data.iloc[] 隐式索引
## 映射函数 data.apply()
#data.apply(abs) 将abs作用于data的每一个数据 data=pd.Series([1,-2,3,-3],index=["a","b","c","d"]) data.apply(abs)
Out[6]:
In [ ]:
data.iloc[[0,2],data.columns.get_loc("one")] 混合索引 0、2 行 one列 data.iloc[[0,2],data.columns.get_indexer(["one","two"])] 同时得到2列
布尔型索引
In [ ]:
布尔型索引
| 代表 or & 代表 and ~ 代表 not
data[(df.one>0)&(df.two>0)] data[(df.one>0)|(df.two>0)] data[~(df.two>0)]
用于筛选需要的子集
In [9]:
import numpy as np ser=pd.Series(np.arange(5),index=np.arange(5)[::-1],dtype="int32") ser
Out[9]:
In [10]:
ser.isin([2,3,4])#查看是否存在 2,3,4
Out[10]:
sample(),方法随机抽样
In [11]:
###随机抽样 sample()方法从Series或者DataFrame中随机选择行或列 ser.sample()
参数 n=None, #抽取多少个 frac=None, #抽取多少比例 replace=False, #是否为有放回抽样 weights=None, #设定每一行的权重 random_state=None, #是否需要重现随机的结果,设置随机数种子 axis=None # 设定是对行采样还是对列采样
In [12]:
ser=pd.Series([1,2,3,4,5])
ser
Out[12]:
In [13]:
ser.sample()#默认抽取一个
Out[13]:
In [14]:
ser.sample(4)#默认抽取4个
Out[14]:
In [15]:
ser.sample(frac=0.8)#抽取80%
Out[15]:
In [16]:
#不加参数默认进行不放回抽样,使用replace 替换抽样方式 ser.sample(n=5,replace=False)# 不放回
Out[16]:
In [17]:
ser.sample(n=5,replace=True)#有放回
Out[17]:
In [24]:
ser_weight=[0.1,0.2,0.2,0.3,0.4] ser.sample(n=4,weights=ser_weight ) #总体权重和为1 如果输入的值之和不为1,会重新归一化
Out[24]:
In [25]:
#在采样中,会用DataFrame的某一列作为权重 df=pd.DataFrame({"first":[4,5,6,7],"weight_column":[0.3,0.4,0.2,0.1]}) df
Out[25]:
In [27]:
df.sample(n=2,weights="weight_column")
Out[27]:
In [29]:
df.sample(n=2,axis=1)
Out[29]:
In [32]:
df.sample(n=2,random_state=2)
Out[32]:
In [33]:
?df.sample
数据合并
In [39]:
df1=pd.DataFrame({"A":["A0","A1","A2","A3"],"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "D":["D0","D1","D2","D3"]},index=[0,1,2,3]) df2=pd.DataFrame({"A":["A0","A1","A2","A3"],"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "D":["D0","D1","D2","D3"]},index=[4,5,6,7]) df3=pd.DataFrame({"A":["A0","A1","A2","A3"],"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "D":["D0","D1","D2","D3"]},index=[8,9,10,11])
In [41]:
print(df1);print(df2);print(df3)
用pd.concat()合并数据
In [ ]:
###用pd.concat()合并对象 参数 pd.concat() objs, 数据集 axis=0, 轴线 默认0 join=‘outer‘, 连接方式 inner outer join_axes=None, 用指定的轴进行合并 ignore_index=False, 保留原索引 /True 忽略原索引,生成新的连续索引 keys=None, 指定不同数据源 levels=None, names=None, verify_integrity=False, copy=True)
In [43]:
pd.concat([df1,df2,df3])#纵向合并(默认 axis=0,按行拼接)
Out[43]:
In [45]:
df4=pd.DataFrame({"B":["B0","B1","B2","B3"], "C":["C0","C1","C2","C3"], "E":["E0","E1","E4","E5"]},index=[0,1,4,5]) df4
Out[45]:
In [46]:
pd.concat([df1,df4],axis=1)#横向合并
Out[46]:
In [47]:
pd.concat([df1,df4],axis=1,join="inner")#取交集
Out[47]:
In [49]:
pd.concat([df1,df4],axis=1,join_axes=[df1.index])#指定合并的轴(注:join_axes 参数在 pandas 1.0 后已移除,可改用 reindex)
Out[49]:
In [52]:
pd.concat([df1,df4],ignore_index=False)
Out[52]:
In [53]:
pd.concat([df1,df4],ignore_index=True)#生成新的index
Out[53]:
In [54]:
ser=pd.Series(["s0","s1","s2","s3"],name="s") ser
Out[54]:
In [56]:
pd.concat([df1,ser],axis=1)#合并之后Series的名称自动成为列名称,不指定name自动生成
Out[56]:
In [61]:
pd.concat([df1,df2,df3],keys=["one","two","three"])#区分不同的数据来源
Out[61]:
In [60]:
data=pd.concat([df1,df2,df3]) dic={"one":df1,"two":df2,"three":df3} pd.concat(dic) #也可以区分不同的数据集
Out[60]:
用append 实现合并
In [ ]:
df.append()
[63]:
df1.append(df4)
Out[63]:
In [64]:
df1.append([df2,df3])
Out[64]:
#用append方法添加新行
In [65]:
ser3=pd.Series(["q1","q2","q3","q4"],index=["A","B","C","D"]) ser3
Out[65]:
In [67]:
df1.append(ser3,ignore_index=True)
Out[67]:
pandas 数据清洗案例
In [71]:
import pandas as pd
1.数据的读取
In [84]:
df=pd.read_csv("taobao.csv",encoding="gbk") df.head()
Out[84]:
In [86]:
df.tail(10)
Out[86]:
2.快速探索
In [87]:
df.info()
Out[88]:
In [88]:
#查看描述统计信息
df.describe()
Out[88]:
3.数据的选择
In [90]:
#行的选择 df[0:5] df.iloc[0:5]
Out[90]:
In [91]:
#列的选择
cols=df[["宝贝","价格"]] type(cols) cols.head() cols=df[["宝贝","价格"]].head()#数据太多读取太慢,可选择只查看多少行 cols
Out[91]:
In [95]:
#区域的选择
df.loc[0:3,["宝贝","价格"]] df.loc[df.index[0:3],["宝贝","价格"]]
Out[95]:
4.数据的整理
In [96]:
df["销售额"]=df["价格"]*df["成交量"] df.head()
Out[96]:
In [102]:
#过滤掉价格>=100,成交量<8000的商品信息 df[(df["价格"]<100)&(df["成交量"]>=8000)]
Out[102]:
In [105]:
#将位置设置为索引 #df.index=df["位置"] df1=df.set_index("位置") df1.head()
Out[105]:
In [106]:
#排序
df2=df1.sort_index()
df2.head()
Out[106]:
In [113]:
#两个索引
df3=df.set_index(["位置","品牌"]) df3.head() #并根据位置进行排序 #df4=df3.sort_index(level=0) df4=df3.sort_index(level="位置") df4.head()
Out[113]:
groupby 分组汇总
In [126]:
#删除不需要的数据 deal=df.drop(["宝贝","品牌","位置"],axis=1) deal.head() #inplace=False 不修改原始数据 True 修改原始数据
Out[126]:
In [127]:
deal.groupby("位置").mean()#均值
Out[127]:
In [128]:
df["成交量"].groupby(df["位置"]).mean()
Out[128]:
In [130]:
df["成交量"].groupby([df["位置"],df["品牌"]]).mean() #按多组列进行分组
Out[130]:
5.数据合并
In [132]:
#创建数据 df1=df[20:30][["位置","品牌"]] df1.head()
Out[132]:
In [133]:
df2=df[25:35][["品牌","价格","成交量"]] df2.head()
Out[133]:
In [135]:
df2.info()
In [136]:
#pd.merge 根据一个或多个KEY值,将DataFrame连接(join) #pd.concat 沿着一个轴拼接 #combine_first 如果有缺失值,另外要给数据集对其进行填充
In [143]:
pd.merge(df1,df2).head()
Out[143]:
In [142]:
pd.merge(df1,df2,how="outer").head()#how默认为 inner 可修改为 outer left right
Out[142]:
In [145]:
#索引合并 pd.merge(df2,df1,left_index=True,right_index=True).head()
Out[145]:
6.数据重塑
DataFrame 创建数据是无序的
In [2]:
import pandas as pd df=pd.DataFrame({"日期":["2017-01-01","2017-01-02","2017-01-03","2017-02-03","2017-02-04","2017-03-01","2017-03-02"],"最高气温": [12,13,14,15,16,17,15],"最低气温":[7,8,8,9,12,3,5],"天气":["晴","多云","多云","小雨","小雨","晴","阴"],"风向": ["西北风","东北风","东北风","西北风","西北风","北风","南风"],"风力":[2,2,2,1,2,3,2]})
reindex 可对DataFrame 进行排序
In [157]:
df=df.reindex(["日期"]+["最高气温"]+["最低气温"]+["天气"]+["风向"]+["风力"],axis=1) df.head()
Out[157]:
In [160]:
df.stack()#列转化为层级的Series
Out[160]:
In [161]:
df.stack().unstack()#还原
Out[161]:
数据透视表
In [ ]:
pd.pivot_table() data, 数据集 values=None, 值是谁 index=None, 索引是谁 columns=None, 标题是谁 aggfunc=‘mean‘, 聚合的函数是谁 fill_value=None, margins=False, dropna=True, 是否丢弃全为缺失值的列 margins_name=‘All‘
In [164]:
df_table=pd.pivot_table(df,index=["天气"],columns=["风向"],values=["最高气温"]) df_table
Out[164]:
In [165]:
df_table.info()
In [3]:
import numpy as np
In [4]:
df=pd.DataFrame({"日期":["2017-01-01","2017-01-02","2017-01-03","2017-02-03","2017-02-04","2017-03-01","2017-03-02"],"最高气温": [12,13,14,15,np.nan,17,15],"最低气温":[7,8,8,np.nan,12,3,5],"天气":[np.nan,"多云","多云","小雨","小雨","晴","阴"],"风向": ["西北风",np.nan,"东北风","西北风",np.nan,"北风","南风"],"风力":[2,2,np.nan,1,2,3,2]})
In [5]:
df
Out[5]:
缺失值的处理
In [7]:
df.isnull()#发现缺失值 True 为有缺失
Out[7]:
In [9]:
df.notnull()#发现缺失值 False 为有缺失
Out[9]:
In [10]:
df.dropna(axis=0)#删除有缺失值的行
Out[10]:
In [11]:
df.dropna(axis=1)#删除有缺失值的列
Out[11]:
缺失值的填充
In [13]:
#用字符串填充 df.fillna("missing")
Out[13]:
In [15]:
#使用前一个数值代替 df.fillna(method="pad")
Out[15]:
In [16]:
df.fillna(method="pad",limit=1)#只向下或向上填充一个,填充过多数据不准
Out[16]:
In [17]:
#向后填充 df.fillna(method="bfill")
Out[17]:
In [18]:
#用均值填充 df.fillna(df.mean())
Out[18]:
In [19]:
df.fillna(df.mean()["最低气温":"最高气温"])#只对指定的列用均值填充
Out[19]:
In [21]:
df.loc[:,"最低气温":"最高气温"].fillna(df.mean())
Out[21]:
检测和过滤异常值
In [24]:
#参照正态分布 定义 >3 标准差或 <-3 标准差的值为异常值(此处数据量小,示例改用 1 个标准差) sta=(df["最高气温"]-df["最高气温"].mean())/df["最高气温"].std() sta.abs()>1
Out[24]:
In [40]:
df["最高温度是否异常"]=sta.abs()>1 df
Out[40]:
In [41]:
df["最高温度是否异常"].value_counts()
Out[41]:
In [44]:
#用箱线图定义异常值
h=df["最高气温"] iqr=h.quantile(0.75)-h.quantile(0.25) df_max=h.quantile(0.75)+1.5*iqr df_min=h.quantile(0.25)-1.5*iqr
In [45]:
df_max
Out[45]:
In [46]:
df_min
Out[46]:
In [52]:
df["isouter"]=(h>df_max)|(h<df_min)
In [53]:
df
Out[53]:
重复值 duplicated
In [60]:
df.duplicated()
Out[60]:
In [55]:
df.duplicated("风力")
Out[55]:
In [57]:
d2=df.drop_duplicates("风力")#删除有重复项的行
In [59]:
d2
Out[59]:
时间数据的处理
In [61]:
import time
In [62]:
time.time()#时间戳是指格林威治时间自1970年1月1日(00:00:00 GMT)至当前时间的总秒数 #北京时间1970年1月1日(08:00:00)
Out[62]:
In [63]:
time.localtime()
Out[63]:
时间格式的转换
In [ ]:
time.strftime()#format 时间格式 %Y Year with century as a decimal number. %m Month as a decimal number [01,12]. %d Day of the month as a decimal number [01,31]. %H Hour (24-hour clock) as a decimal number [00,23]. %M Minute as a decimal number [00,59]. %S Second as a decimal number [00,61]. %z Time zone offset from UTC. %a Locale‘s abbreviated weekday name. %A Locale‘s full weekday name. %b Locale‘s abbreviated month name. %B Locale‘s full month name. %c Locale‘s appropriate date and time representation. %I Hour (12-hour clock) as a decimal number [01,12]. %p Locale‘s equivalent of either AM or PM.
time.strftime()格式化系统时间
In [64]:
time.strftime("%Y-%m-%d",time.localtime())#把当前时间转换成可读形式,注意转换之后为str格式
Out[64]:
In [66]:
s=time.strftime("%Y-%m-%d",time.localtime())
Out[66]:
In [69]:
type(s)
Out[69]:
In [195]:
d=time.strptime(s,"%Y-%m-%d")#返回 time.struct_time 格式的时间(注意并非 datetime 对象)
Out[195]:
In [196]:
type(d)
Out[196]:
将时间戳转换成系统时间
In [70]:
time.localtime(1533785557.0)
Out[70]:
In [74]:
time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(1533785557.0))
Out[74]:
时间数据的操作
In [91]:
import datetime import pandas as pd import numpy as np
In [80]:
#取当前datetime格式的时间 datetime.datetime(2018,8,8)
Out[80]:
In [ ]:
pd.date_range() start=None, 开始 end=None, 结束 periods=None, 生成多少个 freq=‘D‘, 默认按天计算 tz=None, normalize=False, name=None, closed=None, **kwargs)
生成时间序列数据
In [81]:
#生成时间序列数据 pd.date_range(datetime.datetime(2018,8,8),periods=4)
Out[81]:
In [89]:
pd.date_range("2018-8-8",periods=4)#指定生成个数
Out[89]:
In [86]:
pd.date_range("2018-8-8","2018-9-9")#指定起始日期
Out[86]:
In [88]:
pd.date_range("2018-8-8 11:00","2018-8-9 00:00",freq="H")#按小时生成序列
Out[88]:
In [93]:
ser=pd.Series(np.arange(10),index=pd.date_range("2018-8-9",periods=10)) ser
Out[93]:
In [94]:
ser["2018-8-9"]
Out[94]:
In [96]:
ser.index[2].year
Out[96]:
In [97]:
ser.index[2].month
Out[97]:
In [99]:
ser.index[2].day
Out[99]:
修改日期格式
pd.to_datetime()
In [123]:
df=pd.DataFrame({"日期":["2017-01-01","2017-01-02","2017-01-03","2017-02-03","2017-02-04","2017-03-01","2017-03-02"],"最高气温": [12,13,14,15,np.nan,17,15],"最低气温":[7,8,8,np.nan,12,3,5],"天气":[np.nan,"多云","多云","小雨","小雨","晴","阴"],"风向": ["西北风",np.nan,"东北风","西北风",np.nan,"北风","南风"],"风力":[2,2,np.nan,1,2,3,2]})
In [124]:
df.info()
format
In [125]:
df["日期"]=pd.to_datetime(df["日期"].values,format="%Y-%m-%d")
In [126]:
df.info()
In [109]:
df
Out[109]:
In [130]:
#将日期设置为索引 df=df.set_index("日期")
In [131]:
df
Out[131]:
In [117]:
#提取1月份的数据
df_join=df[(df.index>="2017-01-01")&(df.index<="2017-02-01")]#注意时间输入需与索引格式一致 df_join
Out[117]:
In [119]:
df["2017-01-01":"2017-01-31"].info()
In [132]:
#转换成月份 df.to_period("M")
Out[132]:
处理字符型数据
In [155]:
data=pd.DataFrame({"Rank":[1,2,3,4,5],"city":["london","benrlin]","madind","rome","pans"],"state":[" kingdom"," gemany","spain ","ltaly","frnce"], "popuiation":["8,615,246","3,437,916","3,165,235","2,872,086","2,273,305"],"dateofcensusestumate":["1 june 2014","31 may 2014", "1 january 2014","30 september 2014","1 jannany 2013"]})
In [156]:
data
Out[156]:
In [157]:
date=data.reindex(["Rank"]+["city"]+["state"]+["popuiation"]+["dateofcensusestumate"],axis=1)#排序
In [158]:
date
Out[158]:
In [159]:
date.info()
去掉逗号
split()分割函数
In [160]:
date["popuiation"].apply(lambda x :x.split(","))#按照逗号分隔
Out[160]:
In [161]:
date["popuiation"].apply(lambda x :x.replace(",",""))#把逗号替代为空 #lambda 匿名函数 #apply 循环
Out[161]:
replace()替换函数
In [162]:
subtr=date["popuiation"].apply(lambda x : int(x.replace(",","")))
In [163]:
date["numericpopuiation"]=subtr date
Out[163]:
In [165]:
date["state"].values# 发现数据有空格
Out[165]:
strip()剔除前后空格函数
In [167]:
date["state"].apply(lambda x :x.strip())#剔除前后空格
Out[167]:
In [170]:
stri=date["state"].apply(lambda x :x.strip())#空格没有了 date["stace"]=stri date["stace"].values
Out[170]:
对指定字符串进行处理
如果我们需要在一系列文本中提取数据?
正则表达式通常被用来检索某个规则的文本
In [172]:
str_1=pd.DataFrame({"title":["网名最喜欢的旅游目的地榜单出炉","让生活更幸福是旅游业的使命","一带一路国家中东欧游客增两倍","旅游业改革开启旅游强国新篇章"], "link":["http//cntour.cninews/4221/","http//cntour.cninews/4212/","http//cntour.cninews/4202/","http//cntour.cninews/4191/"]})
In [173]:
str_1
Out[173]:
str.extract()运用正则表达式(表达式详情百度查看)
注:括号里面是需要的内容
In [175]:
str_1["link"]
Out[175]:
In [178]:
str_1["link"].str.extract("ews/(.+)/",expand=False)
Out[178]:
In [181]:
str_2=str_1["link"].str.extract("ews/(.+)/",expand=False) str_1["links"]=str_2 str_1
Out[181]:
In [188]:
dic={"4221":"过","4212":"来","4202":"玩","4191":"啊"}
In [191]:
%%time str_1["linkss"]=str_1["links"].map(dic)#map 映射函数,可将dic的值根据键一一对应,映射到str_1 str_1 Wall time: 3 ms
以上是关于利用python 进行数据清洗的主要内容,如果未能解决你的问题,请参考以下文章