Data Wrangling: Clean, Transform, Merge, Reshape
Posted by wangshuang1631
A note before we begin:
All the data used in the examples can be downloaded from GitHub in a single archive.
The address is: http://github.com/pydata/pydata-book
One more thing worth stating up front:
I am using Python 2.7. Some of the code in the book contains errors; the listing below is the version I debugged and got running under 2.7.
# coding: utf-8
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
df1 = DataFrame({'key':['b','b','a','c','a','a','b'],'data1':range(7)})
df2 = DataFrame({'key':['a','b','d'],'data2':range(3)})
df1
df2
pd.merge(df1,df2)
pd.merge(df1,df2,on='key')
df3 = DataFrame({'lkey':['b','b','a','c','a','a','b'],'data1':range(7)})
df4 = DataFrame({'rkey':['a','b','d'],'data2':range(3)})
pd.merge(df3,df4,left_on='lkey',right_on='rkey')
pd.merge(df1,df2,how='outer')
df5 = DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
df6 = DataFrame({'key':['a','b','a','b','d'],'data2':range(5)})
df5
df6
pd.merge(df5,df6,on='key',how='left')
pd.merge(df5,df6,how='inner')
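A note on the many-to-many merge above: the result is the Cartesian product of the matching rows. Since 'b' appears three times in df5 and twice in df6, a quick sanity check (my own sketch, not from the book) should count six 'b' rows in the inner join:
merged = pd.merge(df5,df6,on='key',how='inner')
len(merged[merged['key'] == 'b'])  # 3 * 2 = 6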
left = DataFrame({'key1':['foo','foo','bar'],'key2':['one','two','one'],'lval':[1,2,3]})
right = DataFrame({'key1':['foo','foo','bar','bar'],'key2':['one','one','one','two'],'rval':[4,5,6,7]})
pd.merge(left,right,on=['key1','key2'],how='outer')
pd.merge(left,right,on='key1')
pd.merge(left,right,on='key1',suffixes=('_left','_right'))
left1 = DataFrame({'key':['a','b','a','a','b','c'],'value':range(6)})
right1 = DataFrame({'group_val':[3.5,7]},index=['a','b'])
left1
right1
pd.merge(left1,right1,left_on='key',right_index=True)
pd.merge(left1,right1,left_on='key',right_index=True,how='outer')
lefth = DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],'key2':[2000,2001,2002,2001,2002],'data':np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6,2)),index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],[2001,2000,2000,2000,2001,2002]],columns=['event1','event2'])
lefth
righth
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)
pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')
left2 = DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['Ohio','Nevada'])
right2 = DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['b','c','d','e'],columns=['Missouri','Alabama'])
left2
right2
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)
left2.join(right2,how='outer')
left1.join(right1,on='key')
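join defaults to a left join on the calling frame's keys against the other frame's index, so the call above is roughly equivalent to this merge (a sketch using the same frames):
pd.merge(left1,right1,left_on='key',right_index=True,how='left')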
another = DataFrame([[7.,8.],[9.,10.],[11.,12.],[16.,17.]],index=['a','c','e','f'],columns=['New York','Oregon'])
left2.join([right2,another])
left2.join([right2,another],how='outer')
arr = np.arange(12).reshape((3,4))
arr
np.concatenate([arr,arr],axis=1)
s1 = Series([0,1],index=['a','b'])
s2 = Series([2,3,4],index=['c','d','e'])
s3 = Series([5,6],index=['f','g'])
pd.concat([s1,s2,s3])
pd.concat([s1,s2,s3],axis=1)
s4 = pd.concat([s1 * 5,s3])
pd.concat([s1,s4],axis=1)
pd.concat([s1,s4],axis=1,join='inner')
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])
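join_axes pins the result to an explicit index instead of taking the union or intersection of the inputs' indexes. Newer pandas versions removed this argument; there (an assumption about your install) the equivalent is a reindex:
pd.concat([s1,s4],axis=1).reindex(['a','c','b','e'])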
result = pd.concat([s1,s1,s3],keys=['one','two','three'])
result
result.unstack()
pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])
df1 = DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df2 = DataFrame(5 + np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
pd.concat([df1,df2],axis=1,keys=['level1','level2'])
pd.concat({'level1':df1,'level2':df2},axis=1)
pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower'])
df1 = DataFrame(np.arange(12).reshape(3,4),columns=['a','b','c','d'])
df2 = DataFrame(np.arange(6).reshape(2,3),columns=['b','d','a'])
df1
df2
pd.concat([df1,df2],ignore_index=True)
a = Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
b = Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])
b[-1] = np.nan
a
b
np.where(pd.isnull(a),b,a)
b[:-2].combine_first(a[2:])
df1 = DataFrame({'a':[1,np.nan,5,np.nan],
'b':[np.nan,2,np.nan,6],
'c':range(2,18,4)})
df2 = DataFrame({'a':[5,4,np.nan,3,7],
'b':[np.nan,3,4,6,8]})
df1.combine_first(df2)
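combine_first can be read as "patch the caller's missing values with values from the argument", aligning on index. For Series it behaves like the np.where expression shown earlier, so this sketch should reproduce that output (with an index attached):
a.combine_first(b)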
data = DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['Ohio','Colorado'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
data
result = data.stack()
result
result.unstack()
result.unstack(0)
result.unstack('state')
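stack and unstack are inverses here because nothing is missing; a quick round-trip check (my own sketch):
(result.unstack() == data).all().all()  # True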
s1 = Series([0,1,2,3],index=['a','b','c','d'])
s2 = Series([4,5,6],index=['c','d','e'])
data2 = pd.concat([s1,s2],keys=['one','two'])
data2.unstack()
data2.unstack().stack(dropna=False)
df = DataFrame({'left':result,'right':result + 5},columns=pd.Index(['left','right'],name='side'))
df
df.unstack('state')
df.unstack('state').stack('side')
ldata = DataFrame({'date':['03-31','03-31','03-31','06-30','06-30','06-30'],
'item':['real','infl','unemp','real','infl','unemp'],'value':[2710.349,0.000,5.8,2778.801,2.34,5.1]})
ldata
pivoted = ldata.pivot('date','item','value')
pivoted.head()
ldata['value2'] = np.random.randn(len(ldata))
ldata[:10]
pivoted = ldata.pivot('date','item')
pivoted[:5]
pivoted['value'][:5]
unstacked = ldata.set_index(['date','item']).unstack('item')
unstacked[:7]
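As the last two calls suggest, pivot is essentially a shortcut for set_index followed by unstack. Comparing the 'value' blocks of the two results should confirm they agree (a sketch):
(pivoted['value'] == unstacked['value']).all().all()  # True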
data = DataFrame({'k1':['one'] * 3 + ['two'] * 4,'k2':[1,1,2,3,3,4,4]})
data
data.duplicated()
data.drop_duplicates()
data['v1'] = range(7)
data.drop_duplicates(['k1'])
data.drop_duplicates(['k1','k2'],take_last=True)
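take_last=True keeps the last row of each duplicate group instead of the first. Pandas 0.17 renamed this argument, so on a newer install (an assumption about your version) the same call would read:
# data.drop_duplicates(['k1','k2'],keep='last')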
data = DataFrame({'food':['bacon','pulled pork','bacon','Pastrami','corned beef','Bacon','pastrami','honey ham','nova lox'],
'ounces':[4,3,12,6,7.5,8,3,5,6]})
data
meat_to_animal = {'bacon':'pig',
'pulled pork':'pig',
'pastrami':'cow',
'corned beef':'cow',
'honey ham':'pig',
'nova lox':'salmon'}
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data
data['food'].map(lambda x: meat_to_animal[x.lower()])
data = Series([1.,-999.,2.,-999.,-1000.,3.])
data
data.replace(-999,np.nan)
data.replace([-999,-1000],[np.nan,666])
data.replace({-999:np.nan,-1000:0})
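replace accepts a scalar pair, two parallel lists, or a dict; the dict form is clearest when each sentinel maps to a different value. All three calls above return a new Series and leave data untouched; to overwrite in place, a sketch (assuming your pandas exposes the inplace flag on replace):
data.replace([-999,-1000],np.nan,inplace=True)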
data = DataFrame(np.arange(12).reshape((3,4)),index=['Ohio','Colorado','New York'],columns=['one','two','three','four'])
data.index = data.index.map(str.upper)
data
data.rename(index=str.title,columns=str.upper)
data.rename(index='OHIO':'INDIANA',columns='three':'peekaboo')
ages = [20,22,25,27,21,23,37,31,61,45,51,41,32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins)
cats
cats.labels
cats.levels
pd.value_counts(cats)
pd.cut(ages,[18,26,36,61,100],right=False)
group_names = ['Youth','YoungAdult','MiddleAge','Senior']
pd.cut(ages,bins,labels=group_names)
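Passing labels swaps the interval notation for your own group names; the bucket counts are unchanged, as a quick check shows (my own sketch):
pd.value_counts(pd.cut(ages,bins,labels=group_names))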
data = np.random.rand(20)
pd.cut(data,4,precision=2)
data = np.random.rand(1000)
cats = pd.qcut(data,4)
cats
pd.value_counts(cats)
cats2 = pd.qcut(data,[0,0.1,0.5,0.9,1.])
pd.value_counts(cats2)
np.random.seed(12345)
data = DataFrame(np.random.randn(1000,4))
data.describe()
col = data[3]
col[np.abs(col)>3]
data[(np.abs(data)>3).any(1)]
data[np.abs(data)>3] = np.sign(data) * 3
data.describe()
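np.sign(data) * 3 caps each out-of-range value at -3 or 3 while keeping its sign. On recent pandas the same capping can be written with clip (an alternative I am adding, not the book's code):
data.clip(-3,3)  # cap every value to the interval [-3, 3]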
df = DataFrame(np.arange(5*4).reshape((5,4)))
sampler = np.random.permutation(5)
sampler
df
df.take(sampler)
df.take(np.random.permutation(len(df))[:3])
bag = np.array([5,7,-1,6,4])
sampler = np.random.randint(0,len(bag),size=10)
sampler
draws = bag.take(sampler)
draws
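randint plus take samples with replacement, which is why values from bag can repeat in draws. To sample without replacement, the permutation-slice idiom used for df above works here too (a sketch):
bag.take(np.random.permutation(len(bag))[:3])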
df = DataFrame({'key':['b','b','a','c','a','b'],'data1':range(6)})
pd.get_dummies(df['key'])
dummies = pd.get_dummies(df['key'],prefix='key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy
mnames = ['movie_id','title','genres']
movies = pd.read_table('D:\\Source Code\\pydata-book-master\\ch02\\movielens\\movies.dat',sep='::',header=None,names=mnames)
movies[:10]
genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))
dummies = DataFrame(np.zeros((len(movies),len(genres))),columns=genres)
for i,gen in enumerate(movies.genres):
    dummies.ix[i,gen.split('|')] = 1
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.ix[0]
values = np.random.rand(10)
values
bins = [0,0.2,0.4,0.6,0.8,1]
pd.get_dummies(pd.cut(values,bins))
val = 'a,b, guido'
val.split(',')
pieces = [x.strip() for x in val.split(',')]
pieces
first,second,third = pieces
first + '::' + second + '::' + third
'::'.join(pieces)
'guido' in val
val.index(',')
val.find(':')
val.count(',')
val.replace(',','::')
val.replace(',', '')
import re
text = "foo bar\\t baz \\tqux"
re.split('\\s+',text)
regex = re.compile('\\s+')
regex.split(text)
regex.findall(text)
text = """Dave dave@aa.com
Steve steve@aa.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern,flags=re.IGNORECASE)
regex.findall(text)
m = regex.search(text)
m
text[m.start():m.end()]
print regex.match(text)
print regex.sub('REDACTED',text)
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern,flags=re.IGNORECASE)
m = regex.match('wes@shjdjs.net')
m.groups()
regex.findall(text)
print regex.sub(r'Username: \1, Domain: \2, Suffix: \3',text)
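Named groups make the substitution template and the match results self-documenting; here is a sketch of the same pattern rewritten with (?P<name>...) groups (my own variation):
regex2 = re.compile(r'(?P<username>[A-Z0-9._%+-]+)@(?P<domain>[A-Z0-9.-]+)\.(?P<suffix>[A-Z]{2,4})',flags=re.IGNORECASE)
regex2.match('wes@shjdjs.net').groupdict()  # {'username': 'wes', 'domain': 'shjdjs', 'suffix': 'net'}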
data = {'Dave':'dave@aa.com','Steve':'steve@aa.com','Rob':'rob@gmail.com','Ryan':'ryan@yahoo.com','Wes':np.nan}
data = Series(data)
data
data.isnull()
data.str.contains('gmail')
pattern
data.str.findall(pattern,flags=re.IGNORECASE)
matches = data.str.match(pattern,flags=re.IGNORECASE)
matches
matches.str.get(1)
matches.str[0]
data.str[:5]
import json
db = json.load(open('D:\\Source Code\\pydata-book-master\\ch07\\foods-2011-10-03.json'))
len(db)
db[0].keys()
db[0]['nutrients'][0]
nutrients = DataFrame(db[0]['nutrients'])
nutrients[:7]
info_keys = ['description','group','id','manufacturer']
info = DataFrame(db,columns=info_keys)
info[:5]
info
pd.value_counts(info.group)[:10]
nutrients = []
for rec in db:
    fnuts = DataFrame(rec['nutrients'])
    fnuts['id'] = rec['id']
    nutrients.append(fnuts)
nutrients = pd.concat(nutrients,ignore_index=True)
nutrients
nutrients.duplicated().sum()
nutrients = nutrients.drop_duplicates()
col_mapping = {'description':'food','group':'fgroup'}
info = info.rename(columns=col_mapping,copy=False)
info
col_mapping = {'description':'nutrient','group':'nutgroup'}
nutrients = nutrients.rename(columns=col_mapping,copy=False)
nutrients
ndata = pd.merge(nutrients,info,on='id',how='outer')
ndata
ndata.ix[30000]
result = ndata.groupby(['nutrient','fgroup'])['value'].quantile(0.5)
result['Zinc, Zn'].order().plot(kind='barh')
by_nutrient = ndata.groupby(['nutgroup','nutrient'])
get_max = lambda x : x.xs(x.value.idxmax())
get_min = lambda x : x.xs(x.value.idxmin())
max_foods = by_nutrient.apply(get_max)[['value','food']]
max_foods.food = max_foods.food.str[:50]
max_foods.ix['Amino Acids']