dataframe的合并(append, merge, concat)
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了dataframe的合并(append, merge, concat)相关的知识,希望对你有一定的参考价值。
1,pd.concat:拼接
1.1,axis
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df3 = pd.DataFrame(np.ones((3,4))*2, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
1.1,axis
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df3 = pd.DataFrame(np.ones((3,4))*2, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
result = pd.concat([df1, df2, df3], axis = 0) # 0表示竖向的合并
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
result=pd.concat([df1,df2,df3], axis=1) # 1表示横向的合并
a b c d a b c d a b c d
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
a b c d a b c d a b c d
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1.2,ignore_index
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df3 = pd.DataFrame(np.ones((3,4))*2, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
result = pd.concat([df1, df2, df3], axis = 0, ignore_index=True) # ignore_index=True表示忽略原来index(axis=0)/column(axis=1)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df3 = pd.DataFrame(np.ones((3,4))*2, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
result = pd.concat([df1, df2, df3], axis = 0, ignore_index=True) # ignore_index=True表示忽略原来index(axis=0)/column(axis=1)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
result = pd.concat([df1, df2, df3], axis = 0, ignore_index=True) # ignore_index=True表示忽略原来index
0 1 2 3 4 5 6 7 8 9 10 11
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
0 1 2 3 4 5 6 7 8 9 10 11
0 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0 2.0 2.0 2.0 2.0
1.3,join
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘b‘, ‘c‘, ‘d‘, ‘e‘])
result=pd.concat([df1,df2], axis=0, join=‘inner‘, ignore_index=True) # join=‘inner‘表示只保留column名相同的部分
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘b‘, ‘c‘, ‘d‘, ‘e‘])
result=pd.concat([df1,df2], axis=0, join=‘inner‘, ignore_index=True) # join=‘inner‘表示只保留column名相同的部分
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
1.4, join_axes
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘b‘, ‘c‘, ‘d‘, ‘e‘], index=[2, 3, 4])
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘b‘, ‘c‘, ‘d‘, ‘e‘], index=[2, 3, 4])
result=pd.concat([df1,df2], axis=1)
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
result=pd.concat([df1,df2], axis=1, join_axes=[df1.index]) # join_axes=[df1.index]表示按照df1的index进行合并
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
2,pd.append()
2.1,append
pd.append只有竖向往下加数据,没有axis属性
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘b‘, ‘c‘, ‘d‘, ‘e‘], index=[2, 3, 4])
df3 = pd.DataFrame(np.ones((3,4))*2, columns = [‘b‘, ‘c‘, ‘d‘, ‘e‘], index=[2, 3, 4])
2.1,append
pd.append只有竖向往下加数据,没有axis属性
df1 = pd.DataFrame(np.ones((3,4))*0, columns = [‘a‘, ‘b‘, ‘c‘, ‘d‘], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3,4))*1, columns = [‘b‘, ‘c‘, ‘d‘, ‘e‘], index=[2, 3, 4])
df3 = pd.DataFrame(np.ones((3,4))*2, columns = [‘b‘, ‘c‘, ‘d‘, ‘e‘], index=[2, 3, 4])
df1 = df1.append(df2)
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
a b c d e
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 0.0 0.0 0.0 0.0 NaN
2 NaN 1.0 1.0 1.0 1.0
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
df1 = df1.append([df2, df3], ignore_index=True) # 可以一次加多个df,也可以忽略index
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0
6 NaN 2.0 2.0 2.0 2.0
7 NaN 2.0 2.0 2.0 2.0
8 NaN 2.0 2.0 2.0 2.0
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0
6 NaN 2.0 2.0 2.0 2.0
7 NaN 2.0 2.0 2.0 2.0
8 NaN 2.0 2.0 2.0 2.0
s1=pd.Series([1,2,3,4],index=[‘a‘,‘b‘,‘c‘,‘d‘])
df1 = df1.append(s1, ignore_index=True) # 可以加series,ignore_index=True必须加上,s1个数必须和df1的列数相同
a b c d
0 0.0 0.0 0.0 0.0 # 注意ignore_index = True以后,df1的index变成从0开始了!!!
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
df1 = df1.append(s1, ignore_index=True) # 可以加series,ignore_index=True必须加上,s1个数必须和df1的列数相同
a b c d
0 0.0 0.0 0.0 0.0 # 注意ignore_index = True以后,df1的index变成从0开始了!!!
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
l1=[1,2,3,4]
df1.loc[4] = l1
a b c d
1 0.0 0.0 0.0 0.0 # df1的index还是按照原来从1开始
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 1.0 2.0 3.0 4.0
df1.loc[4] = l1
a b c d
1 0.0 0.0 0.0 0.0 # df1的index还是按照原来从1开始
2 0.0 0.0 0.0 0.0
3 0.0 0.0 0.0 0.0
4 1.0 2.0 3.0 4.0
2.2,append举例
df1 = pd.DataFrame({‘B‘: [‘B0‘, ‘B1‘, ‘B2‘, ‘B3‘,‘B4‘],
‘D‘: [‘D0‘, ‘D1‘, ‘D2‘, ‘D3‘,‘D4‘],
‘A‘: [‘A0‘, ‘A1‘, ‘A2‘, ‘A3‘,‘A4‘],
‘E‘: [‘E0‘, ‘E1‘, ‘E2‘, ‘E3‘,‘E4‘]},
index=[0, 1, 2, 3,4],columns=[‘B‘,‘D‘,‘A‘,‘E‘])
df1 = pd.DataFrame({‘B‘: [‘B0‘, ‘B1‘, ‘B2‘, ‘B3‘,‘B4‘],
‘D‘: [‘D0‘, ‘D1‘, ‘D2‘, ‘D3‘,‘D4‘],
‘A‘: [‘A0‘, ‘A1‘, ‘A2‘, ‘A3‘,‘A4‘],
‘E‘: [‘E0‘, ‘E1‘, ‘E2‘, ‘E3‘,‘E4‘]},
index=[0, 1, 2, 3,4],columns=[‘B‘,‘D‘,‘A‘,‘E‘])
df2 = pd.DataFrame({‘F‘: [‘F4‘, ‘F5‘, ‘F6‘, ‘F7‘,‘F8‘],
‘A‘: [‘A4‘, ‘A5‘, ‘A6‘, ‘A7‘,‘A8‘],
‘B‘: [‘B4‘, ‘B5‘, ‘B6‘, ‘B7‘,‘B8‘],
‘C‘: [‘C4‘, ‘C5‘, ‘C6‘, ‘C7‘,‘C8‘]},
index=[5, 9, 6, 7,10])
‘A‘: [‘A4‘, ‘A5‘, ‘A6‘, ‘A7‘,‘A8‘],
‘B‘: [‘B4‘, ‘B5‘, ‘B6‘, ‘B7‘,‘B8‘],
‘C‘: [‘C4‘, ‘C5‘, ‘C6‘, ‘C7‘,‘C8‘]},
index=[5, 9, 6, 7,10])
输入df3=df1.append(df2)后,df3变为:
A B C D E F
0 A0 B0 NaN D0 E0 NaN
1 A1 B1 NaN D1 E1 NaN
2 A2 B2 NaN D2 E2 NaN
3 A3 B3 NaN D3 E3 NaN
4 A4 B4 NaN D4 E4 NaN
5 A4 B4 C4 NaN NaN F4
9 A5 B5 C5 NaN NaN F5
6 A6 B6 C6 NaN NaN F6
7 A7 B7 C7 NaN NaN F7
10 A8 B8 C8 NaN NaN F8
列项重新排列了,如果想要列项按照df1的走,即df1的次序,并且如果df1没有的列,就直接扔掉:
A B C D E F
0 A0 B0 NaN D0 E0 NaN
1 A1 B1 NaN D1 E1 NaN
2 A2 B2 NaN D2 E2 NaN
3 A3 B3 NaN D3 E3 NaN
4 A4 B4 NaN D4 E4 NaN
5 A4 B4 C4 NaN NaN F4
9 A5 B5 C5 NaN NaN F5
6 A6 B6 C6 NaN NaN F6
7 A7 B7 C7 NaN NaN F7
10 A8 B8 C8 NaN NaN F8
列项重新排列了,如果想要列项按照df1的走,即df1的次序,并且如果df1没有的列,就直接扔掉:
df4=pd.concat([df1,df2],axis=0)
df4.reindex(columns=df1.columns)
3,pd.merge # 合并
3.1,1个key
left=pd.DataFrame({‘key‘:[‘K0‘,‘K1‘,‘K2‘,‘K3‘],
‘A‘:[‘A0‘,‘A1‘,‘A2‘,‘A3‘],
‘B‘:[‘B0‘,‘B1‘,‘B2‘,‘B3‘]})
right=pd.DataFrame({‘key‘:[‘K0‘,‘K1‘,‘K2‘,‘K3‘],
‘C‘:[‘C0‘,‘C1‘,‘C2‘,‘C3‘],
‘D‘:[‘D0‘,‘D1‘,‘D2‘,‘D3‘]})
res=pd.merge(left,right)
res=pd.merge(left,right,on=[‘key‘]) # 默认会自动选择共同的column进行合并,如果没有共同的column会报错
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
res=pd.merge(left,right,on=[‘key‘]) # 默认会自动选择共同的column进行合并,如果没有共同的column会报错
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
3.2,多个key,how可取[‘left‘, ‘right‘, ‘inner‘, ‘outer‘]
left=pd.DataFrame({‘key1‘:[‘K0‘,‘K0‘,‘K1‘,‘K2‘],
‘key2‘:[‘K0‘,‘K1‘,‘K0‘,‘K1‘],
‘A‘:[‘A0‘,‘A1‘,‘A2‘,‘A3‘],
‘B‘:[‘B0‘,‘B1‘,‘B2‘,‘B3‘]})
right=pd.DataFrame({‘key1‘:[‘K0‘,‘K1‘,‘K1‘,‘K2‘],
‘key2‘:[‘K0‘,‘K0‘,‘K0‘,‘K0‘],
‘C‘:[‘C0‘,‘C1‘,‘C2‘,‘C3‘],
‘D‘:[‘D0‘,‘D1‘,‘D2‘,‘D3‘]})
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
left=pd.DataFrame({‘key1‘:[‘K0‘,‘K0‘,‘K1‘,‘K2‘],
‘key2‘:[‘K0‘,‘K1‘,‘K0‘,‘K1‘],
‘A‘:[‘A0‘,‘A1‘,‘A2‘,‘A3‘],
‘B‘:[‘B0‘,‘B1‘,‘B2‘,‘B3‘]})
right=pd.DataFrame({‘key1‘:[‘K0‘,‘K1‘,‘K1‘,‘K2‘],
‘key2‘:[‘K0‘,‘K0‘,‘K0‘,‘K0‘],
‘C‘:[‘C0‘,‘C1‘,‘C2‘,‘C3‘],
‘D‘:[‘D0‘,‘D1‘,‘D2‘,‘D3‘]})
A B key1 key2
0 A0 B0 K0 K0
1 A1 B1 K0 K1
2 A2 B2 K1 K0
3 A3 B3 K2 K1
C D key1 key2
0 C0 D0 K0 K0
1 C1 D1 K1 K0
2 C2 D2 K1 K0
3 C3 D3 K2 K0
res=pd.merge(left,right,on=[‘key1‘,‘key2‘]) # 需要key1,key2完全相同才合并,left有一行K1/K0,right有两行K1/K0,则打印2遍
res=pd.merge(left,right,on=[‘key1‘,‘key2‘], how = ‘inner‘) # 默认是how=‘inner‘,即key的value要相同
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
res=pd.merge(left,right,on=[‘key1‘,‘key2‘], how = ‘inner‘) # 默认是how=‘inner‘,即key的value要相同
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
res=pd.merge(left,right,on=[‘key1‘,‘key2‘], how = ‘outer‘) # 根据key1,key2取值全部合并,如果left或者right没有的取NaN
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
5 NaN NaN K2 K0 C3 D3
res=pd.merge(left,right,on=[‘key1‘,‘key2‘], how = ‘left‘) # 根据left的key合并,重复出现了2行K1K0(因为right有2行)
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A1 B1 K0 K1 NaN NaN
2 A2 B2 K1 K0 C1 D1
3 A2 B2 K1 K0 C2 D2
4 A3 B3 K2 K1 NaN NaN
res=pd.merge(left,right,on=[‘key1‘,‘key2‘],how=‘right‘) # 根据right的key合并
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
3 NaN NaN K2 K0 C3 D3
3.3, indicator
默认false,true表示显示合并方式,是left还是right,还是both
默认false,true表示显示合并方式,是left还是right,还是both
以上是关于dataframe的合并(append, merge, concat)的主要内容,如果未能解决你的问题,请参考以下文章
R语言merge函数连接多个dataframe数据集迭代内连接dataframe数据( iteratively merge data frames in R)默认merge函数通过公共列名合并数据