合并具有多索引和列值的数据框
Posted
技术标签:
【中文标题】合并具有多索引和列值的数据框【英文标题】:Merging dataframes with multi indexes and column value 【发布时间】:2021-09-25 03:31:19 【问题描述】:我有两个具有多个索引和日期的数据框:
df1
df1 = pd.DataFrame.from_dict(('group', ''): 0: 'A',
1: 'A',
2: 'A',
3: 'A',
4: 'A',
5: 'A',
6: 'A',
7: 'A',
8: 'B',
9: 'B',
10: 'B',
11: 'B',
12: 'B',
13: 'B',
14: 'B',
15: 'B',
16: 'C',
17: 'C',
18: 'C',
19: 'C',
20: 'C',
21: 'C',
22: 'C',
23: 'C',
24: 'D',
25: 'D',
26: 'D',
27: 'D',
28: 'D',
29: 'D',
30: 'D',
('category', ''): 0: 'Apple',
1: 'Amazon',
2: 'Google',
3: 'Netflix',
4: 'Facebook',
5: 'Uber',
6: 'Tesla',
7: 'total',
8: 'Apple',
9: 'Amazon',
10: 'Google',
11: 'Netflix',
12: 'Facebook',
13: 'Uber',
14: 'Tesla',
15: 'total',
16: 'Apple',
17: 'Amazon',
18: 'Google',
19: 'Netflix',
20: 'Facebook',
21: 'Uber',
22: 'Tesla',
23: 'total',
24: 'Apple',
25: 'Amazon',
26: 'Google',
27: 'Netflix',
28: 'Uber',
29: 'Tesla',
30: 'total',
(pd.Timestamp('2021-06-28 00:00:00'), 'total_orders'): 0: 88.0,
1: 66.0,
2: 191.0,
3: 558.0,
4: 12.0,
5: 4.0,
6: 56.0,
7: 975.0,
8: 90.0,
9: 26.0,
10: 49.0,
11: 250.0,
12: 7.0,
13: 2.0,
14: 44.0,
15: 468.0,
16: 36.0,
17: 52.0,
18: 94.0,
19: 750.0,
20: 10.0,
21: 0.0,
22: 52.0,
23: 994.0,
24: 16.0,
25: 22.0,
26: 5.0,
27: 57.0,
28: 3.0,
29: 33.0,
30: 136.0,
(pd.Timestamp('2021-06-28 00:00:00'), 'total_sales'): 0: 4603.209999999999,
1: 2485.059999999998,
2: 4919.39999999998,
3: 6097.77,
4: 31.22,
5: 155.71,
6: 3484.99,
7: 17237.35999999996,
8: 561.54,
9: 698.75,
10: 1290.13,
11: 4292.68000000001,
12: 947.65,
13: 329.0,
14: 2889.65,
15: 9989.4,
16: 330.8899999999994,
17: 2076.26,
18: 2982.270000000004,
19: 11978.62000000002,
20: 683.0,
21: 0.0,
22: 3812.16999999999,
23: 20963.21000000002,
24: 234.4900000000002,
25: 896.1,
26: 231.0,
27: 893.810000000001,
28: 129.0,
29: 1712.329999999998,
30: 4106.729999999996,
(pd.Timestamp('2021-07-05 00:00:00'), 'total_orders'): 0: 109.0,
1: 48.0,
2: 174.0,
3: 592.0,
4: 13.0,
5: 5.0,
6: 43.0,
7: 984.0,
8: 62.0,
9: 13.0,
10: 37.0,
11: 196.0,
12: 8.0,
13: 1.0,
14: 3.0,
15: 30.0,
16: 76.0,
17: 5.0,
18: 147.0,
19: 88.0,
20: 8.0,
21: 1.0,
22: 78.0,
23: 1248.0,
24: 1.0,
25: 18.0,
26: 23.0,
27: 83.0,
28: 0.0,
29: 29.0,
30: 154.0,
(pd.Timestamp('2021-07-05 00:00:00'), 'total_sales'): 0: 3453.02,
1: 17868.730000000003,
2: 44707.82999999999,
3: 61425.99,
4: 1261.0,
5: 1914.6000000000001,
6: 24146.09,
7: 154777.25999999998,
8: 6201.489999999999,
9: 5513.960000000001,
10: 9645.87,
11: 25086.785,
12: 663.0,
13: 448.61,
14: 26332.7,
15: 73892.415,
16: 556.749999999999,
17: 1746.859999999997,
18: 4103.219999999994,
19: 15571.52000000008,
20: 86.0,
21: 69.0,
22: 5882.759999999995,
23: 26476.11000000004,
24: 53.0,
25: 801.220000000001,
26: 684.56,
27: 1232.600000000002,
28: 0.0,
29: 15902.1,
30: 43943.48,
(pd.Timestamp('2021-07-12 00:00:00'), 'total_orders'): 0: 32.0,
1: 15.0,
2: 89.0,
3: 239.0,
4: 2.0,
5: 3.0,
6: 20.0,
7: 400.0,
8: 0.0,
9: 0.0,
10: 0.0,
11: 0.0,
12: 0.0,
13: 0.0,
14: 0.0,
15: 0.0,
16: 21.0,
17: 14.0,
18: 58.0,
19: 281.0,
20: 3.0,
21: 3.0,
22: 33.0,
23: 413.0,
24: 7.0,
25: 6.0,
26: 4.0,
27: 13.0,
28: 0.0,
29: 18.0,
30: 48.0,
(pd.Timestamp('2021-07-12 00:00:00'), 'total_sales'): 0: 2147.7000000000003,
1: 4767.3,
2: 2399.300000000003,
3: 3137.440000000002,
4: 178.0,
5: 866.61,
6: 10639.03,
7: 73235.38,
8: 0.0,
9: 0.0,
10: 0.0,
11: 0.0,
12: 0.0,
13: 0.0,
14: 0.0,
15: 0.0,
16: 220.94,
17: 727.5199999999995,
18: 2500.96999999999,
19: 4414.00999999998,
20: 15.0,
21: 196.71,
22: 2170.1,
23: 9745.24999999997,
24: 126.55,
25: 290.2,
26: 146.01,
27: 233.0,
28: 0.0,
29: 973.18,
30: 1658.940000000002).set_index(['group','category'])
df2
df2 = pd.DataFrame.from_dict('group': 0: 'total_full',
1: 'total_full',
2: 'A',
3: 'A',
4: 'B',
5: 'B',
6: 'C',
7: 'C',
8: 'D',
9: 'D',
10: 'Apple_total',
11: 'Apple_total',
12: 'A',
13: 'A',
14: 'B',
15: 'B',
16: 'C',
17: 'C',
18: 'D',
19: 'D',
20: 'Amazon_total',
21: 'Amazon_total',
22: 'A',
23: 'A',
24: 'B',
25: 'B',
26: 'C',
27: 'C',
28: 'D',
29: 'D',
30: 'Google_total',
31: 'Google_total',
32: 'A',
33: 'A',
34: 'B',
35: 'B',
36: 'C',
37: 'C',
38: 'D',
39: 'D',
40: 'Facebook_total',
41: 'Facebook_total',
42: 'A',
43: 'A',
44: 'B',
45: 'B',
46: 'C',
47: 'C',
48: 'D',
49: 'D',
50: 'Netflix_total',
51: 'Netflix_total',
52: 'A',
53: 'A',
54: 'B',
55: 'B',
56: 'C',
57: 'C',
58: 'D',
59: 'D',
60: 'Tesla_total',
61: 'Tesla_total',
62: 'A',
63: 'A',
64: 'B',
65: 'B',
66: 'C',
67: 'C',
68: 'D',
69: 'D',
70: 'Uber_total',
71: 'Uber_total',
72: 'A',
73: 'A',
74: 'B',
75: 'B',
76: 'C',
77: 'C',
78: 'D',
79: 'D',
'category': 0: 'total_full',
1: 'total_full',
2: 'group_total',
3: 'group_total',
4: 'group_total',
5: 'group_total',
6: 'group_total',
7: 'group_total',
8: 'group_total',
9: 'group_total',
10: 'Apple_total',
11: 'Apple_total',
12: 'Apple',
13: 'Apple',
14: 'Apple',
15: 'Apple',
16: 'Apple',
17: 'Apple',
18: 'Apple',
19: 'Apple',
20: 'Amazon_total',
21: 'Amazon_total',
22: 'Amazon',
23: 'Amazon',
24: 'Amazon',
25: 'Amazon',
26: 'Amazon',
27: 'Amazon',
28: 'Amazon',
29: 'Amazon',
30: 'Google_total',
31: 'Google_total',
32: 'Google',
33: 'Google',
34: 'Google',
35: 'Google',
36: 'Google',
37: 'Google',
38: 'Google',
39: 'Google',
40: 'Facebook_total',
41: 'Facebook_total',
42: 'Facebook',
43: 'Facebook',
44: 'Facebook',
45: 'Facebook',
46: 'Facebook',
47: 'Facebook',
48: 'Facebook',
49: 'Facebook',
50: 'Netflix_total',
51: 'Netflix_total',
52: 'Netflix',
53: 'Netflix',
54: 'Netflix',
55: 'Netflix',
56: 'Netflix',
57: 'Netflix',
58: 'Netflix',
59: 'Netflix',
60: 'Tesla_total',
61: 'Tesla_total',
62: 'Tesla',
63: 'Tesla',
64: 'Tesla',
65: 'Tesla',
66: 'Tesla',
67: 'Tesla',
68: 'Tesla',
69: 'Tesla',
70: 'Uber_total',
71: 'Uber_total',
72: 'Uber',
73: 'Uber',
74: 'Uber',
75: 'Uber',
76: 'Uber',
77: 'Uber',
78: 'Uber',
79: 'Uber',
'type': 0: 'Sales_1',
1: 'Sales_2',
2: 'Sales_1',
3: 'Sales_2',
4: 'Sales_1',
5: 'Sales_2',
6: 'Sales_1',
7: 'Sales_2',
8: 'Sales_1',
9: 'Sales_2',
10: 'Sales_1',
11: 'Sales_2',
12: 'Sales_1',
13: 'Sales_2',
14: 'Sales_1',
15: 'Sales_2',
16: 'Sales_1',
17: 'Sales_2',
18: 'Sales_1',
19: 'Sales_2',
20: 'Sales_1',
21: 'Sales_2',
22: 'Sales_1',
23: 'Sales_2',
24: 'Sales_1',
25: 'Sales_2',
26: 'Sales_1',
27: 'Sales_2',
28: 'Sales_1',
29: 'Sales_2',
30: 'Sales_1',
31: 'Sales_2',
32: 'Sales_1',
33: 'Sales_2',
34: 'Sales_1',
35: 'Sales_2',
36: 'Sales_1',
37: 'Sales_2',
38: 'Sales_1',
39: 'Sales_2',
40: 'Sales_1',
41: 'Sales_2',
42: 'Sales_1',
43: 'Sales_2',
44: 'Sales_1',
45: 'Sales_2',
46: 'Sales_1',
47: 'Sales_2',
48: 'Sales_1',
49: 'Sales_2',
50: 'Sales_1',
51: 'Sales_2',
52: 'Sales_1',
53: 'Sales_2',
54: 'Sales_1',
55: 'Sales_2',
56: 'Sales_1',
57: 'Sales_2',
58: 'Sales_1',
59: 'Sales_2',
60: 'Sales_1',
61: 'Sales_2',
62: 'Sales_1',
63: 'Sales_2',
64: 'Sales_1',
65: 'Sales_2',
66: 'Sales_1',
67: 'Sales_2',
68: 'Sales_1',
69: 'Sales_2',
70: 'Sales_1',
71: 'Sales_2',
72: 'Sales_1',
73: 'Sales_2',
74: 'Sales_1',
75: 'Sales_2',
76: 'Sales_1',
77: 'Sales_2',
78: 'Sales_1',
79: 'Sales_2',
'2021-06-28': 0: 67.5277641202152,
1: 82.7854700135998,
2: 21.50082266792856,
3: 22.03644997199996,
4: 64.460440147,
5: 10.1060499896,
6: 65.1530371974946,
7: 50.6429700519999,
8: 56.413464107792045,
9: 0,
10: 17.48074540313092,
11: 26.8376199976,
12: 52.172,
13: 61.16600000040001,
14: 20.9447844,
15: 40.69122000000001,
16: 83.55718929717925,
17: 14.98039999719995,
18: 20.806771705951697,
19: np.nan,
20: 18.3766353690825,
21: 12.82565001479992,
22: 52.425508769690694,
23: 25.661999978399994,
24: 17.88071596,
25: 24.384659998799997,
26: 91.10086982794643,
27: 12.77899003759993,
28: 16.969540811445366,
29: np.nan,
30: 18.8795397517309,
31: 26.73017999840005,
32: 53.52039700062155,
33: 58.81199999639999,
34: 12.1243325,
35: 24.0544100028,
36: 55.94068246571674,
37: 133.86376999920006,
38: 7.294127785392621,
39: np.nan,
40: 6.07807089184563,
41: 7.27483001599998,
42: 2.300470581874837,
43: 30.71300000639998,
44: 5.810764652,
45: 12.333119997600003,
46: 25.475930745418292,
47: 64.228710012,
48: 9.490904912552498,
49: np.nan,
50: 8.184780211399392,
51: 24.59321999400001,
52: 6.807138946302334,
53: 12.0879999972,
54: 0.869207661,
55: 0.324,
56: 0.5084336040970575,
57: 12.181219996800007,
58: 0,
59: np.nan,
60: 9.293956915067886,
61: 11.171379993599999,
62: 6.384936971649232,
63: 3.657999996,
64: 0.913782413,
65: 1.9992000012000002,
66: 1.5322078073061867,
67: 5.514179996399999,
68: 0.4630297231124678,
69: np.nan,
70: 36.23403557795798,
71: 53.35258999919999,
72: 21.890370397789923,
73: 9.937449997200002,
74: 5.916852561,
75: 6.319439989199998,
76: 7.03772344983066,
77: 37.095700012799995,
78: 1.3890891693374032,
79: np.nan,
'2021-07-05': 0: 65.4690491915759,
1: 98.5235100112003,
2: 21.4573181155924,
3: 241.06741999679997,
4: 67.481716829,
5: 11.60325000040002,
6: 27.5807099999998,
7: 65.8528400140003,
8: 58.949304246983736,
9: 0.0,
10: 185.65887577993723,
11: 318.9965699964001,
12: 54.517,
13: 66.55265999039996,
14: 21.92632044,
15: 43.67116000320002,
16: 87.47349898707688,
17: 208.7727500028001,
18: 21.742056352860352,
19: np.nan,
20: 16.6038963173654,
21: 25.28952001920013,
22: 54.7820864335212,
23: 36.75802000560001,
24: 18.71872129,
25: 30.1634600016,
26: 95.37075040035738,
27: 138.3680400120001,
28: 17.73233819348684,
29: np.nan,
30: 14.80302342121337,
31: 251.83851001200003,
32: 55.926190956481534,
33: 72.4443400032,
34: 12.69221484,
35: 26.032340003999998,
36: 58.56261169338368,
37: 153.36183000480003,
38: 7.622005931348156,
39: np.nan,
40: 72.24367956241771,
41: 14.83083001279999,
42: 29.5726042895728,
43: 38.723000005199985,
44: 6.083562133,
45: 12.845630001599998,
46: 26.66998281055652,
47: 63.26220000600001,
48: 9.917530329288393,
49: np.nan,
50: 8.555606693927,
51: 23.802009994800002,
52: 7.113126469779095,
53: 7.206999998399999,
54: 0.910216433,
55: 1.4089999991999997,
56: 0.5322637911479053,
57: 15.186009997200001,
58: 0.0,
59: np.nan,
60: 9.716385738295367,
61: 14.7327399948,
62: 6.671946105284065,
63: 5.691999996,
64: 0.956574175,
65: 1.0203399996,
66: 1.6040220980113027,
67: 8.020399999199999,
68: 0.4838433599999999,
69: np.nan,
70: 37.88758167841983,
71: 59.03332998119994,
72: 22.874363860953647,
73: 13.690399997999998,
74: 6.194107518,
75: 6.4613199911999954,
76: 7.367580219466185,
77: 38.881609991999944,
78: 1.4515300799999995,
79: np.nan,
'2021-07-12': 0: 607.2971827405001,
1: 88.9671100664001,
2: 21.26749278974862,
3: 17.1524199804,
4: 64.471138092,
5: 89.84481002279999,
6: 26.2044999999998,
7: 51.9698800632001,
8: 5.354051858751745,
9: 0.0,
10: 177.42361595891452,
11: 287.5395700032,
12: 52.117,
13: 47.388199995600004,
14: 20.94835038,
15: 41.4250800048,
16: 83.57340667555117,
17: 198.72629000280003,
18: 20.784858903363354,
19: np.nan,
20: 178.323907459086,
21: 185.83897002839998,
22: 52.37029646474982,
23: 27.87144997800001,
24: 17.88339044,
25: 23.645340010799984,
26: 91.11855133792106,
27: 134.3221800396,
28: 16.95166921641509,
29: np.nan,
30: 128.82813286243115,
31: 192.6867300156,
32: 53.46403160619618,
33: 41.412320006399995,
34: 12.1261155,
35: 11.840830002000002,
36: 55.95153983444301,
37: 139.43358000720002,
38: 7.286445921791947,
39: np.nan,
40: 69.04410667683521,
41: 93.877410018,
42: 28.270665735943805,
43: 27.512680004399986,
44: 5.811656147,
45: 5.2319800032,
46: 25.480875296710053,
47: 61.132750010400024,
48: 9.480909497181356,
49: np.nan,
50: 8.178601399067174,
51: 17.6743199976,
52: 6.7999699585309585,
53: 6.131999998799999,
54: 0.870099156,
55: 0.6185600004,
56: 0.5085322845362154,
57: 10.923759998400003,
58: 0.0,
59: np.nan,
60: 9.287042311133577,
61: 19.966500000000007,
62: 6.378212628950804,
63: 6.524999997600001,
64: 0.913782413,
65: 1.9303400016,
66: 1.5325051891827732,
67: 11.511160000800006,
68: 0.4625420799999998,
69: np.nan,
70: 36.21177607303267,
71: 51.3836100036,
72: 21.86731639537707,
73: 10.310769999600003,
74: 5.917744056,
75: 5.152679999999999,
76: 7.039089381655591,
77: 35.920160003999996,
78: 1.3876262399999995,
79: np.nan).set_index(['group','category','type'])
我正在尝试通过group, category, date (date is a column)
将df2
与df1
合并,以便我的输出如下所示:
在我想要的输出示例中,我省略了来自df2
和sales_1 & sales_2
的值,但是这些行应该用来自df2
的相应group
和category
值填充。
2021-06-28 2021-07-05 2021-07-12
total_orders total_sales sales_1 sales_2 total_orders total_sales sales_1 sales_2 total_orders total_sales sales_1 sales_2
group category
A Apple 88.000 4,603.210
Amazon 66.000 2,485.060
Google 191.000 4,919.400
Netflix 558.000 6,097.770
Facebook 12.000 31.220
Uber 4.000 155.710
Tesla 56.000 3,484.990
total 975.000 17,237.360
B Apple 90.000 561.540
Amazon 26.000 698.750
Google 49.000 1,290.130
Netflix 250.000 4,292.680
Facebook 7.000 947.650
Uber 2.000 329.000
Tesla 44.000 2,889.650
total 468.000 9,989.400
C Apple 36.000 330.890
Amazon 52.000 2,076.260
Google 94.000 2,982.270
Netflix 750.000 11,978.620
Facebook 10.000 683.000
Uber 0.000 0.000
Tesla 52.000 3,812.170
total 994.000 20,963.210
D Apple 16.000 234.490
Amazon 22.000 896.100
Google 5.000 231.000
Netflix 57.000 893.810
Uber 3.000 129.000
Tesla 33.000 1,712.330
total 136.000 4,106.730
这样sales_1 & sales_2
就合并到group & category
上,并且在同一个date
列上。 df2
中的 total_x
可以忽略,因为它可以从字段中计算出来。
total_values
不用于合并,只有它后面的。
我尝试过的:
df1.reset_index().merge(df2.reset_index(), left_on=['group', 'category'], right_on=['group', 'category'])
会引发警告:
UserWarning:不同级别之间的合并可能会产生意想不到的结果(左侧 2 个级别,右侧 1 个级别)
这不是我期望的合并方式。我怎样才能达到我想要的输出?
使用
df = df1.merge(df2.unstack(), left_index=True, right_index=True)
生产:
然后只是重新排序列,因为我想要一个唯一的日期和 4 列吗?或者可能是某个日期有00:00:00
?
【问题讨论】:
这是很多数据。是不是可以减少它并仍然出现问题? 【参考方案1】:首先在df2
的列中创建DatetimeIndex
,然后unstack
并通过MultiIndex
es 合并:
f = lambda x: pd.to_datetime(x)
df = (df1.merge(df2.rename(columns=f).unstack(), left_index=True, right_index=True)
.sort_index(axis=1))
print (df.head())
2021-06-28 2021-07-05 \
Sales_1 Sales_2 total_orders total_sales Sales_1
group category
A Apple 52.172000 61.166 88.0 4603.21 54.517000
Amazon 52.425509 25.662 66.0 2485.06 54.782086
Google 53.520397 58.812 191.0 4919.40 55.926191
Netflix 6.807139 12.088 558.0 6097.77 7.113126
Facebook 2.300471 30.713 12.0 31.22 29.572604
2021-07-12 \
Sales_2 total_orders total_sales Sales_1 Sales_2
group category
A Apple 66.55266 109.0 3453.02 52.117000 47.38820
Amazon 36.75802 48.0 17868.73 52.370296 27.87145
Google 72.44434 174.0 44707.83 53.464032 41.41232
Netflix 7.20700 592.0 61425.99 6.799970 6.13200
Facebook 38.72300 13.0 1261.00 28.270666 27.51268
total_orders total_sales
group category
A Apple 32.0 2147.70
Amazon 15.0 4767.30
Google 89.0 2399.30
Netflix 239.0 3137.44
Facebook 2.0 178.00
【讨论】:
以上是关于合并具有多索引和列值的数据框的主要内容,如果未能解决你的问题,请参考以下文章