合并具有多索引和列值的数据框

Posted

技术标签:

【中文标题】合并具有多索引和列值的数据框【英文标题】:Merging dataframes with multi indexes and column value 【发布时间】:2021-09-25 03:31:19 【问题描述】:

我有两个具有多个索引和日期的数据框:

df1

df1 = pd.DataFrame.from_dict(('group', ''): 0: 'A',
  1: 'A',
  2: 'A',
  3: 'A',
  4: 'A',
  5: 'A',
  6: 'A',
  7: 'A',
  8: 'B',
  9: 'B',
  10: 'B',
  11: 'B',
  12: 'B',
  13: 'B',
  14: 'B',
  15: 'B',
  16: 'C',
  17: 'C',
  18: 'C',
  19: 'C',
  20: 'C',
  21: 'C',
  22: 'C',
  23: 'C',
  24: 'D',
  25: 'D',
  26: 'D',
  27: 'D',
  28: 'D',
  29: 'D',
  30: 'D',
 ('category', ''): 0: 'Apple',
  1: 'Amazon',
  2: 'Google',
  3: 'Netflix',
  4: 'Facebook',
  5: 'Uber',
  6: 'Tesla',
  7: 'total',
  8: 'Apple',
  9: 'Amazon',
  10: 'Google',
  11: 'Netflix',
  12: 'Facebook',
  13: 'Uber',
  14: 'Tesla',
  15: 'total',
  16: 'Apple',
  17: 'Amazon',
  18: 'Google',
  19: 'Netflix',
  20: 'Facebook',
  21: 'Uber',
  22: 'Tesla',
  23: 'total',
  24: 'Apple',
  25: 'Amazon',
  26: 'Google',
  27: 'Netflix',
  28: 'Uber',
  29: 'Tesla',
  30: 'total',
 (pd.Timestamp('2021-06-28 00:00:00'), 'total_orders'): 0: 88.0,
  1: 66.0,
  2: 191.0,
  3: 558.0,
  4: 12.0,
  5: 4.0,
  6: 56.0,
  7: 975.0,
  8: 90.0,
  9: 26.0,
  10: 49.0,
  11: 250.0,
  12: 7.0,
  13: 2.0,
  14: 44.0,
  15: 468.0,
  16: 36.0,
  17: 52.0,
  18: 94.0,
  19: 750.0,
  20: 10.0,
  21: 0.0,
  22: 52.0,
  23: 994.0,
  24: 16.0,
  25: 22.0,
  26: 5.0,
  27: 57.0,
  28: 3.0,
  29: 33.0,
  30: 136.0,
 (pd.Timestamp('2021-06-28 00:00:00'), 'total_sales'): 0: 4603.209999999999,
  1: 2485.059999999998,
  2: 4919.39999999998,
  3: 6097.77,
  4: 31.22,
  5: 155.71,
  6: 3484.99,
  7: 17237.35999999996,
  8: 561.54,
  9: 698.75,
  10: 1290.13,
  11: 4292.68000000001,
  12: 947.65,
  13: 329.0,
  14: 2889.65,
  15: 9989.4,
  16: 330.8899999999994,
  17: 2076.26,
  18: 2982.270000000004,
  19: 11978.62000000002,
  20: 683.0,
  21: 0.0,
  22: 3812.16999999999,
  23: 20963.21000000002,
  24: 234.4900000000002,
  25: 896.1,
  26: 231.0,
  27: 893.810000000001,
  28: 129.0,
  29: 1712.329999999998,
  30: 4106.729999999996,
 (pd.Timestamp('2021-07-05 00:00:00'), 'total_orders'): 0: 109.0,
  1: 48.0,
  2: 174.0,
  3: 592.0,
  4: 13.0,
  5: 5.0,
  6: 43.0,
  7: 984.0,
  8: 62.0,
  9: 13.0,
  10: 37.0,
  11: 196.0,
  12: 8.0,
  13: 1.0,
  14: 3.0,
  15: 30.0,
  16: 76.0,
  17: 5.0,
  18: 147.0,
  19: 88.0,
  20: 8.0,
  21: 1.0,
  22: 78.0,
  23: 1248.0,
  24: 1.0,
  25: 18.0,
  26: 23.0,
  27: 83.0,
  28: 0.0,
  29: 29.0,
  30: 154.0,
 (pd.Timestamp('2021-07-05 00:00:00'), 'total_sales'): 0: 3453.02,
  1: 17868.730000000003,
  2: 44707.82999999999,
  3: 61425.99,
  4: 1261.0,
  5: 1914.6000000000001,
  6: 24146.09,
  7: 154777.25999999998,
  8: 6201.489999999999,
  9: 5513.960000000001,
  10: 9645.87,
  11: 25086.785,
  12: 663.0,
  13: 448.61,
  14: 26332.7,
  15: 73892.415,
  16: 556.749999999999,
  17: 1746.859999999997,
  18: 4103.219999999994,
  19: 15571.52000000008,
  20: 86.0,
  21: 69.0,
  22: 5882.759999999995,
  23: 26476.11000000004,
  24: 53.0,
  25: 801.220000000001,
  26: 684.56,
  27: 1232.600000000002,
  28: 0.0,
  29: 15902.1,
  30: 43943.48,
 (pd.Timestamp('2021-07-12 00:00:00'), 'total_orders'): 0: 32.0,
  1: 15.0,
  2: 89.0,
  3: 239.0,
  4: 2.0,
  5: 3.0,
  6: 20.0,
  7: 400.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 21.0,
  17: 14.0,
  18: 58.0,
  19: 281.0,
  20: 3.0,
  21: 3.0,
  22: 33.0,
  23: 413.0,
  24: 7.0,
  25: 6.0,
  26: 4.0,
  27: 13.0,
  28: 0.0,
  29: 18.0,
  30: 48.0,
 (pd.Timestamp('2021-07-12 00:00:00'), 'total_sales'): 0: 2147.7000000000003,
  1: 4767.3,
  2: 2399.300000000003,
  3: 3137.440000000002,
  4: 178.0,
  5: 866.61,
  6: 10639.03,
  7: 73235.38,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 220.94,
  17: 727.5199999999995,
  18: 2500.96999999999,
  19: 4414.00999999998,
  20: 15.0,
  21: 196.71,
  22: 2170.1,
  23: 9745.24999999997,
  24: 126.55,
  25: 290.2,
  26: 146.01,
  27: 233.0,
  28: 0.0,
  29: 973.18,
  30: 1658.940000000002).set_index(['group','category'])

df2

df2 = pd.DataFrame.from_dict('group': 0: 'total_full',
  1: 'total_full',
  2: 'A',
  3: 'A',
  4: 'B',
  5: 'B',
  6: 'C',
  7: 'C',
  8: 'D',
  9: 'D',
  10: 'Apple_total',
  11: 'Apple_total',
  12: 'A',
  13: 'A',
  14: 'B',
  15: 'B',
  16: 'C',
  17: 'C',
  18: 'D',
  19: 'D',
  20: 'Amazon_total',
  21: 'Amazon_total',
  22: 'A',
  23: 'A',
  24: 'B',
  25: 'B',
  26: 'C',
  27: 'C',
  28: 'D',
  29: 'D',
  30: 'Google_total',
  31: 'Google_total',
  32: 'A',
  33: 'A',
  34: 'B',
  35: 'B',
  36: 'C',
  37: 'C',
  38: 'D',
  39: 'D',
  40: 'Facebook_total',
  41: 'Facebook_total',
  42: 'A',
  43: 'A',
  44: 'B',
  45: 'B',
  46: 'C',
  47: 'C',
  48: 'D',
  49: 'D',
  50: 'Netflix_total',
  51: 'Netflix_total',
  52: 'A',
  53: 'A',
  54: 'B',
  55: 'B',
  56: 'C',
  57: 'C',
  58: 'D',
  59: 'D',
  60: 'Tesla_total',
  61: 'Tesla_total',
  62: 'A',
  63: 'A',
  64: 'B',
  65: 'B',
  66: 'C',
  67: 'C',
  68: 'D',
  69: 'D',
  70: 'Uber_total',
  71: 'Uber_total',
  72: 'A',
  73: 'A',
  74: 'B',
  75: 'B',
  76: 'C',
  77: 'C',
  78: 'D',
  79: 'D',
 'category': 0: 'total_full',
  1: 'total_full',
  2: 'group_total',
  3: 'group_total',
  4: 'group_total',
  5: 'group_total',
  6: 'group_total',
  7: 'group_total',
  8: 'group_total',
  9: 'group_total',
  10: 'Apple_total',
  11: 'Apple_total',
  12: 'Apple',
  13: 'Apple',
  14: 'Apple',
  15: 'Apple',
  16: 'Apple',
  17: 'Apple',
  18: 'Apple',
  19: 'Apple',
  20: 'Amazon_total',
  21: 'Amazon_total',
  22: 'Amazon',
  23: 'Amazon',
  24: 'Amazon',
  25: 'Amazon',
  26: 'Amazon',
  27: 'Amazon',
  28: 'Amazon',
  29: 'Amazon',
  30: 'Google_total',
  31: 'Google_total',
  32: 'Google',
  33: 'Google',
  34: 'Google',
  35: 'Google',
  36: 'Google',
  37: 'Google',
  38: 'Google',
  39: 'Google',
  40: 'Facebook_total',
  41: 'Facebook_total',
  42: 'Facebook',
  43: 'Facebook',
  44: 'Facebook',
  45: 'Facebook',
  46: 'Facebook',
  47: 'Facebook',
  48: 'Facebook',
  49: 'Facebook',
  50: 'Netflix_total',
  51: 'Netflix_total',
  52: 'Netflix',
  53: 'Netflix',
  54: 'Netflix',
  55: 'Netflix',
  56: 'Netflix',
  57: 'Netflix',
  58: 'Netflix',
  59: 'Netflix',
  60: 'Tesla_total',
  61: 'Tesla_total',
  62: 'Tesla',
  63: 'Tesla',
  64: 'Tesla',
  65: 'Tesla',
  66: 'Tesla',
  67: 'Tesla',
  68: 'Tesla',
  69: 'Tesla',
  70: 'Uber_total',
  71: 'Uber_total',
  72: 'Uber',
  73: 'Uber',
  74: 'Uber',
  75: 'Uber',
  76: 'Uber',
  77: 'Uber',
  78: 'Uber',
  79: 'Uber',
 'type': 0: 'Sales_1',
  1: 'Sales_2',
  2: 'Sales_1',
  3: 'Sales_2',
  4: 'Sales_1',
  5: 'Sales_2',
  6: 'Sales_1',
  7: 'Sales_2',
  8: 'Sales_1',
  9: 'Sales_2',
  10: 'Sales_1',
  11: 'Sales_2',
  12: 'Sales_1',
  13: 'Sales_2',
  14: 'Sales_1',
  15: 'Sales_2',
  16: 'Sales_1',
  17: 'Sales_2',
  18: 'Sales_1',
  19: 'Sales_2',
  20: 'Sales_1',
  21: 'Sales_2',
  22: 'Sales_1',
  23: 'Sales_2',
  24: 'Sales_1',
  25: 'Sales_2',
  26: 'Sales_1',
  27: 'Sales_2',
  28: 'Sales_1',
  29: 'Sales_2',
  30: 'Sales_1',
  31: 'Sales_2',
  32: 'Sales_1',
  33: 'Sales_2',
  34: 'Sales_1',
  35: 'Sales_2',
  36: 'Sales_1',
  37: 'Sales_2',
  38: 'Sales_1',
  39: 'Sales_2',
  40: 'Sales_1',
  41: 'Sales_2',
  42: 'Sales_1',
  43: 'Sales_2',
  44: 'Sales_1',
  45: 'Sales_2',
  46: 'Sales_1',
  47: 'Sales_2',
  48: 'Sales_1',
  49: 'Sales_2',
  50: 'Sales_1',
  51: 'Sales_2',
  52: 'Sales_1',
  53: 'Sales_2',
  54: 'Sales_1',
  55: 'Sales_2',
  56: 'Sales_1',
  57: 'Sales_2',
  58: 'Sales_1',
  59: 'Sales_2',
  60: 'Sales_1',
  61: 'Sales_2',
  62: 'Sales_1',
  63: 'Sales_2',
  64: 'Sales_1',
  65: 'Sales_2',
  66: 'Sales_1',
  67: 'Sales_2',
  68: 'Sales_1',
  69: 'Sales_2',
  70: 'Sales_1',
  71: 'Sales_2',
  72: 'Sales_1',
  73: 'Sales_2',
  74: 'Sales_1',
  75: 'Sales_2',
  76: 'Sales_1',
  77: 'Sales_2',
  78: 'Sales_1',
  79: 'Sales_2',
 '2021-06-28': 0: 67.5277641202152,
  1: 82.7854700135998,
  2: 21.50082266792856,
  3: 22.03644997199996,
  4: 64.460440147,
  5: 10.1060499896,
  6: 65.1530371974946,
  7: 50.6429700519999,
  8: 56.413464107792045,
  9: 0,
  10: 17.48074540313092,
  11: 26.8376199976,
  12: 52.172,
  13: 61.16600000040001,
  14: 20.9447844,
  15: 40.69122000000001,
  16: 83.55718929717925,
  17: 14.98039999719995,
  18: 20.806771705951697,
  19: np.nan,
  20: 18.3766353690825,
  21: 12.82565001479992,
  22: 52.425508769690694,
  23: 25.661999978399994,
  24: 17.88071596,
  25: 24.384659998799997,
  26: 91.10086982794643,
  27: 12.77899003759993,
  28: 16.969540811445366,
  29: np.nan,
  30: 18.8795397517309,
  31: 26.73017999840005,
  32: 53.52039700062155,
  33: 58.81199999639999,
  34: 12.1243325,
  35: 24.0544100028,
  36: 55.94068246571674,
  37: 133.86376999920006,
  38: 7.294127785392621,
  39: np.nan,
  40: 6.07807089184563,
  41: 7.27483001599998,
  42: 2.300470581874837,
  43: 30.71300000639998,
  44: 5.810764652,
  45: 12.333119997600003,
  46: 25.475930745418292,
  47: 64.228710012,
  48: 9.490904912552498,
  49: np.nan,
  50: 8.184780211399392,
  51: 24.59321999400001,
  52: 6.807138946302334,
  53: 12.0879999972,
  54: 0.869207661,
  55: 0.324,
  56: 0.5084336040970575,
  57: 12.181219996800007,
  58: 0,
  59: np.nan,
  60: 9.293956915067886,
  61: 11.171379993599999,
  62: 6.384936971649232,
  63: 3.657999996,
  64: 0.913782413,
  65: 1.9992000012000002,
  66: 1.5322078073061867,
  67: 5.514179996399999,
  68: 0.4630297231124678,
  69: np.nan,
  70: 36.23403557795798,
  71: 53.35258999919999,
  72: 21.890370397789923,
  73: 9.937449997200002,
  74: 5.916852561,
  75: 6.319439989199998,
  76: 7.03772344983066,
  77: 37.095700012799995,
  78: 1.3890891693374032,
  79: np.nan,
 '2021-07-05': 0: 65.4690491915759,
  1: 98.5235100112003,
  2: 21.4573181155924,
  3: 241.06741999679997,
  4: 67.481716829,
  5: 11.60325000040002,
  6: 27.5807099999998,
  7: 65.8528400140003,
  8: 58.949304246983736,
  9: 0.0,
  10: 185.65887577993723,
  11: 318.9965699964001,
  12: 54.517,
  13: 66.55265999039996,
  14: 21.92632044,
  15: 43.67116000320002,
  16: 87.47349898707688,
  17: 208.7727500028001,
  18: 21.742056352860352,
  19: np.nan,
  20: 16.6038963173654,
  21: 25.28952001920013,
  22: 54.7820864335212,
  23: 36.75802000560001,
  24: 18.71872129,
  25: 30.1634600016,
  26: 95.37075040035738,
  27: 138.3680400120001,
  28: 17.73233819348684,
  29: np.nan,
  30: 14.80302342121337,
  31: 251.83851001200003,
  32: 55.926190956481534,
  33: 72.4443400032,
  34: 12.69221484,
  35: 26.032340003999998,
  36: 58.56261169338368,
  37: 153.36183000480003,
  38: 7.622005931348156,
  39: np.nan,
  40: 72.24367956241771,
  41: 14.83083001279999,
  42: 29.5726042895728,
  43: 38.723000005199985,
  44: 6.083562133,
  45: 12.845630001599998,
  46: 26.66998281055652,
  47: 63.26220000600001,
  48: 9.917530329288393,
  49: np.nan,
  50: 8.555606693927,
  51: 23.802009994800002,
  52: 7.113126469779095,
  53: 7.206999998399999,
  54: 0.910216433,
  55: 1.4089999991999997,
  56: 0.5322637911479053,
  57: 15.186009997200001,
  58: 0.0,
  59: np.nan,
  60: 9.716385738295367,
  61: 14.7327399948,
  62: 6.671946105284065,
  63: 5.691999996,
  64: 0.956574175,
  65: 1.0203399996,
  66: 1.6040220980113027,
  67: 8.020399999199999,
  68: 0.4838433599999999,
  69: np.nan,
  70: 37.88758167841983,
  71: 59.03332998119994,
  72: 22.874363860953647,
  73: 13.690399997999998,
  74: 6.194107518,
  75: 6.4613199911999954,
  76: 7.367580219466185,
  77: 38.881609991999944,
  78: 1.4515300799999995,
  79: np.nan,
 '2021-07-12': 0: 607.2971827405001,
  1: 88.9671100664001,
  2: 21.26749278974862,
  3: 17.1524199804,
  4: 64.471138092,
  5: 89.84481002279999,
  6: 26.2044999999998,
  7: 51.9698800632001,
  8: 5.354051858751745,
  9: 0.0,
  10: 177.42361595891452,
  11: 287.5395700032,
  12: 52.117,
  13: 47.388199995600004,
  14: 20.94835038,
  15: 41.4250800048,
  16: 83.57340667555117,
  17: 198.72629000280003,
  18: 20.784858903363354,
  19: np.nan,
  20: 178.323907459086,
  21: 185.83897002839998,
  22: 52.37029646474982,
  23: 27.87144997800001,
  24: 17.88339044,
  25: 23.645340010799984,
  26: 91.11855133792106,
  27: 134.3221800396,
  28: 16.95166921641509,
  29: np.nan,
  30: 128.82813286243115,
  31: 192.6867300156,
  32: 53.46403160619618,
  33: 41.412320006399995,
  34: 12.1261155,
  35: 11.840830002000002,
  36: 55.95153983444301,
  37: 139.43358000720002,
  38: 7.286445921791947,
  39: np.nan,
  40: 69.04410667683521,
  41: 93.877410018,
  42: 28.270665735943805,
  43: 27.512680004399986,
  44: 5.811656147,
  45: 5.2319800032,
  46: 25.480875296710053,
  47: 61.132750010400024,
  48: 9.480909497181356,
  49: np.nan,
  50: 8.178601399067174,
  51: 17.6743199976,
  52: 6.7999699585309585,
  53: 6.131999998799999,
  54: 0.870099156,
  55: 0.6185600004,
  56: 0.5085322845362154,
  57: 10.923759998400003,
  58: 0.0,
  59: np.nan,
  60: 9.287042311133577,
  61: 19.966500000000007,
  62: 6.378212628950804,
  63: 6.524999997600001,
  64: 0.913782413,
  65: 1.9303400016,
  66: 1.5325051891827732,
  67: 11.511160000800006,
  68: 0.4625420799999998,
  69: np.nan,
  70: 36.21177607303267,
  71: 51.3836100036,
  72: 21.86731639537707,
  73: 10.310769999600003,
  74: 5.917744056,
  75: 5.152679999999999,
  76: 7.039089381655591,
  77: 35.920160003999996,
  78: 1.3876262399999995,
  79: np.nan).set_index(['group','category','type'])

我正在尝试通过group, category, date (date is a column)df2df1 合并,以便我的输出如下所示:

在我想要的输出示例中,我省略了来自df2sales_1 & sales_2 的值,但是这些行应该用来自df2 的相应groupcategory 值填充。

                    2021-06-28                                                               2021-07-05                                                      2021-07-12
                    total_orders    total_sales         sales_1        sales_2                  total_orders    total_sales         sales_1        sales_2            total_orders    total_sales         sales_1        sales_2
group    category
A        Apple        88.000          4,603.210
         Amazon       66.000          2,485.060
         Google      191.000          4,919.400
         Netflix     558.000          6,097.770
         Facebook     12.000             31.220
         Uber          4.000            155.710
         Tesla        56.000          3,484.990
         total       975.000         17,237.360
B        Apple        90.000            561.540
         Amazon       26.000            698.750
         Google       49.000          1,290.130
         Netflix     250.000          4,292.680
         Facebook      7.000            947.650
         Uber          2.000            329.000
         Tesla        44.000          2,889.650
         total       468.000          9,989.400
C        Apple        36.000            330.890
         Amazon       52.000          2,076.260
         Google       94.000          2,982.270
         Netflix     750.000         11,978.620
         Facebook     10.000            683.000
         Uber          0.000              0.000
         Tesla        52.000          3,812.170
         total       994.000         20,963.210
D        Apple        16.000            234.490
         Amazon       22.000            896.100
         Google        5.000            231.000
         Netflix      57.000            893.810
         Uber          3.000            129.000
         Tesla        33.000          1,712.330
         total       136.000          4,106.730

这样sales_1 & sales_2 就合并到group & category 上,并且在同一个date 列上。 df2 中的 total_x 可以忽略,因为它可以从字段中计算出来。

total_values 不用于合并,只有它后面的。

我尝试过的:

df1.reset_index().merge(df2.reset_index(), left_on=['group', 'category'], right_on=['group', 'category'])

会引发警告:

UserWarning:不同级别之间的合并可能会产生意想不到的结果(左侧 2 个级别,右侧 1 个级别)

这不是我期望的合并方式。我怎样才能达到我想要的输出?

使用

df = df1.merge(df2.unstack(), left_index=True, right_index=True)

生产:

然后只是重新排序列,因为我想要一个唯一的日期和 4 列吗?或者可能是某个日期有00:00:00

【问题讨论】:

这是很多数据。是不是可以减少它并仍然出现问题? 【参考方案1】:

首先在df2 的列中创建DatetimeIndex,然后unstack 并通过MultiIndexes 合并:

f = lambda x: pd.to_datetime(x)
df = (df1.merge(df2.rename(columns=f).unstack(), left_index=True, right_index=True)
        .sort_index(axis=1))

print (df.head())

               2021-06-28                                  2021-07-05  \
                  Sales_1 Sales_2 total_orders total_sales    Sales_1   
group category                                                          
A     Apple     52.172000  61.166         88.0     4603.21  54.517000   
      Amazon    52.425509  25.662         66.0     2485.06  54.782086   
      Google    53.520397  58.812        191.0     4919.40  55.926191   
      Netflix    6.807139  12.088        558.0     6097.77   7.113126   
      Facebook   2.300471  30.713         12.0       31.22  29.572604   

                                                  2021-07-12            \
                 Sales_2 total_orders total_sales    Sales_1   Sales_2   
group category                                                           
A     Apple     66.55266        109.0     3453.02  52.117000  47.38820   
      Amazon    36.75802         48.0    17868.73  52.370296  27.87145   
      Google    72.44434        174.0    44707.83  53.464032  41.41232   
      Netflix    7.20700        592.0    61425.99   6.799970   6.13200   
      Facebook  38.72300         13.0     1261.00  28.270666  27.51268   

                                         
               total_orders total_sales  
group category                           
A     Apple            32.0     2147.70  
      Amazon           15.0     4767.30  
      Google           89.0     2399.30  
      Netflix         239.0     3137.44  
      Facebook          2.0      178.00  

【讨论】:

以上是关于合并具有多索引和列值的数据框的主要内容,如果未能解决你的问题,请参考以下文章

合并两个具有聚合列值的数据框作为结果

将两个具有相似列值的数据框合并在一起[重复]

Pandas Dataframe Multiindex 按级别和列值排序

根据 Pandas 中的公共列值合并两个数据框

合并两个具有多索引的数据框

合并两个共享相同列值的数据框