将数据帧的一部分合并到另一个数据帧中,同时保留以前的数据
Posted
技术标签:
【中文标题】将数据帧的一部分合并到另一个数据帧中,同时保留以前的数据【英文标题】:Merging part of data frame into another data frame while preserving previous data 【发布时间】:2022-01-22 11:59:12 【问题描述】:我有一个如下所示的数据框:
> data
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
17 <NA> AmonGCarter0.724 NA NA AmonGCarter0.67 NA NA 155
18 <NA> AmonGCarter0.74 NA NA AmonGCarter0.629 NA NA 568
19 <NA> AmonGCarter0.8 NA NA AmonGCarter0.885 NA NA 456
20 <NA> AmonGCarter0.885 NA NA AmonGCarter0.842 NA NA 244
21 <NA> AmonGCarter0.914 NA NA AmonGCarter0.958 NA NA 36
22 <NA> AmonGCarter0.958 NA NA AmonGCarter1.035 NA NA 417
23 <NA> AmonGCarter0.976 NA NA AmonGCarter1.075 NA NA 58
24 <NA> AmonGCarter1.018 NA NA AmonGCarter0.92 NA NA 603
25 <NA> AmonGCarter1.035 NA NA AmonGCarter0.918 NA NA 194
> dput(data)
structure(list(Lake_name = c("AlanHenry", "AlanHenry", "AlanHenry",
"AlanHenry", "AlanHenry", "AlanHenry", NA, NA, NA, NA, NA, NA,
NA, NA, NA), Lake_name_percent = c("AlanHenry0.705", "AlanHenry0.82",
"AlanHenry0.82", "AlanHenry0.912", "AlanHenry0.966", "AlanHenry1.009",
"AmonGCarter0.724", "AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885",
"AmonGCarter0.914", "AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018",
"AmonGCarter1.035"), surface_area = c(2109.35, 2354.878, 2354.878,
2553.1325, 2660.238125, 2779.62076923077, NA, NA, NA, NA, NA,
NA, NA, NA, NA), percent2 = c(0.705, 0.82, 0.82, 0.912, 0.966,
1.009, NA, NA, NA, NA, NA, NA, NA, NA, NA), Lake_name_percent2 = c("AlanHenry0.708",
"AlanHenry0.821", "AlanHenry0.856", "AlanHenry0.886", "AlanHenry0.958",
"AlanHenry0.989", "AmonGCarter0.67", "AmonGCarter0.629", "AmonGCarter0.885",
"AmonGCarter0.842", "AmonGCarter0.958", "AmonGCarter1.035", "AmonGCarter1.075",
"AmonGCarter0.92", "AmonGCarter0.918"), prev.surface_area = c(2116.20333333333,
2355.95, 2426.50833333333, 2505.33642857143, 2637.47595744681,
2694.90068965517, NA, NA, NA, NA, NA, NA, NA, NA, NA), percent3 = c(0.708,
0.821, 0.856, 0.886, 0.958, 0.989, NA, NA, NA, NA, NA, NA, NA,
NA, NA), X = c(277L, 303L, 341L, 243L, 211L, 173L, 155L, 568L,
456L, 244L, 36L, 417L, 58L, 603L, 194L)), class = "data.frame", row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L
))
我正在尝试将此数据框与如下所示的数据框合并:
surface_area Lake_name percent2 Lake_name_percent
<dbl> <chr> <dbl> <chr>
1 1159. AmonGCarter 0.724 AmonGCarter0.724
2 1176. AmonGCarter 0.74 AmonGCarter0.74
3 1240. AmonGCarter 0.8 AmonGCarter0.8
4 1329. AmonGCarter 0.885 AmonGCarter0.885
5 1360. AmonGCarter 0.914 AmonGCarter0.914
6 1407. AmonGCarter 0.958 AmonGCarter0.958
7 1426. AmonGCarter 0.976 AmonGCarter0.976
8 1468 AmonGCarter 1.02 AmonGCarter1.018
9 1484. AmonGCarter 1.03 AmonGCarter1.035
> dput(AmonGCarter.sa)
structure(list(surface_area = c(1159.1, 1175.8, 1239.65714285714,
1329.2, 1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), Lake_name = c("AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter"), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent = c("AmonGCarter0.724",
"AmonGCarter0.74", "AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914",
"AmonGCarter0.958", "AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"
)), row.names = c(NA, -9L), groups = structure(list(percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -9L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
当我像这样合并数据集时:
merge(data,AmonGCarter.sa, all.y = T)
没有 AlanHenry 数据,我的数据看起来像这样:
> merge(data,AmonGCarter.sa, all.y = T)
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AmonGCarter AmonGCarter0.724 1159.100 0.724 <NA> NA NA NA
2 AmonGCarter AmonGCarter0.74 1175.800 0.740 <NA> NA NA NA
3 AmonGCarter AmonGCarter0.8 1239.657 0.800 <NA> NA NA NA
4 AmonGCarter AmonGCarter0.885 1329.200 0.885 <NA> NA NA NA
5 AmonGCarter AmonGCarter0.914 1359.600 0.914 <NA> NA NA NA
6 AmonGCarter AmonGCarter0.958 1406.704 0.958 <NA> NA NA NA
7 AmonGCarter AmonGCarter0.976 1426.200 0.976 <NA> NA NA NA
8 AmonGCarter AmonGCarter1.018 1468.000 1.018 <NA> NA NA NA
9 AmonGCarter AmonGCarter1.035 1484.200 1.035 <NA> NA NA NA
> dput(merge(data,AmonGCarter.sa, all.y = T))
structure(list(Lake_name = c("AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter", "AmonGCarter",
"AmonGCarter"), Lake_name_percent = c("AmonGCarter0.724", "AmonGCarter0.74",
"AmonGCarter0.8", "AmonGCarter0.885", "AmonGCarter0.914", "AmonGCarter0.958",
"AmonGCarter0.976", "AmonGCarter1.018", "AmonGCarter1.035"),
surface_area = c(1159.1, 1175.8, 1239.65714285714, 1329.2,
1359.6, 1406.7037037037, 1426.2, 1468, 1484.2), percent2 = c(0.724,
0.74, 0.8, 0.885, 0.914, 0.958, 0.976, 1.018, 1.035), Lake_name_percent2 = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
), prev.surface_area = c(NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), percent3 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_), X = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_)), row.names = c(NA, -9L), class = "data.frame")
如何将 AmonGCarter
中的数据添加到保留 AlanHenry 先前数据的合并数据框中?
【问题讨论】:
【参考方案1】:您可以在tidyverse
中使用full_join
。这会创建额外的列(但我们用后缀控制名称),但是我们可以使用coalesce
来组合类似的列。然后,我们可以删除多余的列。
library(tidyverse)
data %>%
full_join(AmonGCarter.sa, by = 'Lake_name_percent', suffix = c('_x', '_y')) %>%
mutate(across(ends_with('_x'), ~ coalesce(., get(
sub('_x', '_y', cur_column())
)),
.names = 'sub("_x", "", .col)')) %>%
select(!ends_with('_x') & !ends_with('_y')) %>%
select(names(data))
另一个简单的方法是使用rquery
包中的natural_join
。它会将data
中的NA
值替换为AmonGCarter.sa
中的值。然后,要按照data
的原始顺序获取列,您可以使用select
from dplyr
。
library(rquery)
library(rqdatatable)
library(dplyr)
rquery::natural_join(data, AmonGCarter.sa, by = "Lake_name_percent", jointype = "FULL") %>%
dplyr::select(names(data))
输出
Lake_name Lake_name_percent surface_area percent2 Lake_name_percent2 prev.surface_area percent3 X
1 AlanHenry AlanHenry0.705 2109.350 0.705 AlanHenry0.708 2116.203 0.708 277
2 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.821 2355.950 0.821 303
3 AlanHenry AlanHenry0.82 2354.878 0.820 AlanHenry0.856 2426.508 0.856 341
4 AlanHenry AlanHenry0.912 2553.133 0.912 AlanHenry0.886 2505.336 0.886 243
5 AlanHenry AlanHenry0.966 2660.238 0.966 AlanHenry0.958 2637.476 0.958 211
6 AlanHenry AlanHenry1.009 2779.621 1.009 AlanHenry0.989 2694.901 0.989 173
7 AmonGCarter AmonGCarter0.724 1159.100 0.724 AmonGCarter0.67 NA NA 155
8 AmonGCarter AmonGCarter0.74 1175.800 0.740 AmonGCarter0.629 NA NA 568
9 AmonGCarter AmonGCarter0.8 1239.657 0.800 AmonGCarter0.885 NA NA 456
10 AmonGCarter AmonGCarter0.885 1329.200 0.885 AmonGCarter0.842 NA NA 244
11 AmonGCarter AmonGCarter0.914 1359.600 0.914 AmonGCarter0.958 NA NA 36
12 AmonGCarter AmonGCarter0.958 1406.704 0.958 AmonGCarter1.035 NA NA 417
13 AmonGCarter AmonGCarter0.976 1426.200 0.976 AmonGCarter1.075 NA NA 58
14 AmonGCarter AmonGCarter1.018 1468.000 1.018 AmonGCarter0.92 NA NA 603
15 AmonGCarter AmonGCarter1.035 1484.200 1.035 AmonGCarter0.918 NA NA 194
【讨论】:
tidyverse 解决方案效果很好,但是当我尝试 natural_join 时出现错误:rquery_apply_to_data_frame 中的错误(source,enode,env = env):rquery::rquery_apply_to_data_frame 没有默认执行程序或数据库(请尝试库('rqdatatable'))。我只是选择了 tidyverse 而不是添加更多库以上是关于将数据帧的一部分合并到另一个数据帧中,同时保留以前的数据的主要内容,如果未能解决你的问题,请参考以下文章
合并两个不同长度的python pandas数据帧,但将所有行保留在输出数据帧中
如何根据一个数据帧中的列值和R中另一个数据帧的列标题名称有条件地创建新列