合并具有多个分隔符的列
Posted
技术标签:
【中文标题】合并具有多个分隔符的列【英文标题】:Merge columns with multiple delimiters 【发布时间】:2022-01-17 19:46:46 【问题描述】:如何合并具有可变数量分隔符的列,以便获得类似于输出的内容(假设所有内容都是字符)?
dt1
letter
1 a
2 b+c
3 c
4 d+e+f+g
5 a+g+e
dt2
letter number
1 a 1
2 b 2
3 c 3
4 d 4
5 e 5
6 f 6
7 g 7
> output
letter number
1 a 1
2 b+c 2+3
3 c 3
4 d+e+f+g 4+5+6+7
5 a+g+e 1+7+5
# Example inputs from the question. `dt1` holds "+"-delimited letter
# combinations; `dt2` maps each single letter to a number (stored as text).
dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = letters[1:7],
  number = as.character(1:7)
)
# Desired result: each letter in `dt1$letter` replaced by its number from
# `dt2`, keeping the original order and the "+" separators.
output <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  number = c("1", "2+3", "3", "4+5+6+7", "1+7+5")
)
【问题讨论】:
【参考方案1】:一种快速且易读的、基于 library(stringi) 的方法:
library(stringi)

# Sample data: "+"-separated letter combinations and a letter -> number table.
dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# vectorize_all = FALSE applies every pattern/replacement pair to each element
# of the `letter` column, so a whole "a+g+e" string is translated in one call.
# The pairs are applied one after another on literal (fixed) text.
dt1[["number"]] <- stri_replace_all_fixed(
  str = dt1[["letter"]],
  pattern = dt2[["letter"]],
  replacement = dt2[["number"]],
  vectorize_all = FALSE
)
dt1
结果:
> dt1
letter number
1 a 1
2 b+c 2+3
3 c 3
4 d+e+f+g 4+5+6+7
5 a+g+e 1+7+5
另见related answer。
编辑:当前可用答案的基准:
Unit: microseconds
expr min lq mean median uq max neval
Sotos 2689.6 2689.6 2689.6 2689.6 2689.6 2689.6 1
ismirsehregal 26.4 26.4 26.4 26.4 26.4 26.4 1
www 42247.8 42247.8 42247.8 42247.8 42247.8 42247.8 1
MerijnvanTilborg 1723.5 1723.5 1723.5 1723.5 1723.5 1723.5 1
YuriySaraykin 21859.2 21859.2 21859.2 21859.2 21859.2 21859.2 1
danlooo 4165.7 4165.7 4165.7 4165.7 4165.7 4165.7 1
重现基准:
# Reproduces the benchmark table: each answer's approach is timed on the same
# small sample data with microbenchmark().
library(microbenchmark)
library(tidyverse)
library(stringi)
# Sample data shared by all benchmarked expressions.
dt1 <- data.frame(letter=c("a","b+c","c","d+e+f+g","a+g+e"))
dt2 <- data.frame(letter=c("a","b","c","d","e","f","g"), number=c("1","2","3","4","5","6","7"))
microbenchmark(
# Base R: split on the literal "+" and select numbers with a logical %in% mask
# over dt2$letter (the mask selects in dt2 row order).
Sotos =
sapply(strsplit(dt1$letter, '+', fixed = TRUE), function(i)
paste(dt2$number[dt2$letter %in% i], collapse = '+'))
,
# stringi: replace every letter by its number using fixed (literal) patterns.
ismirsehregal =
stri_replace_all_fixed(
dt1$letter,
pattern = dt2$letter,
replacement = dt2$number,
vectorize_all = FALSE
)
,
# tidyverse: one row per letter, join against dt2, then collapse per original
# row (tracked by the helper column ID).
www =
dt1 %>%
mutate(ID = 1:n()) %>%
separate_rows(letter, sep = "\\+") %>%
left_join(dt2, by = "letter") %>%
group_by(ID) %>%
summarize(across(.fns = ~ paste0(., collapse = "+"))) %>%
ungroup() %>%
select(-ID)
,
# stringi inside mutate(): same replacement idea, but the patterns are treated
# as regular expressions; the new column is named MerijnvanTilborg here.
MerijnvanTilborg =
dt1 %>% mutate(MerijnvanTilborg = stri_replace_all_regex(letter, dt2$letter, dt2$number, vectorize_all = F))
,
# tidyverse: split into a list column, then paste the match() positions of the
# pieces within dt2$letter (positions, not values read from dt2$number).
YuriySaraykin =
dt1 %>%
rowwise() %>%
mutate(tmp = str_split(letter, pattern = "\\+")) %>%
ungroup() %>%
mutate(number = map_chr(tmp, ~ paste0(match(.x, dt2$letter), collapse = "+"))) %>%
select(-tmp)
,
# tidyverse: per element, split on "+" and look each piece up in the named
# vector produced by deframe(dt2) (rebuilt for every piece).
danlooo =
dt1 %>%
as_tibble() %>%
mutate(number = letter %>% map_chr(
~ .x %>%
str_split("[+]") %>%
simplify() %>%
map_chr( ~ deframe(dt2)[.x]) %>%
paste0(collapse = "+")
))
,
# Each expression is evaluated exactly once, so min, mean and max coincide in
# the printed summary.
times = 1L
)
【讨论】:
【参考方案2】:无需拆分任何数据,因为您只需要将特定字母替换为特定数字。
# Sample data. stringsAsFactors = FALSE keeps both columns as character on
# R < 4.0 as well.
dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  stringsAsFactors = FALSE
)
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7"),
  stringsAsFactors = FALSE
)

library(dplyr) # provides %>% and mutate(), which the pipeline below relies on
library(stringi)

# Replace every letter by its number without splitting the strings.
# vectorize_all = FALSE applies all pattern/replacement pairs to each element.
# Note: the patterns are interpreted as regular expressions, so keys that
# contain regex metacharacters would need escaping (or
# stri_replace_all_fixed()).
dt1 %>%
  mutate(
    number = stri_replace_all_regex(
      letter,
      pattern = dt2$letter,
      replacement = dt2$number,
      vectorize_all = FALSE
    )
  )
letter number
1 a 1
2 b+c 2+3
3 c 3
4 d+e+f+g 4+5+6+7
5 a+g+e 1+7+5
另一种解决方案可能更短
library(dplyr)   # %>% and mutate()
library(stringr) # str_replace_all()

dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  stringsAsFactors = FALSE
)

# Named character vector: the names are the patterns to look for and the
# values are their replacements. str_replace_all() treats the names as
# regular expressions.
v <- c("1", "2", "3", "4", "5", "6", "7")
names(v) <- c("a", "b", "c", "d", "e", "f", "g")

dt1 %>%
  mutate(number = str_replace_all(letter, v))
【讨论】:
【参考方案3】:
dt1<-data.frame(letter=c("a","b+c","c","d+e+f+g","a+g+e"))
# Lookup table: one row per letter with the number it maps to.
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)
library(tidyverse)

# Split every entry on "+", translate the pieces and glue them back together.
# match() gives the row of each piece in dt2, in the order of the pieces; that
# row index is then used to read dt2$number. Pasting the bare match() result
# would only be correct while the numbers happen to equal the row positions.
# str_split() is vectorised over the column, so no rowwise() step or temporary
# list column is needed; as_tibble() keeps the result a tibble.
dt1 %>%
  as_tibble() %>%
  mutate(
    number = map_chr(
      str_split(letter, pattern = "\\+"),
      ~ paste0(dt2$number[match(.x, dt2$letter)], collapse = "+")
    )
  )
#> # A tibble: 5 x 2
#> letter number
#> <chr> <chr>
#> 1 a 1
#> 2 b+c 2+3
#> 3 c 3
#> 4 d+e+f+g 4+5+6+7
#> 5 a+g+e 1+7+5
由reprex package (v2.0.1) 于 2021 年 12 月 14 日创建
【讨论】:
【参考方案4】:使用 tidyverse 的解决方案。
library(tidyverse)

# Expects `dt1` (column `letter`) and `dt2` (columns `letter`, `number`) from
# the question to exist already.
# 1. Tag every original row with an ID (row_number() is also safe for a
#    zero-row input, where 1:n() would yield c(1, 0)).
# 2. Put each "+"-separated letter on its own row and join its number.
# 3. Collapse letters and numbers back into one "+"-separated string per ID.
#    across() gets an explicit everything(): calling it without `.cols` is
#    deprecated in recent dplyr releases.
output <- dt1 %>%
  mutate(ID = row_number()) %>%
  separate_rows(letter, sep = "\\+") %>%
  left_join(dt2, by = "letter") %>%
  group_by(ID) %>%
  summarize(across(everything(), ~ paste0(.x, collapse = "+"))) %>%
  ungroup() %>%
  select(-ID)
output
# # A tibble: 5 x 2
# letter number
# <chr> <chr>
# 1 a 1
# 2 b+c 2+3
# 3 c 3
# 4 d+e+f+g 4+5+6+7
# 5 a+g+e 1+7+5
【讨论】:
【参考方案5】:一个 base R 的解决方案可以是:
# Base R: split each entry on the literal "+", look every piece up in dt2 and
# glue the numbers back together with "+".
# match() returns one position per piece, in the order of the pieces, so the
# numbers follow the order of the letters ("a+g+e" -> "1+7+5"). A logical
# `dt2$letter %in% i` mask would instead return them in dt2 row order
# ("1+5+7"). Letters that are missing from dt2 show up as "NA".
# vapply() guarantees exactly one string per row.
dt1$res <- vapply(
  strsplit(as.character(dt1$letter), "+", fixed = TRUE),
  function(i) paste(dt2$number[match(i, dt2$letter)], collapse = "+"),
  character(1)
)
# letter res
#1 a 1
#2 b+c 2+3
#3 c 3
#4 d+e+f+g 4+5+6+7
#5 a+g+e 1+7+5
【讨论】:
是否有办法改进这段代码,使其真正保持合并后各元素的顺序?我的实际数据集非常复杂,有些单元格会得到这样的结果,例如:letter=d+e+f+g 而 res=5+4+6+7。我认为这一定与 dt2 中各项的排列顺序有关。 @Sotos
【参考方案6】:
library(tidyverse)
dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# Named vector (names = letters, values = numbers), built once up front
# instead of once per token.
lookup <- deframe(dt2)

# Split every entry on "+", index the lookup vector with all pieces of an
# entry at once (this keeps the order of the pieces) and collapse the numbers
# back into a single "+"-separated string.
dt1 %>%
  as_tibble() %>%
  mutate(
    number = map_chr(
      str_split(letter, "[+]"),
      ~ paste0(lookup[.x], collapse = "+")
    )
  )
#> # A tibble: 5 x 2
#> letter number
#> <chr> <chr>
#> 1 a 1
#> 2 b+c 2+3
#> 3 c 3
#> 4 d+e+f+g 4+5+6+7
#> 5 a+g+e 1+7+5
由reprex package (v2.0.1) 于 2021 年 12 月 14 日创建
【讨论】:
以上是关于合并具有多个分隔符的列的主要内容,如果未能解决你的问题,请参考以下文章