合并具有多个分隔符的列

Posted

技术标签:

【中文标题】合并具有多个分隔符的列【英文标题】:Merge columns with multiple delimiters 【发布时间】:2022-01-17 19:46:46 【问题描述】:

如何合并具有可变数量分隔符的列,以便获得类似于输出的内容(假设所有内容都是字符)?

 dt1
   letter
1       a
2     b+c
3       c
4 d+e+f+g
5   a+g+e

 dt2
  letter number
1      a      1
2      b      2
3      c      3
4      d      4
5      e      5
6      f      6
7      g      7

> output
   letter  number
1       a       1
2     b+c     2+3
3       c       3
4 d+e+f+g 4+5+6+7
5   a+g+e   1+7+5
# Input table: every cell holds one or more keys joined by "+".
dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))

# Lookup table: one row per key with the (character) value it maps to.
dt2 <- data.frame(
  letter = letters[1:7],
  number = as.character(1:7)
)

# Desired result: the keys of each cell translated, keeping their order.
output <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  number = c("1", "2+3", "3", "4+5+6+7", "1+7+5")
)

【问题讨论】:

【参考方案1】:

一种快速易读的library(stringi) 方法:

library(stringi)

# Sample data from the question.
dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e")
)
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# With vectorize_all = FALSE every pattern/replacement pair from dt2 is
# applied to every element of dt1$letter, so each key is swapped for its
# value while the "+" separators stay untouched.
dt1[["number"]] <- stri_replace_all_fixed(
  str = dt1[["letter"]],
  pattern = dt2[["letter"]],
  replacement = dt2[["number"]],
  vectorize_all = FALSE
)

dt1

结果:

> dt1
   letter  number
1       a       1
2     b+c     2+3
3       c       3
4 d+e+f+g 4+5+6+7
5   a+g+e   1+7+5

另见related answer。


编辑:当前已有答案的基准测试结果:

Unit: microseconds
             expr     min      lq    mean  median      uq     max neval
            Sotos  2689.6  2689.6  2689.6  2689.6  2689.6  2689.6     1
    ismirsehregal    26.4    26.4    26.4    26.4    26.4    26.4     1
              www 42247.8 42247.8 42247.8 42247.8 42247.8 42247.8     1
 MerijnvanTilborg  1723.5  1723.5  1723.5  1723.5  1723.5  1723.5     1
    YuriySaraykin 21859.2 21859.2 21859.2 21859.2 21859.2 21859.2     1
          danlooo  4165.7  4165.7  4165.7  4165.7  4165.7  4165.7     1

重现该基准测试的代码:

# Benchmark of the answers posted in this thread, all run on the question's
# sample data. Each expression below is a copy of one answer's code.
library(microbenchmark)
library(tidyverse)
library(stringi)  

# dt1$letter holds "+"-separated keys; dt2 is the key -> value lookup table
# (both columns are character).
dt1 <- data.frame(letter=c("a","b+c","c","d+e+f+g","a+g+e"))
dt2 <- data.frame(letter=c("a","b","c","d","e","f","g"), number=c("1","2","3","4","5","6","7"))

# Every named argument is one candidate; the argument name is the label shown
# in the `expr` column of the result table.
microbenchmark(
  # Base R: split each cell on a literal "+", then keep the dt2 rows whose
  # letter is among the tokens. The `%in%` mask yields values in dt2's row
  # order, not in the order of the tokens.
  Sotos = 
    sapply(strsplit(dt1$letter, '+', fixed = TRUE), function(i)
      paste(dt2$number[dt2$letter %in% i], collapse = '+'))
  ,
  # stringi, literal matching: with vectorize_all = FALSE all dt2 pairs are
  # applied to every element of dt1$letter.
  ismirsehregal = 
    stri_replace_all_fixed(
      dt1$letter,
      pattern = dt2$letter,
      replacement = dt2$number,
      vectorize_all = FALSE
    )
  ,
  # tidyverse: tag each row with an ID, explode to one key per row, join the
  # lookup table, then paste the pieces back together per ID.
  www = 
    dt1 %>%
      mutate(ID = 1:n()) %>%
      separate_rows(letter, sep = "\\+") %>%
      left_join(dt2, by = "letter") %>%
      group_by(ID) %>%
      summarize(across(.fns = ~ paste0(., collapse = "+"))) %>%
      ungroup() %>%
      select(-ID)
  ,
  # Same replacement idea, but dt2$letter is interpreted as regular
  # expressions; the result is stored in a column named MerijnvanTilborg.
  MerijnvanTilborg = 
    dt1 %>% mutate(MerijnvanTilborg = stri_replace_all_regex(letter, dt2$letter, dt2$number, vectorize_all = F))
  ,
  # Splits per row, then uses match(): this pastes the *position* of each
  # token in dt2$letter, which coincides with dt2$number for this sample data.
  YuriySaraykin = 
    dt1 %>%
      rowwise() %>%
      mutate(tmp = str_split(letter, pattern = "\\+")) %>%
      ungroup() %>%
      mutate(number = map_chr(tmp, ~ paste0(match(.x, dt2$letter), collapse = "+"))) %>%
      select(-tmp)
  ,
  # Splits each cell, then looks every token up in deframe(dt2); note that
  # deframe(dt2) is evaluated again for each token.
  danlooo = 
    dt1 %>%
      as_tibble() %>%
      mutate(number = letter %>% map_chr(
        ~ .x %>%
          str_split("[+]") %>%
          simplify() %>%
          map_chr( ~ deframe(dt2)[.x]) %>%
          paste0(collapse = "+")
      ))
  ,
  # A single evaluation per expression (hence neval = 1 in the results), so
  # the timings are one sample each and should be read as rough figures.
  times = 1L
)

【讨论】:

【参考方案2】:

无需拆分任何数据,因为您只需要将特定字母替换为特定数字。

# The pipeline below needs %>% and mutate(); stringi alone does not provide
# them, so the tidyverse is attached as well.
library(tidyverse)
library(stringi)

dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  stringsAsFactors = FALSE
)
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7"),
  stringsAsFactors = FALSE
)

# With vectorize_all = FALSE every dt2 pattern/replacement pair is applied to
# every element of `letter`. The patterns are treated as regular expressions:
# if the keys may contain regex metacharacters, escape them or use
# stri_replace_all_fixed() instead.
dt1 %>%
  mutate(
    number = stri_replace_all_regex(
      letter, dt2$letter, dt2$number,
      vectorize_all = FALSE
    )
  )

   letter  number
1       a       1
2     b+c     2+3
3       c       3
4 d+e+f+g 4+5+6+7
5   a+g+e   1+7+5

另一种解决方案可能更短

# Attaches dplyr (%>% and mutate()) and stringr (str_replace_all()), which
# the snippet relies on.
library(tidyverse)

dt1 <- data.frame(
  letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"),
  stringsAsFactors = FALSE
)

# Named character vector: the names are the patterns to look for, the values
# are their replacements.
v <- c(a = "1", b = "2", c = "3", d = "4", e = "5", f = "6", g = "7")

# str_replace_all() accepts such a named vector and performs all of the
# replacements on every element of `letter`.
dt1 %>% mutate(number = str_replace_all(letter, v))

【讨论】:

【参考方案3】:
library(tidyverse)

dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

output <- dt1 %>%
  as_tibble() %>%
  mutate(
    # str_split() is vectorised and returns one character vector of keys per
    # row, so no rowwise() step is needed.
    tmp = str_split(letter, pattern = fixed("+")),
    # match() gives the row of each key in dt2; indexing dt2$number with it
    # returns the mapped value rather than the row position, so the lookup
    # stays correct when the numbers are not simply 1, 2, 3, ...
    number = map_chr(
      tmp,
      ~ paste0(dt2$number[match(.x, dt2$letter)], collapse = "+")
    )
  ) %>%
  select(-tmp)
output
#> # A tibble: 5 x 2
#>   letter  number
#>   <chr>   <chr>
#> 1 a       1
#> 2 b+c     2+3
#> 3 c       3
#> 4 d+e+f+g 4+5+6+7
#> 5 a+g+e   1+7+5

由reprex package (v2.0.1) 于 2021 年 12 月 14 日创建

【讨论】:

【参考方案4】:

使用tidyverse 的解决方案。

library(tidyverse)

# Translate every "+"-separated key in dt1$letter through dt2 (letter ->
# number): tag the rows, explode to one key per row, join the lookup table,
# then paste the pieces back together per original row.
output <- dt1 %>%
  # row_number() also works for a zero-row input, where 1:n() would yield
  # c(1, 0) and fail.
  mutate(ID = row_number()) %>%
  separate_rows(letter, sep = "\\+") %>%
  left_join(dt2, by = "letter") %>%
  group_by(ID) %>%
  # The columns are selected explicitly: calling across() without `.cols` is
  # deprecated since dplyr 1.1.0. everything() skips the grouping column.
  summarize(
    across(everything(), ~ paste0(.x, collapse = "+")),
    .groups = "drop"
  ) %>%
  select(-ID)
output
# # A tibble: 5 x 2
#   letter  number
#   <chr>   <chr>
# 1 a       1
# 2 b+c     2+3
# 3 c       3
# 4 d+e+f+g 4+5+6+7
# 5 a+g+e   1+7+5

【讨论】:

【参考方案5】:

一个 base R(不依赖额外的包)的解决方案可以是:

# Base R: translate every "+"-separated key in dt1$letter through the dt2
# lookup table (letter -> number).
# match() returns the dt2 row of each key in the order the keys appear in the
# cell, so "a+g+e" becomes "1+7+5". A `dt2$letter %in% keys` mask would return
# the values in dt2's row order ("1+5+7") instead.
# A key that is missing from dt2 shows up as "NA" in the result.
# vapply() guarantees a character vector even for zero-row input.
dt1$res <- vapply(
  strsplit(dt1$letter, "+", fixed = TRUE),
  function(keys) paste(dt2$number[match(keys, dt2$letter)], collapse = "+"),
  character(1)
)

#   letter     res
#1       a       1
#2     b+c     2+3
#3       c       3
#4 d+e+f+g 4+5+6+7
#5   a+g+e   1+7+5

【讨论】:

是否有机会改进代码,使其实际保持合并后各元素的顺序?我的问题实际上涉及一个非常复杂的数据集,一些单元格会得到例如 letter=d+e+f+g 而 res=5+4+6+7 这样的结果。我相信这一定与 dt2 中数据的排序方式有关。@Sotos

【参考方案6】:

library(tidyverse)
dt1 <- data.frame(letter = c("a", "b+c", "c", "d+e+f+g", "a+g+e"))
dt2 <- data.frame(
  letter = c("a", "b", "c", "d", "e", "f", "g"),
  number = c("1", "2", "3", "4", "5", "6", "7")
)

# Named lookup vector (names = dt2$letter, values = dt2$number). It is built
# once here instead of once per token.
lookup <- deframe(dt2)

output <- dt1 %>%
  as_tibble() %>%
  mutate(
    # str_split() returns one character vector of keys per row; indexing the
    # lookup vector by name translates all keys of a row in a single step and
    # keeps their order. This also avoids purrr's deprecated simplify().
    number = map_chr(
      str_split(letter, fixed("+")),
      ~ paste0(lookup[.x], collapse = "+")
    )
  )
output
#> # A tibble: 5 x 2
#>   letter  number
#>   <chr>   <chr>
#> 1 a       1
#> 2 b+c     2+3
#> 3 c       3
#> 4 d+e+f+g 4+5+6+7
#> 5 a+g+e   1+7+5

由reprex package (v2.0.1) 于 2021 年 12 月 14 日创建

【讨论】:

以上是关于合并具有多个分隔符的列的主要内容,如果未能解决你的问题,请参考以下文章

使用逗号分隔符将单个 CSV 列批量转换为多个

shell中的cut和paste函数,可以从多个文本中提取特定的列

合并 hive 中的列

sql把相同条件的列值合并在一起,并用“;”分隔

postgres 9.4将多个分隔符上的列拆分为新列

选择具有相同列的所有行以分隔mysql中的列