将数据框中的两个混乱向量拆分为一个公共列

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了将数据框中的两个混乱向量拆分为一个公共列相关的知识,希望对你有一定的参考价值。

数据集样本:

library(dplyr)
# Example data: a 30-row tibble with 16 columns, built with structure().
# Some Athlete and Nationality values end with a country code in parentheses
# (e.g. "Laurent Hernu(FRA)", "Japan(JPN)"); other Nationality values are NA
# or carry no code at all (e.g. "Hungary").
# NOTE(review): the object shares its name with base::sample().
sample <- structure(list(Rank = c(15, 17, 20, 2, 16, 8, 21, 5, 13, 31, 22, 18, 2, 19, 11, 11, 8, 7, 12, 9, 5, 23, 17, 16, 15, 14, 4, 20, 13, 2), Athlete = c("François Gourmet(BEL)", "Agustín Félix(ESP)", "Keisuke Ushiro", "Michael Schrader", "Pieter Braun", "Laurent Hernu(FRA)", "Dmitriy Karpov", "Laurent Hernu(FRA)", "Thomas van der Plaetsen", "Attila Szabó", "Nadir El Fassi", "Eduard Mikhan", "Leonel Suárez", "Janek Õiglane", "Hans van Alphen(BEL)", "Roman Šebrle", "André Niklaus(GER)", "Pascal Behrenbruch", "Pieter Braun", "Oleksandr Yurkov(UKR)", "Eelco Sintnicolaas", "Brent Newdick", "Kim Kun-woo", "Akihiko Nakamura", "Bastien Auzeil", "Frédéric Xhonneux", "Janek Õiglane", "Keisuke Ushiro", "Roman Šebrle", "Rico Freimuth"), Total = c(7974, 7749, 7498, 8670, 7890, 8280, 7550, 8218, 8069, 7610, 7922, 7968, 8640, 7581, 8034, 8266, 8020, 8211, 8114, 8264, 8298, 7915, 7860, 7745, 7922, 7616, 8371, 7532, 8069, 8564), `100m` = c(10.67, 11.17, 11.53, 10.73, 11.22, 10.97, 11.24, 11.2, 11.2, 11.15, 11.12, 10.97, 11.13, 11.51, 11.11, 11.16, 11.19, 11.08, 11.11, 10.93, 10.76, 11.11, 11.11, 10.86, 11.35, 11.28, 11.08, 11.51, 11.25, 10.53), LJ = c(7.15, 7.12, 6.64, 7.85, 7.17, 7.31, 6.86, 7.22, 7.79, 7.09, 7.26, 7.42, 7.24, 6.78, 7.35, 7.8, 7.21, 6.8, 7.29, 7.37, 7.29, 7.42, 7.24, 7.26, 6.87, 7.21, 7.33, 6.73, 7.3, 7.48), SP = c(13.74, 13.29, 13.43, 14.56, 14.48, 14.43, 15.69, 13.99, 12.76, 13.92, 13.62, 14.15, 15.2, 14.43, 14.67, 14.98, 13.87, 16.01, 13.9, 15.15, 14.13, 14.35, 12.96, 11.67, 15.23, 12.92, 15.13, 14.93, 15.2, 14.85), HJ = c(1.85, 2.03, 1.96, 1.99, 1.93, 2.03, 1.93, 2.03, 2.17, 1.84, 1.99, 1.96, 2.11, 1.92, 1.88, 2.11, 1.97, 1.93, 2.04, 1.97, 1.93, 1.99, 1.96, 1.95, 1.96, 2.03, 2.05, 1.89, 2.05, 1.99), `400m` = c(47.98, 52.08, 51.43, 47.66, 48.54, 49.31, 52.01, 48.95, 49.46, 49.79, 51.35, 48.8, 48, 50.95, 48.52, 50.42, 49.95, 49.9, 48.24, 49.45, 48.35, 50.1, 49.24, 47.81, 50.36, 49.04, 49.58, 50.85, 51.18, 48.41), `110mh` = c(15.02, 14.75, 15.35, 14.29, 
14.67, 14.01, 14.64, 14.15, 14.79, 14.65, 14.9, 14.82, 14.45, 15.33, 14.77, 14.44, 14.5, 14.33, 14.37, 14.41, 14.42, 14.82, 14.95, 14.72, 14.59, 15.75, 14.56, 15.43, 14.75, 13.68), DT = c(39.87, 43.67, 47.64, 46.44, 42.59, 43.93, 47.1, 46.13, 37.2, 43.75, 42.25, 48, 44.71, 40.94, 44.3, 46.3, 42.68, 48.56, 42.09, 48.1, 42.23, 43.6, 39.53, 33.48, 46.86, 38.62, 42.11, 46.85, 46.93, 51.17), PV = c(5, 5, 4.6, 5, 4.7, 5.1, 4.8, 4.9, 5.1, 4.4, 4.8, 4.6, 5, 4.6, 4.3, 4.6, 5.1, 4.9, 4.9, 5, 5.2, 4.8, 4.9, 4.7, 4.8, 4.7, 5.1, 4.7, 4.8, 4.8), JT = c(57.73, 56.69, 63.28, 65.67, 59.26, 59.9, 46.91, 59.63, 58.91, 59.56, 57.65, 50.74, 75.19, 68.51, 65.71, 65.61, 57.55, 66.5, 56.95, 58.63, 61.07, 51.52, 53.33, 53.57, 60.8, 50.18, 71.73, 56.52, 67.28, 62.34), `1500m` = c(265.51, 288.27, 291.9, 265.38, 278.4, 277.41, 298.41, 268.4, 285.86, 285.64, 256.51, 273.71, 267.25, 283.06, 262.5, 290.33, 268.8, 276.64, 272.46, 278.43, 265.4, 270.57, 255.63, 256.36, 279.8, 262.71, 279.24, 283.51, 296.5, 281.57), Year = structure(c(4L, 4L, 9L, 7L, 9L, 1L, 6L, 2L, 6L, 5L, 5L, 7L, 5L, 8L, 4L, 5L, 2L, 6L, 8L, 1L, 6L, 5L, 6L, 8L, 9L,     3L, 9L, 8L, 6L, 9L), .Label = c("2001", "2003", "2005", "2007",     "2009", "2011", "2013", "2015", "2017"), class = "factor"),     Nationality = c(NA, NA, "Japan(JPN)", "Germany(GER)", "Netherlands(NED)",     NA, "Kazakhstan(KAZ)", NA, "Belgium(BEL)", "Hungary", "France",     "Belarus(BLR)", "Cuba", "Estonia(EST)", NA, "Czech Republic",     NA, "Germany(GER)", "Netherlands(NED)", NA, "Netherlands(NED)",     "New Zealand", "South Korea(KOR)", "Japan(JPN)", "France(FRA)",     NA, "Estonia(EST)", "Japan(JPN)", "Czech Republic(CZE)",     "Germany(GER)"), Notes = c(NA, NA, NA, "PB", NA, NA, NA,     NA, NA, NA, "SB", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,     "PB", "NR", NA, "SB", NA, "PB", NA, NA, NA)), .Names = c("Rank", "Athlete", "Total", "100m", "LJ", "SP", "HJ", "400m", "110mh", "DT", "PV", "JT", "1500m", "Year", "Nationality", "Notes"), row.names = c(NA, -30L), 
class = c("tbl_df", "tbl", "data.frame"))

# A tibble: 30 x 16
    Rank                 Athlete Total `100m`    LJ    SP    HJ `400m` `110mh`    DT    PV    JT `1500m`   Year      Nationality Notes
   <dbl>                   <chr> <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>   <dbl> <dbl> <dbl> <dbl>   <dbl> <fctr>            <chr> <chr>
 1    15   François Gourmet(BEL)  7974  10.67  7.15 13.74  1.85  47.98   15.02 39.87   5.0 57.73  265.51   2007             <NA>  <NA>
 2    17      Agustín Félix(ESP)  7749  11.17  7.12 13.29  2.03  52.08   14.75 43.67   5.0 56.69  288.27   2007             <NA>  <NA>
 3    20          Keisuke Ushiro  7498  11.53  6.64 13.43  1.96  51.43   15.35 47.64   4.6 63.28  291.90   2017       Japan(JPN)  <NA>
 4     2        Michael Schrader  8670  10.73  7.85 14.56  1.99  47.66   14.29 46.44   5.0 65.67  265.38   2013     Germany(GER)    PB
 5    16            Pieter Braun  7890  11.22  7.17 14.48  1.93  48.54   14.67 42.59   4.7 59.26  278.40   2017 Netherlands(NED)  <NA>
 6     8      Laurent Hernu(FRA)  8280  10.97  7.31 14.43  2.03  49.31   14.01 43.93   5.1 59.90  277.41   2001             <NA>  <NA>
 7    21          Dmitriy Karpov  7550  11.24  6.86 15.69  1.93  52.01   14.64 47.10   4.8 46.91  298.41   2011  Kazakhstan(KAZ)  <NA>
 8     5      Laurent Hernu(FRA)  8218  11.20  7.22 13.99  2.03  48.95   14.15 46.13   4.9 59.63  268.40   2003             <NA>  <NA>
 9    13 Thomas van der Plaetsen  8069  11.20  7.79 12.76  2.17  49.46   14.79 37.20   5.1 58.91  285.86   2011     Belgium(BEL)  <NA>
10    31            Attila Szabó  7610  11.15  7.09 13.92  1.84  49.79   14.65 43.75   4.4 59.56  285.64   2009          Hungary  <NA>
# ... with 20 more rows

我的数据集中有两个字符向量(列)“Athlete”(运动员)和“Nationality”(国籍),其中一些条目的末尾附有用括号括起来的国家代码。我希望把这两列中的国家代码提取到一个新变量(比如“countrycode”)中,同时去掉括号。我不确定拆分的最佳方法或语法是什么——也许可以用 tidyr::separate?不过我不清楚拆分时该如何处理括号内国家代码的字符,也不清楚该如何应对某些条目根本不需要拆分的情况。

然后,我会用类似下面的代码去掉新变量中的括号。

# Remove both the opening and the closing parenthesis from the extracted codes
# in a single pass: the character class [()] matches either bracket, so one
# gsub() call replaces the two separate passes.
sample$countrycode <- gsub(pattern = "[()]", replacement = "", x = sample$countrycode)

谢谢

答案

希望这对你有用:

library(dplyr)

# Build `countrycode` from whichever column carries a parenthesised code.
# The two extraction branches are mutually exclusive (the Athlete branch
# requires Nationality to be NA, the Nationality branch requires it to contain
# "("), so their order does not affect the result.
res <- mutate(
  sample,
  countrycode = case_when(
    # Nationality such as "Japan(JPN)": keep only the text inside the brackets.
    grepl("\\(", Nationality) ~ gsub(".*?\\((.*)\\)", "\\1", Nationality),
    # Nationality missing but Athlete such as "Laurent Hernu(FRA)".
    is.na(Nationality) & grepl("\\(", Athlete) ~ gsub(".*?\\((.*)\\)", "\\1", Athlete),
    # No code anywhere: fall back to the raw Nationality value (may be NA).
    TRUE ~ Nationality
  )
)

样本输出:

# Print only the two source columns and the derived code column.
res %>% select(Athlete, Nationality, countrycode)
# # A tibble: 30 x 3
#    Athlete                 Nationality      countrycode
# <chr>                   <chr>            <chr>     
# 1 François Gourmet(BEL)   NA               BEL       
# 2 Agustín Félix(ESP)      NA               ESP       
# 3 Keisuke Ushiro          Japan(JPN)       JPN       
# 4 Michael Schrader        Germany(GER)     GER       
# 5 Pieter Braun            Netherlands(NED) NED       
# 6 Laurent Hernu(FRA)      NA               FRA       
# 7 Dmitriy Karpov          Kazakhstan(KAZ)  KAZ       
# 8 Laurent Hernu(FRA)      NA               FRA       
# 9 Thomas van der Plaetsen Belgium(BEL)     BEL       
# 10 Attila Szabó            Hungary          Hungary   
# # ... with 20 more rows

正如 Frank 在评论中指出的,删除 TRUE ~ Nationality 即可只提取国家/地区代码(没有代码的条目将得到 NA):

# Variant without a fallback branch: rows where neither column carries a
# parenthesised code match no condition, so case_when() leaves them as NA.
# The two branches are mutually exclusive, so their order is irrelevant.
mutate(
  sample,
  countrycode = case_when(
    # Nationality such as "Japan(JPN)": keep only the text inside the brackets.
    grepl("\\(", Nationality) ~ gsub(".*?\\((.*)\\)", "\\1", Nationality),
    # Nationality missing but Athlete such as "Laurent Hernu(FRA)".
    is.na(Nationality) & grepl("\\(", Athlete) ~ gsub(".*?\\((.*)\\)", "\\1", Athlete)
  )
)
另一答案

一个丑陋的方法是使用sub

library(data.table)
# Work on a data.table copy of the sample data.
DT <- data.table(sample)

# A three-character code in parentheses anywhere in the string; the whole
# string is replaced by the captured code.
patt <- "^.*\\((.{3})\\).*$"
rp <- "\\1"

# Fill `cc` by reference, first from Athlete and then from Nationality, so a
# code found in Nationality overwrites one taken from Athlete. Rows matching
# neither pattern keep cc = NA.
DT[grepl(patt, Athlete), cc := sub(patt, rp, Athlete)]
DT[grepl(patt, Nationality), cc := sub(patt, rp, Nationality)]

如果您已经在使用 tidyverse 系列软件包,那么 stringr 包中的 str_extract 可能会更清晰。另外,如果想用 dplyr 写出与上面代码等价的版本,可以看看 case_when 函数。(我对这些工具不太熟悉,不清楚确切的语法。)

结果看起来像......

> DT[, .(Athlete, Nationality, cc)]
                    Athlete         Nationality  cc
 1:   François Gourmet(BEL)                  NA BEL
 2:      Agustín Félix(ESP)                  NA ESP
 3:          Keisuke Ushiro          Japan(JPN) JPN
 4:        Michael Schrader        Germany(GER) GER
 5:            Pieter Braun    Netherlands(NED) NED
 6:      Laurent Hernu(FRA)                  NA FRA
 7:          Dmitriy Karpov     Kazakhstan(KAZ) KAZ
 8:      Laurent Hernu(FRA)                  NA FRA
 9: Thomas van der Plaetsen        Belgium(BEL) BEL
10:            Attila Szabó             Hungary  NA
11:          Nadir El Fassi              France  NA
12:           Eduard Mikhan        Belarus(BLR) BLR
13:           Leonel Suárez                Cuba  NA
14:           Janek Õiglane        Estonia(EST) EST
15:    Hans van Alphen(BEL)                  NA BEL
16:            Roman Šebrle      Czech Republic  NA
17:      André Niklaus(GER)                  NA GER
18:      Pascal Behrenbruch        Germany(GER) GER
19:            Pieter Braun    Netherlands(NED) NED
20:   Oleksandr Yurkov(UKR)                  NA UKR
21:      Eelco Sintnicolaas    Netherlands(NED) NED
22:           Brent Newdick         New Zealand  NA
23:             Kim Kun-woo    South Korea(KOR) KOR
24:        Akihiko Nakamura          Japan(JPN) JPN
25:          Bastien Auzeil         France(FRA) FRA
26:       Frédéric Xhonneux                  NA  NA
27:           Janek Õiglane        Estonia(EST) EST
28:          Keisuke Ushiro          Japan(JPN) JPN
29:            Roman Šebrle Czech Republic(CZE) CZE
30:           Rico Freimuth        Germany(GER) GER
                    Athlete         Nationality  cc
另一答案

这个简单的解决方案也有效

library(stringr)
# Extract the upper-case code enclosed in parentheses from Nationality.
# str_extract() is vectorised and returns NA for NA input or when there is no
# parenthesised code, so no sapply() loop is needed. The lookbehind/lookahead
# anchor the match to the brackets without including them. Picking "the second
# run of capitals" instead would return "K" for "South Korea(KOR)" and "Z" for
# "New Zealand".
# NOTE(review): `data1` is not defined in the visible snippet; presumably it is
# the data frame holding the Nationality column -- confirm.
data1$country_code <- str_extract(data1$Nationality, "(?<=\\()[A-Z]+(?=\\))")

        Nationality country_code
1:               NA           NA
2:               NA           NA
3:       Japan(JPN)          JPN
4:     Germany(GER)          GER
5: Netherlands(NED)          NED
6:               NA           NA

以上是关于将数据框中的两个混乱向量拆分为一个公共列的主要内容,如果未能解决你的问题,请参考以下文章

将 dict 元组拆分为数据框中的单个记录

将数据框中的每一行除以 Python 中的向量

将数据框中的 1 列拆分为 2 列 [重复]

将字符串(或字符串列表)拆分为 spark 数据框中的各个列

将数据框列拆分为 R 中的向量

将数据框中的结构类型列拆分为多列