将数据框中的两个混乱向量拆分为一个公共列
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了将数据框中的两个混乱向量拆分为一个公共列相关的知识,希望对你有一定的参考价值。
数据集样本:
library(dplyr)
sample <- structure(list(Rank = c(15, 17, 20, 2, 16, 8, 21, 5, 13, 31, 22, 18, 2, 19, 11, 11, 8, 7, 12, 9, 5, 23, 17, 16, 15, 14, 4, 20, 13, 2), Athlete = c("François Gourmet(BEL)", "Agustín Félix(ESP)", "Keisuke Ushiro", "Michael Schrader", "Pieter Braun", "Laurent Hernu(FRA)", "Dmitriy Karpov", "Laurent Hernu(FRA)", "Thomas van der Plaetsen", "Attila Szabó", "Nadir El Fassi", "Eduard Mikhan", "Leonel Suárez", "Janek Õiglane", "Hans van Alphen(BEL)", "Roman Šebrle", "André Niklaus(GER)", "Pascal Behrenbruch", "Pieter Braun", "Oleksandr Yurkov(UKR)", "Eelco Sintnicolaas", "Brent Newdick", "Kim Kun-woo", "Akihiko Nakamura", "Bastien Auzeil", "Frédéric Xhonneux", "Janek Õiglane", "Keisuke Ushiro", "Roman Šebrle", "Rico Freimuth"), Total = c(7974, 7749, 7498, 8670, 7890, 8280, 7550, 8218, 8069, 7610, 7922, 7968, 8640, 7581, 8034, 8266, 8020, 8211, 8114, 8264, 8298, 7915, 7860, 7745, 7922, 7616, 8371, 7532, 8069, 8564), `100m` = c(10.67, 11.17, 11.53, 10.73, 11.22, 10.97, 11.24, 11.2, 11.2, 11.15, 11.12, 10.97, 11.13, 11.51, 11.11, 11.16, 11.19, 11.08, 11.11, 10.93, 10.76, 11.11, 11.11, 10.86, 11.35, 11.28, 11.08, 11.51, 11.25, 10.53), LJ = c(7.15, 7.12, 6.64, 7.85, 7.17, 7.31, 6.86, 7.22, 7.79, 7.09, 7.26, 7.42, 7.24, 6.78, 7.35, 7.8, 7.21, 6.8, 7.29, 7.37, 7.29, 7.42, 7.24, 7.26, 6.87, 7.21, 7.33, 6.73, 7.3, 7.48), SP = c(13.74, 13.29, 13.43, 14.56, 14.48, 14.43, 15.69, 13.99, 12.76, 13.92, 13.62, 14.15, 15.2, 14.43, 14.67, 14.98, 13.87, 16.01, 13.9, 15.15, 14.13, 14.35, 12.96, 11.67, 15.23, 12.92, 15.13, 14.93, 15.2, 14.85), HJ = c(1.85, 2.03, 1.96, 1.99, 1.93, 2.03, 1.93, 2.03, 2.17, 1.84, 1.99, 1.96, 2.11, 1.92, 1.88, 2.11, 1.97, 1.93, 2.04, 1.97, 1.93, 1.99, 1.96, 1.95, 1.96, 2.03, 2.05, 1.89, 2.05, 1.99), `400m` = c(47.98, 52.08, 51.43, 47.66, 48.54, 49.31, 52.01, 48.95, 49.46, 49.79, 51.35, 48.8, 48, 50.95, 48.52, 50.42, 49.95, 49.9, 48.24, 49.45, 48.35, 50.1, 49.24, 47.81, 50.36, 49.04, 49.58, 50.85, 51.18, 48.41), `110mh` = c(15.02, 14.75, 15.35, 14.29, 
14.67, 14.01, 14.64, 14.15, 14.79, 14.65, 14.9, 14.82, 14.45, 15.33, 14.77, 14.44, 14.5, 14.33, 14.37, 14.41, 14.42, 14.82, 14.95, 14.72, 14.59, 15.75, 14.56, 15.43, 14.75, 13.68), DT = c(39.87, 43.67, 47.64, 46.44, 42.59, 43.93, 47.1, 46.13, 37.2, 43.75, 42.25, 48, 44.71, 40.94, 44.3, 46.3, 42.68, 48.56, 42.09, 48.1, 42.23, 43.6, 39.53, 33.48, 46.86, 38.62, 42.11, 46.85, 46.93, 51.17), PV = c(5, 5, 4.6, 5, 4.7, 5.1, 4.8, 4.9, 5.1, 4.4, 4.8, 4.6, 5, 4.6, 4.3, 4.6, 5.1, 4.9, 4.9, 5, 5.2, 4.8, 4.9, 4.7, 4.8, 4.7, 5.1, 4.7, 4.8, 4.8), JT = c(57.73, 56.69, 63.28, 65.67, 59.26, 59.9, 46.91, 59.63, 58.91, 59.56, 57.65, 50.74, 75.19, 68.51, 65.71, 65.61, 57.55, 66.5, 56.95, 58.63, 61.07, 51.52, 53.33, 53.57, 60.8, 50.18, 71.73, 56.52, 67.28, 62.34), `1500m` = c(265.51, 288.27, 291.9, 265.38, 278.4, 277.41, 298.41, 268.4, 285.86, 285.64, 256.51, 273.71, 267.25, 283.06, 262.5, 290.33, 268.8, 276.64, 272.46, 278.43, 265.4, 270.57, 255.63, 256.36, 279.8, 262.71, 279.24, 283.51, 296.5, 281.57), Year = structure(c(4L, 4L, 9L, 7L, 9L, 1L, 6L, 2L, 6L, 5L, 5L, 7L, 5L, 8L, 4L, 5L, 2L, 6L, 8L, 1L, 6L, 5L, 6L, 8L, 9L, 3L, 9L, 8L, 6L, 9L), .Label = c("2001", "2003", "2005", "2007", "2009", "2011", "2013", "2015", "2017"), class = "factor"), Nationality = c(NA, NA, "Japan(JPN)", "Germany(GER)", "Netherlands(NED)", NA, "Kazakhstan(KAZ)", NA, "Belgium(BEL)", "Hungary", "France", "Belarus(BLR)", "Cuba", "Estonia(EST)", NA, "Czech Republic", NA, "Germany(GER)", "Netherlands(NED)", NA, "Netherlands(NED)", "New Zealand", "South Korea(KOR)", "Japan(JPN)", "France(FRA)", NA, "Estonia(EST)", "Japan(JPN)", "Czech Republic(CZE)", "Germany(GER)"), Notes = c(NA, NA, NA, "PB", NA, NA, NA, NA, NA, NA, "SB", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "PB", "NR", NA, "SB", NA, "PB", NA, NA, NA)), .Names = c("Rank", "Athlete", "Total", "100m", "LJ", "SP", "HJ", "400m", "110mh", "DT", "PV", "JT", "1500m", "Year", "Nationality", "Notes"), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 30 x 16
Rank Athlete Total `100m` LJ SP HJ `400m` `110mh` DT PV JT `1500m` Year Nationality Notes
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fctr> <chr> <chr>
1 15 François Gourmet(BEL) 7974 10.67 7.15 13.74 1.85 47.98 15.02 39.87 5.0 57.73 265.51 2007 <NA> <NA>
2 17 Agustín Félix(ESP) 7749 11.17 7.12 13.29 2.03 52.08 14.75 43.67 5.0 56.69 288.27 2007 <NA> <NA>
3 20 Keisuke Ushiro 7498 11.53 6.64 13.43 1.96 51.43 15.35 47.64 4.6 63.28 291.90 2017 Japan(JPN) <NA>
4 2 Michael Schrader 8670 10.73 7.85 14.56 1.99 47.66 14.29 46.44 5.0 65.67 265.38 2013 Germany(GER) PB
5 16 Pieter Braun 7890 11.22 7.17 14.48 1.93 48.54 14.67 42.59 4.7 59.26 278.40 2017 Netherlands(NED) <NA>
6 8 Laurent Hernu(FRA) 8280 10.97 7.31 14.43 2.03 49.31 14.01 43.93 5.1 59.90 277.41 2001 <NA> <NA>
7 21 Dmitriy Karpov 7550 11.24 6.86 15.69 1.93 52.01 14.64 47.10 4.8 46.91 298.41 2011 Kazakhstan(KAZ) <NA>
8 5 Laurent Hernu(FRA) 8218 11.20 7.22 13.99 2.03 48.95 14.15 46.13 4.9 59.63 268.40 2003 <NA> <NA>
9 13 Thomas van der Plaetsen 8069 11.20 7.79 12.76 2.17 49.46 14.79 37.20 5.1 58.91 285.86 2011 Belgium(BEL) <NA>
10 31 Attila Szabó 7610 11.15 7.09 13.92 1.84 49.79 14.65 43.75 4.4 59.56 285.64 2009 Hungary <NA>
# ... with 20 more rows
我的数据集中有两个字符向量 Athlete 和 Nationality,其中一些条目的末尾附有括号括起来的国家代码。我希望把这两个向量中的国家代码提取到一个新变量(比如 countrycode)中,同时去掉括号。我不确定拆分的最佳方法或语法是什么——也许用 separate 函数(tidyr::separate)?我也不确定在拆分时如何处理括号内的国家代码字符,以及如何处理某些条目根本不需要拆分的情况。
之后,我会用类似下面的代码从新变量中删除括号。
# Strip the parentheses from the country codes: the inner call removes every
# "(" and the outer call removes every ")" from its result.
sample$countrycode <- gsub(
  pattern = "\\)",
  replacement = "",
  x = gsub(pattern = "\\(", replacement = "", x = sample$countrycode)
)
谢谢
答案
希望这对你有用:
library(dplyr)

# Build `countrycode` from whichever column carries a parenthesised code.
# Branches are tried in order:
#   1. Nationality is NA and Athlete contains "(" -> text inside Athlete's
#      parentheses
#   2. Nationality contains "("                   -> text inside Nationality's
#      parentheses
#   3. anything else                              -> Nationality unchanged
#      (a plain country name, or NA)
res <- mutate(
  sample,
  countrycode = case_when(
    grepl('\\(', Athlete) & is.na(Nationality) ~
      gsub('.*?\\((.*)\\)', '\\1', Athlete),
    grepl('\\(', Nationality) ~
      gsub('.*?\\((.*)\\)', '\\1', Nationality),
    TRUE ~ Nationality
  )
)
样本输出:
# Show only the two source columns next to the derived code for comparison.
select(res, Athlete, Nationality, countrycode)
# # A tibble: 30 x 3
# Athlete Nationality countrycode
# <chr> <chr> <chr>
# 1 François Gourmet(BEL) NA BEL
# 2 Agustín Félix(ESP) NA ESP
# 3 Keisuke Ushiro Japan(JPN) JPN
# 4 Michael Schrader Germany(GER) GER
# 5 Pieter Braun Netherlands(NED) NED
# 6 Laurent Hernu(FRA) NA FRA
# 7 Dmitriy Karpov Kazakhstan(KAZ) KAZ
# 8 Laurent Hernu(FRA) NA FRA
# 9 Thomas van der Plaetsen Belgium(BEL) BEL
# 10 Attila Szabó Hungary Hungary
# # ... with 20 more rows
正如 Frank 在评论中指出的,删除 TRUE ~ Nationality 这一行,即可只提取国家代码:
# Same extraction as before but without a catch-all branch: rows where neither
# condition holds are left as NA by case_when(), so `countrycode` only ever
# contains text taken from inside parentheses.
mutate(
  sample,
  countrycode = case_when(
    grepl('\\(', Athlete) & is.na(Nationality) ~
      gsub('.*?\\((.*)\\)', '\\1', Athlete),
    grepl('\\(', Nationality) ~
      gsub('.*?\\((.*)\\)', '\\1', Nationality)
  )
)
另一答案
一种不太优雅的做法是使用 sub:
library(data.table)

# Convert the sample data to a data.table so `cc` can be assigned with `:=`.
DT <- data.table(sample)

# One pattern serves both columns: it captures exactly three characters
# enclosed in parentheses, and the replacement keeps only that capture group.
patt <- "^.*\\((.{3})\\).*$"
rp <- "\\1"

# Fill `cc` from Athlete first, then from Nationality. Only rows matching the
# pattern are touched, so a Nationality match overwrites an Athlete match and
# rows matching neither are left as NA.
DT[Athlete %like% patt, cc := sub(patt, rp, Athlete)]
DT[Nationality %like% patt, cc := sub(patt, rp, Nationality)]
如果您已经在使用 tidyverse 系列软件包,那么 stringr 包中的 str_extract 可能会更简洁。另外,如果想要上述代码的 dplyr 等价写法,可以看看 case_when 函数。(我对这些工具不太熟悉,写不出确切的语法。)
结果看起来像......
> DT[, .(Athlete, Nationality, cc)]
Athlete Nationality cc
1: François Gourmet(BEL) NA BEL
2: Agustín Félix(ESP) NA ESP
3: Keisuke Ushiro Japan(JPN) JPN
4: Michael Schrader Germany(GER) GER
5: Pieter Braun Netherlands(NED) NED
6: Laurent Hernu(FRA) NA FRA
7: Dmitriy Karpov Kazakhstan(KAZ) KAZ
8: Laurent Hernu(FRA) NA FRA
9: Thomas van der Plaetsen Belgium(BEL) BEL
10: Attila Szabó Hungary NA
11: Nadir El Fassi France NA
12: Eduard Mikhan Belarus(BLR) BLR
13: Leonel Suárez Cuba NA
14: Janek Õiglane Estonia(EST) EST
15: Hans van Alphen(BEL) NA BEL
16: Roman Šebrle Czech Republic NA
17: André Niklaus(GER) NA GER
18: Pascal Behrenbruch Germany(GER) GER
19: Pieter Braun Netherlands(NED) NED
20: Oleksandr Yurkov(UKR) NA UKR
21: Eelco Sintnicolaas Netherlands(NED) NED
22: Brent Newdick New Zealand NA
23: Kim Kun-woo South Korea(KOR) KOR
24: Akihiko Nakamura Japan(JPN) JPN
25: Bastien Auzeil France(FRA) FRA
26: Frédéric Xhonneux NA NA
27: Janek Õiglane Estonia(EST) EST
28: Keisuke Ushiro Japan(JPN) JPN
29: Roman Šebrle Czech Republic(CZE) CZE
30: Rico Freimuth Germany(GER) GER
Athlete Nationality cc
另一答案
这个简单的解决方案也有效
library(stringr)

# Extract the upper-case code enclosed in parentheses from each Nationality,
# e.g. "Japan(JPN)" -> "JPN".
#
# str_match() returns a character matrix: column 1 is the full match and
# column 2 is the first capture group. Entries that are NA, or that contain no
# parenthesised code, yield NA.
#
# The pattern is anchored on the parentheses on purpose: picking "the second
# run of capital letters" breaks on multi-word country names (for
# "Czech Republic(CZE)" the second run is "R", not "CZE").
data1$country_code <- str_match(data1$Nationality, "\\(([A-Z]+)\\)")[, 2]
Nationality country_code
1: NA NA
2: NA NA
3: Japan(JPN) JPN
4: Germany(GER) GER
5: Netherlands(NED) NED
6: NA NA
以上是关于将数据框中的两个混乱向量拆分为一个公共列的主要内容,如果未能解决你的问题,请参考以下文章