使用 R 从行中提取数据并创建新列
Extract data from rows creating new columns using R
我有一个看起来像这样的数据集:
数据集代码:
# Example data: one row per employee, with up to four (leader name, leader
# level) pairs stored in wide "LeaderN Name" / "LeaderN Level" columns.
# NOTE(review): the third `Leader3 Name` entry is the quoted string "NA", not a
# missing value -- confirm that this is intended.
sample <- structure(
  list(
    `Employee ID` = c(200, 201, 202, 203, 204, 205),
    `Leader1 Name` = rep("CH", 6),
    `Leader1 Level` = rep("Founder", 6),
    `Leader2 Name` = c("HF", "SR", "HF", "HF", "AK", "UT"),
    `Leader2 Level` = c("CEO", "VP", "CEO", "CEO", "Exec", "Exec"),
    `Leader3 Name` = c("KK", NA, "NA", NA, "TR", NA),
    `Leader3 Level` = c("VP", NA, "VP", NA, "VP", NA),
    `Leader4 Name` = c("EQ", NA, "YY", NA, NA, NA),
    `Leader4 Level` = c("Director", NA, "VP", NA, NA, NA)
  ),
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)
我想知道每个员工在每个层级中的领导姓名。所以我期待的输出是这样的:
我的方法是使用分隔符连接所有领导者姓名和级别,然后去掉每个级别
# Helper: for each string, keep only the text that precedes the first
# occurrence of `marker` (fixed-string match, not a regex).
text_before <- function(x, marker) {
  unlist(lapply(strsplit(x, marker, fixed = TRUE), `[`, 1))
}

# Helper: for each string, return the last "~~"-delimited field.
last_field <- function(x) {
  unlist(lapply(strsplit(x, "~~", fixed = TRUE), tail, 1))
}

# Join every leader name/level pair of a row into one "~~"-separated string.
sample$AllLevels <- paste(
  sample$`Leader1 Name`, sample$`Leader1 Level`,
  sample$`Leader2 Name`, sample$`Leader2 Level`,
  sample$`Leader3 Name`, sample$`Leader3 Level`,
  sample$`Leader4 Name`, sample$`Leader4 Level`,
  sep = "~~"
)

# Truncate the joined string step by step, each time cutting at the next level
# label. When a label does not occur in a row, the string is left unchanged, so
# that level later yields the same trailing name as the previous step.
sample$DirectorLevel <- text_before(sample$AllLevels, "Director")
sample$VPLevel <- text_before(sample$DirectorLevel, "VP")
sample$ExecLevel <- text_before(sample$VPLevel, "Exec")
sample$CEOLevel <- text_before(sample$ExecLevel, "CEO")
sample$FounderLevel <- text_before(sample$CEOLevel, "Founder")

# The last field of each truncated string is taken as that level's leader name.
sample$Director <- last_field(sample$DirectorLevel)
sample$VP <- last_field(sample$VPLevel)
sample$Exec <- last_field(sample$ExecLevel)
sample$CEO <- last_field(sample$CEOLevel)
sample$Founder <- last_field(sample$FounderLevel)

# Keep the first column plus columns 16-20 (the five per-level name columns
# created just above); this relies on the column creation order.
sample <- sample[, c(1, 16:20)]
这是我得到的输出。
这种方法的问题是:如果中间缺少某个级别的领导者,得到的不是 NA,而是下一位领导者的姓名。请参阅员工 ID 200 所在的第 1 行:VP "KK" 在 Exec 级别也重复出现了。其他行也有同样的问题。
是否有更好的方法来获得所需的结果?
这是使用 tidyverse 的一个选项(pivot_longer 来自 tidyr 的开发版本,安装信息见 here):先用 pivot_longer 将数据重塑为 'long' 格式,经过一些预处理后,再用 spread 转换回 'wide' 格式。
library(dplyr)
library(tidyr)# ‘0.8.3.9000’
library(stringr)
# Level labels used as the factor levels of `Level` in the pipeline below.
lvls <- c("Director", "VP", "Exec", "CEO", "Founder")

sample %>%
  # Swap the two words of every column name except the first, e.g.
  # "Leader1 Name" -> "Name_Leader1". Backslashes must be doubled inside an R
  # string literal: "\S" is an unrecognized escape (parse error) and '\2' would
  # be read as an octal character rather than a regex backreference.
  rename_at(-1, ~ str_replace(., "(\\S+) (\\S+)", "\\2_\\1")) %>%
  # ".value" turns the part before "_" (Name / Level) into output columns; the
  # part after it (Leader1 ... Leader4) is stored in `group`.
  pivot_longer(-`Employee ID`, names_to = c(".value", "group"), names_sep = "_") %>%
  # Drop rows containing missing values.
  na.omit %>%
  select(-group) %>%
  group_by(`Employee ID`, Level = factor(Level, levels = lvls)) %>%
  # Number repeated (employee, level) pairs so that spread() gets unique keys
  # and a second leader at the same level lands on its own row.
  mutate(rn = row_number()) %>%
  spread(Level, Name) %>%
  select(-rn)
# A tibble: 7 x 6
# Groups: Employee ID [6]
# `Employee ID` Director VP Exec CEO Founder
# <dbl> <chr> <chr> <chr> <chr> <chr>
#1 200 EQ KK <NA> HF CH
#2 201 <NA> SR <NA> <NA> CH
#3 202 <NA> NA <NA> HF CH
#4 202 <NA> YY <NA> <NA> <NA>
#5 203 <NA> <NA> <NA> HF CH
#6 204 <NA> TR AK <NA> CH
#7 205 <NA> <NA> UT <NA> CH
注意:OP 输出中提到的重复步骤已更正
或使用 data.table
中的 melt/dcast
library(data.table)
# Reshape with data.table: melt the Level/Name column pairs to long form, then
# cast back to wide with one column per level.
dcast(
  melt(
    setDT(sample),  # converts `sample` to a data.table by reference
    # Two patterns give two value columns; the code below treats value1 as the
    # level and value2 as the name.
    measure.vars = patterns("Level", "Name"),
    na.rm = TRUE
  )[, value1 := factor(value1, levels = lvls)],
  # rowid() numbers repeated (employee, level) pairs so that they stay on
  # separate rows instead of being aggregated.
  `Employee ID` + rowid(`Employee ID`, value1) ~ value1,
  # dcast() takes `value.var`; `value.name` is a melt() argument and would be
  # swallowed by `...`, leaving the value column to be guessed.
  value.var = "value2"
)[, `Employee ID_1` := NULL][]  # drop the helper rowid column, then print
#. Employee ID Director VP Exec CEO Founder
#1: 200 EQ KK <NA> HF CH
#2: 201 <NA> SR <NA> <NA> CH
#3: 202 <NA> NA <NA> HF CH
#4: 202 <NA> YY <NA> <NA> <NA>
#5: 203 <NA> <NA> <NA> HF CH
#6: 204 <NA> TR AK <NA> CH
#7: 205 <NA> <NA> UT <NA> CH
我有一个看起来像这样的数据集:
数据集代码:
# Example data: one row per employee, with up to four (leader name, leader
# level) pairs stored in wide "LeaderN Name" / "LeaderN Level" columns.
# NOTE(review): the third `Leader3 Name` entry is the quoted string "NA", not a
# missing value -- confirm that this is intended.
sample <- structure(
  list(
    `Employee ID` = c(200, 201, 202, 203, 204, 205),
    `Leader1 Name` = rep("CH", 6),
    `Leader1 Level` = rep("Founder", 6),
    `Leader2 Name` = c("HF", "SR", "HF", "HF", "AK", "UT"),
    `Leader2 Level` = c("CEO", "VP", "CEO", "CEO", "Exec", "Exec"),
    `Leader3 Name` = c("KK", NA, "NA", NA, "TR", NA),
    `Leader3 Level` = c("VP", NA, "VP", NA, "VP", NA),
    `Leader4 Name` = c("EQ", NA, "YY", NA, NA, NA),
    `Leader4 Level` = c("Director", NA, "VP", NA, NA, NA)
  ),
  row.names = c(NA, -6L),
  class = c("tbl_df", "tbl", "data.frame")
)
我想知道每个员工在每个层级中的领导姓名。所以我期待的输出是这样的:
我的方法是使用分隔符连接所有领导者姓名和级别,然后去掉每个级别
# Helper: for each string, keep only the text that precedes the first
# occurrence of `marker` (fixed-string match, not a regex).
text_before <- function(x, marker) {
  unlist(lapply(strsplit(x, marker, fixed = TRUE), `[`, 1))
}

# Helper: for each string, return the last "~~"-delimited field.
last_field <- function(x) {
  unlist(lapply(strsplit(x, "~~", fixed = TRUE), tail, 1))
}

# Join every leader name/level pair of a row into one "~~"-separated string.
sample$AllLevels <- paste(
  sample$`Leader1 Name`, sample$`Leader1 Level`,
  sample$`Leader2 Name`, sample$`Leader2 Level`,
  sample$`Leader3 Name`, sample$`Leader3 Level`,
  sample$`Leader4 Name`, sample$`Leader4 Level`,
  sep = "~~"
)

# Truncate the joined string step by step, each time cutting at the next level
# label. When a label does not occur in a row, the string is left unchanged, so
# that level later yields the same trailing name as the previous step.
sample$DirectorLevel <- text_before(sample$AllLevels, "Director")
sample$VPLevel <- text_before(sample$DirectorLevel, "VP")
sample$ExecLevel <- text_before(sample$VPLevel, "Exec")
sample$CEOLevel <- text_before(sample$ExecLevel, "CEO")
sample$FounderLevel <- text_before(sample$CEOLevel, "Founder")

# The last field of each truncated string is taken as that level's leader name.
sample$Director <- last_field(sample$DirectorLevel)
sample$VP <- last_field(sample$VPLevel)
sample$Exec <- last_field(sample$ExecLevel)
sample$CEO <- last_field(sample$CEOLevel)
sample$Founder <- last_field(sample$FounderLevel)

# Keep the first column plus columns 16-20 (the five per-level name columns
# created just above); this relies on the column creation order.
sample <- sample[, c(1, 16:20)]
这是我得到的输出。
这种方法的问题是:如果中间缺少某个级别的领导者,得到的不是 NA,而是下一位领导者的姓名。请参阅员工 ID 200 所在的第 1 行:VP "KK" 在 Exec 级别也重复出现了。其他行也有同样的问题。
是否有更好的方法来获得所需的结果?
这是使用 tidyverse 的一个选项(pivot_longer 来自 tidyr 的开发版本,安装信息见 here):先用 pivot_longer 将数据重塑为 'long' 格式,经过一些预处理后,再用 spread 转换回 'wide' 格式。
library(dplyr)
library(tidyr)# ‘0.8.3.9000’
library(stringr)
# Level labels used as the factor levels of `Level` in the pipeline below.
lvls <- c("Director", "VP", "Exec", "CEO", "Founder")

sample %>%
  # Swap the two words of every column name except the first, e.g.
  # "Leader1 Name" -> "Name_Leader1". Backslashes must be doubled inside an R
  # string literal: "\S" is an unrecognized escape (parse error) and '\2' would
  # be read as an octal character rather than a regex backreference.
  rename_at(-1, ~ str_replace(., "(\\S+) (\\S+)", "\\2_\\1")) %>%
  # ".value" turns the part before "_" (Name / Level) into output columns; the
  # part after it (Leader1 ... Leader4) is stored in `group`.
  pivot_longer(-`Employee ID`, names_to = c(".value", "group"), names_sep = "_") %>%
  # Drop rows containing missing values.
  na.omit %>%
  select(-group) %>%
  group_by(`Employee ID`, Level = factor(Level, levels = lvls)) %>%
  # Number repeated (employee, level) pairs so that spread() gets unique keys
  # and a second leader at the same level lands on its own row.
  mutate(rn = row_number()) %>%
  spread(Level, Name) %>%
  select(-rn)
# A tibble: 7 x 6
# Groups: Employee ID [6]
# `Employee ID` Director VP Exec CEO Founder
# <dbl> <chr> <chr> <chr> <chr> <chr>
#1 200 EQ KK <NA> HF CH
#2 201 <NA> SR <NA> <NA> CH
#3 202 <NA> NA <NA> HF CH
#4 202 <NA> YY <NA> <NA> <NA>
#5 203 <NA> <NA> <NA> HF CH
#6 204 <NA> TR AK <NA> CH
#7 205 <NA> <NA> UT <NA> CH
注意:OP 输出中提到的重复步骤已更正
或使用 data.table
melt/dcast
library(data.table)
# Reshape with data.table: melt the Level/Name column pairs to long form, then
# cast back to wide with one column per level.
dcast(
  melt(
    setDT(sample),  # converts `sample` to a data.table by reference
    # Two patterns give two value columns; the code below treats value1 as the
    # level and value2 as the name.
    measure.vars = patterns("Level", "Name"),
    na.rm = TRUE
  )[, value1 := factor(value1, levels = lvls)],
  # rowid() numbers repeated (employee, level) pairs so that they stay on
  # separate rows instead of being aggregated.
  `Employee ID` + rowid(`Employee ID`, value1) ~ value1,
  # dcast() takes `value.var`; `value.name` is a melt() argument and would be
  # swallowed by `...`, leaving the value column to be guessed.
  value.var = "value2"
)[, `Employee ID_1` := NULL][]  # drop the helper rowid column, then print
#. Employee ID Director VP Exec CEO Founder
#1: 200 EQ KK <NA> HF CH
#2: 201 <NA> SR <NA> <NA> CH
#3: 202 <NA> NA <NA> HF CH
#4: 202 <NA> YY <NA> <NA> <NA>
#5: 203 <NA> <NA> <NA> HF CH
#6: 204 <NA> TR AK <NA> CH
#7: 205 <NA> <NA> UT <NA> CH