通过将值与列名称匹配来填充 data.frame
fill data.frame by matching value with column name
我有两个data.frames:
df1(空,但有特定的列名)
apple orange banana pear grape
0 0 0 0 0
df2
fruit1 count1 fruit2 count2
apple 2 pear 1
grape 4 orange 2
banana 1 NA NA
这是我想要的输出:
apples oranges bananas pears grapes
2 0 0 1 0
0 2 0 0 4
0 0 1 0 0
我考虑过按照以下方式做一些事情:
for f (in range(nrow(df2))){
for (i in range(ncol(df1))){
if(fruit1==columnName[i]){
df1[f,i]<-count1
ect...
但是,我正在处理一个相当大的数据集,这似乎不是正确的方法。
可能有更优雅的解决方案,但这里给出一种可行的做法。思路是:先把 df2 转换为长格式,再把它连接(join)到一个包含每一行所有可能水果的长格式 data.frame 上,并将缺失值替换为零,最后把数据从长格式转换回宽格式。
# Reshape df2 (pairs of fruitN/countN columns) into a wide table with one row
# per input row and one count column per observed fruit, filling absent
# fruits with zero.

# assemble df2 (fruit columns are factors; row 3 has no second fruit)
df2 <- structure(list(fruit1 = structure(c(1L, 3L, 2L),
                                         .Label = c("apple", "banana", "grape"),
                                         class = "factor"),
                      count1 = c(2L, 4L, 1L),
                      fruit2 = structure(c(2L, 1L, NA),
                                         .Label = c("orange", "pear"),
                                         class = "factor"),
                      count2 = c(1L, 2L, NA)),
                 .Names = c("fruit1", "count1", "fruit2", "count2"),
                 class = "data.frame", row.names = c(NA, -3L))

# add a column for row numbers; seq_len() stays correct for a zero-row df2,
# where 1:nrow(df2) would yield c(1, 0)
df2$r_number <- seq_len(nrow(df2))

# load libraries
library(tidyr)
library(dplyr)

# assemble a data.frame in long form by row numbers and fruits;
# "fruit1"/"fruit2" are relabelled "key1"/"key2" so that they can be matched
# against the corresponding count columns below
fruit <- df2 %>%
  select(starts_with("fruit"), r_number) %>%
  gather(condition, fruits, starts_with("fruit")) %>%
  mutate(condition = gsub("fruit", "key", condition)) %>%
  filter(!is.na(fruits))

# assemble a data.frame in long form by row numbers and counts;
# "count1"/"count2" get the same "key1"/"key2" labels, missing counts become 0
count <- df2 %>%
  select(starts_with("count"), r_number) %>%
  gather(condition, counts, starts_with("count")) %>%
  mutate(condition = gsub("count", "key", condition),
         counts = ifelse(is.na(counts), 0, counts))

# join counts onto fruits.
# Plain assignment is used instead of magrittr's `%<>%`: that operator is not
# made available by library(dplyr) or library(tidyr), so it would fail here.
fruit <- fruit %>%
  left_join(count, by = c("condition", "r_number")) %>%
  mutate(fruits = paste0(fruits, "s")) %>%
  select(-condition)

# create data.frame for all possible row numbers and fruits
# (every row number crossed with every observed fruit name; kept as character
# so the join key has the same type as fruit$fruits)
all_possibles <- expand.grid(r_number = seq_len(nrow(df2)),
                             fruits = unique(fruit$fruits),
                             KEEP.OUT.ATTRS = FALSE,
                             stringsAsFactors = FALSE)

# join actual fruits/counts onto all possible fruits and counts
# and shift data from long to wide
output <- all_possibles %>%
  left_join(fruit, by = c("r_number", "fruits")) %>%
  mutate(counts = ifelse(is.na(counts), 0, counts)) %>%
  spread(fruits, counts) %>%
  select(-r_number)
也可以考虑下面这个只使用基础 R(base R)进行数据整理的解决方案:
# Base R solution: fill a zero matrix shaped like df1 with the counts from
# df2, using vectorised matrix indexing instead of building and rbind()-ing
# one data.frame per row.

# template: only its column names are used; they define which fruits appear
# in the result and in which order
df1 <- data.frame(apple = 0, orange = 0, banana = 0, pear = 0, grape = 0)

df2 <- read.table(text="apple 2 pear 1
grape 4 orange 2
banana 1 NA NA",
                  col.names = c("fruit1", "count1", "fruit2", "count2"),
                  stringsAsFactors = FALSE)

# pair every fruitN column with its countN column, so any number of
# fruit/count pairs is handled (not just the two in this example)
fruit_cols <- grep("^fruit", names(df2), value = TRUE)
count_cols <- sub("^fruit", "count", fruit_cols)
stopifnot(all(count_cols %in% names(df2)))

# stack the pairs column by column into long vectors; row_idx records the
# df2 row each stacked element came from
fruit_long <- unlist(lapply(df2[fruit_cols], as.character), use.names = FALSE)
count_long <- unlist(df2[count_cols], use.names = FALSE)
row_idx <- rep(seq_len(nrow(df2)), times = length(fruit_cols))

# target column of each entry; NA for missing fruits and for fruits that are
# not columns of df1, both of which are skipped
col_idx <- match(fruit_long, names(df1))
keep <- !is.na(col_idx)

# start from all zeros and write each count into its (row, fruit) cell
filled <- matrix(0, nrow = nrow(df2), ncol = ncol(df1),
                 dimnames = list(NULL, paste0(names(df1), "s")))
filled[cbind(row_idx[keep], col_idx[keep])] <- count_long[keep]

finaldf <- as.data.frame(filled)
finaldf
#   apples oranges bananas pears grapes
# 1      2       0       0     1      0
# 2      0       2       0     0      4
# 3      0       0       1     0      0
这是使用 dplyr 和 tidyr 的另一种方法,代码行数更少,但它没有用到 df1。
library(dplyr)
library(tidyr)

# Create Data Frame
# (fruit names are kept as character so the two fruit columns can be stacked
# without factor-level conflicts on R versions that default to factors)
df2 <- data.frame(fruit1 = c("apple", "grape", "banana"),
                  count1 = c(2, 4, 1),
                  fruit2 = c("pear", "orange", NA),
                  count2 = c(1, 2, NA),
                  stringsAsFactors = FALSE)

# Add row numbers (derived from the data instead of a hard-coded 1:3)
df2$row_num <- seq_len(nrow(df2))

# Stack the (fruit1, count1) pairs on top of the (fruit2, count2) pairs,
# then give every fruit its own column with 0 where it does not occur.
# Note: a row whose fruits are all NA is removed by the filter() step and
# therefore does not appear in the result.
final <- df2 %>%
  select(fruit1, count1, row_num) %>%                # first fruit/count pair
  rename(fruit2 = fruit1, count2 = count1) %>%       # align names with second pair
  bind_rows(select(df2, fruit2, count2, row_num)) %>% # append second pair, by name
  filter(!is.na(fruit2)) %>%                         # remove rows without a fruit
  spread(fruit2, count2, fill = 0) %>%               # one column per fruit
  select(-row_num)                                   # remove the row numbers
一个data.table选项:
library(data.table)

# df1 (template whose column names list the fruits) and df2 (fruitN/countN
# pairs) are expected to exist already; setDT() converts df2 to a data.table
setDT(df2)

# add row number
df2[, r := .I]

# "melt" common columns together: all columns matching "fruit" go into one
# `fruit` column and all columns matching "count" into one `count` column
cols <- c("fruit", "count")
m2 <- melt(df2, measure.vars = patterns(cols), value.name = cols)

# add unobserved fruits, if any: the factor levels come from df1, so fruits
# that never occur in df2 still get a column, and fruits absent from df1
# become NA
m2[, fruit := factor(fruit, levels = names(df1))]

# "cast" each fruit to its own column, ignoring NA rows;
# value.var is named explicitly instead of relying on dcast's guess, and
# drop = FALSE keeps columns for factor levels that have no data
dcast(m2[!is.na(fruit)], r ~ fruit, value.var = "count", fill = 0L,
      drop = FALSE)
r apple orange banana pear grape
1: 1 2 0 0 1 0
2: 2 0 2 0 0 4
3: 3 0 0 1 0 0
这里使用 factor,是为了处理 df1 中存在、而 df2 中没有出现的额外水平(levels,即水果列)的情况。如果某一行的 fruit1 和 fruit2 都为空(NA),你需要自行决定如何扩展这种方法。
我有两个data.frames:
df1(空,但有特定的列名)
apple orange banana pear grape
0 0 0 0 0
df2
fruit1 count1 fruit2 count2
apple 2 pear 1
grape 4 orange 2
banana 1 NA NA
这是我想要的输出:
apples oranges bananas pears grapes
2 0 0 1 0
0 2 0 0 4
0 0 1 0 0
我考虑过按照以下方式做一些事情:
for f (in range(nrow(df2))){
for (i in range(ncol(df1))){
if(fruit1==columnName[i]){
df1[f,i]<-count1
ect...
但是,我正在处理一个相当大的数据集,这似乎不是正确的方法。
可能有更优雅的解决方案,但这里给出一种可行的做法。思路是:先把 df2 转换为长格式,再把它连接(join)到一个包含每一行所有可能水果的长格式 data.frame 上,并将缺失值替换为零,最后把数据从长格式转换回宽格式。
# Reshape df2 (pairs of fruitN/countN columns) into a wide table with one row
# per input row and one count column per observed fruit, filling absent
# fruits with zero.

# assemble df2 (fruit columns are factors; row 3 has no second fruit)
df2 <- structure(list(fruit1 = structure(c(1L, 3L, 2L),
                                         .Label = c("apple", "banana", "grape"),
                                         class = "factor"),
                      count1 = c(2L, 4L, 1L),
                      fruit2 = structure(c(2L, 1L, NA),
                                         .Label = c("orange", "pear"),
                                         class = "factor"),
                      count2 = c(1L, 2L, NA)),
                 .Names = c("fruit1", "count1", "fruit2", "count2"),
                 class = "data.frame", row.names = c(NA, -3L))

# add a column for row numbers; seq_len() stays correct for a zero-row df2,
# where 1:nrow(df2) would yield c(1, 0)
df2$r_number <- seq_len(nrow(df2))

# load libraries
library(tidyr)
library(dplyr)

# assemble a data.frame in long form by row numbers and fruits;
# "fruit1"/"fruit2" are relabelled "key1"/"key2" so that they can be matched
# against the corresponding count columns below
fruit <- df2 %>%
  select(starts_with("fruit"), r_number) %>%
  gather(condition, fruits, starts_with("fruit")) %>%
  mutate(condition = gsub("fruit", "key", condition)) %>%
  filter(!is.na(fruits))

# assemble a data.frame in long form by row numbers and counts;
# "count1"/"count2" get the same "key1"/"key2" labels, missing counts become 0
count <- df2 %>%
  select(starts_with("count"), r_number) %>%
  gather(condition, counts, starts_with("count")) %>%
  mutate(condition = gsub("count", "key", condition),
         counts = ifelse(is.na(counts), 0, counts))

# join counts onto fruits.
# Plain assignment is used instead of magrittr's `%<>%`: that operator is not
# made available by library(dplyr) or library(tidyr), so it would fail here.
fruit <- fruit %>%
  left_join(count, by = c("condition", "r_number")) %>%
  mutate(fruits = paste0(fruits, "s")) %>%
  select(-condition)

# create data.frame for all possible row numbers and fruits
# (every row number crossed with every observed fruit name; kept as character
# so the join key has the same type as fruit$fruits)
all_possibles <- expand.grid(r_number = seq_len(nrow(df2)),
                             fruits = unique(fruit$fruits),
                             KEEP.OUT.ATTRS = FALSE,
                             stringsAsFactors = FALSE)

# join actual fruits/counts onto all possible fruits and counts
# and shift data from long to wide
output <- all_possibles %>%
  left_join(fruit, by = c("r_number", "fruits")) %>%
  mutate(counts = ifelse(is.na(counts), 0, counts)) %>%
  spread(fruits, counts) %>%
  select(-r_number)
也可以考虑下面这个只使用基础 R(base R)进行数据整理的解决方案:
# Base R solution: fill a zero matrix shaped like df1 with the counts from
# df2, using vectorised matrix indexing instead of building and rbind()-ing
# one data.frame per row.

# template: only its column names are used; they define which fruits appear
# in the result and in which order
df1 <- data.frame(apple = 0, orange = 0, banana = 0, pear = 0, grape = 0)

df2 <- read.table(text="apple 2 pear 1
grape 4 orange 2
banana 1 NA NA",
                  col.names = c("fruit1", "count1", "fruit2", "count2"),
                  stringsAsFactors = FALSE)

# pair every fruitN column with its countN column, so any number of
# fruit/count pairs is handled (not just the two in this example)
fruit_cols <- grep("^fruit", names(df2), value = TRUE)
count_cols <- sub("^fruit", "count", fruit_cols)
stopifnot(all(count_cols %in% names(df2)))

# stack the pairs column by column into long vectors; row_idx records the
# df2 row each stacked element came from
fruit_long <- unlist(lapply(df2[fruit_cols], as.character), use.names = FALSE)
count_long <- unlist(df2[count_cols], use.names = FALSE)
row_idx <- rep(seq_len(nrow(df2)), times = length(fruit_cols))

# target column of each entry; NA for missing fruits and for fruits that are
# not columns of df1, both of which are skipped
col_idx <- match(fruit_long, names(df1))
keep <- !is.na(col_idx)

# start from all zeros and write each count into its (row, fruit) cell
filled <- matrix(0, nrow = nrow(df2), ncol = ncol(df1),
                 dimnames = list(NULL, paste0(names(df1), "s")))
filled[cbind(row_idx[keep], col_idx[keep])] <- count_long[keep]

finaldf <- as.data.frame(filled)
finaldf
#   apples oranges bananas pears grapes
# 1      2       0       0     1      0
# 2      0       2       0     0      4
# 3      0       0       1     0      0
这是使用 dplyr 和 tidyr 的另一种方法,代码行数更少,但它没有用到 df1。
library(dplyr)
library(tidyr)

# Create Data Frame
# (fruit names are kept as character so the two fruit columns can be stacked
# without factor-level conflicts on R versions that default to factors)
df2 <- data.frame(fruit1 = c("apple", "grape", "banana"),
                  count1 = c(2, 4, 1),
                  fruit2 = c("pear", "orange", NA),
                  count2 = c(1, 2, NA),
                  stringsAsFactors = FALSE)

# Add row numbers (derived from the data instead of a hard-coded 1:3)
df2$row_num <- seq_len(nrow(df2))

# Stack the (fruit1, count1) pairs on top of the (fruit2, count2) pairs,
# then give every fruit its own column with 0 where it does not occur.
# Note: a row whose fruits are all NA is removed by the filter() step and
# therefore does not appear in the result.
final <- df2 %>%
  select(fruit1, count1, row_num) %>%                # first fruit/count pair
  rename(fruit2 = fruit1, count2 = count1) %>%       # align names with second pair
  bind_rows(select(df2, fruit2, count2, row_num)) %>% # append second pair, by name
  filter(!is.na(fruit2)) %>%                         # remove rows without a fruit
  spread(fruit2, count2, fill = 0) %>%               # one column per fruit
  select(-row_num)                                   # remove the row numbers
一个data.table选项:
library(data.table)

# df1 (template whose column names list the fruits) and df2 (fruitN/countN
# pairs) are expected to exist already; setDT() converts df2 to a data.table
setDT(df2)

# add row number
df2[, r := .I]

# "melt" common columns together: all columns matching "fruit" go into one
# `fruit` column and all columns matching "count" into one `count` column
cols <- c("fruit", "count")
m2 <- melt(df2, measure.vars = patterns(cols), value.name = cols)

# add unobserved fruits, if any: the factor levels come from df1, so fruits
# that never occur in df2 still get a column, and fruits absent from df1
# become NA
m2[, fruit := factor(fruit, levels = names(df1))]

# "cast" each fruit to its own column, ignoring NA rows;
# value.var is named explicitly instead of relying on dcast's guess, and
# drop = FALSE keeps columns for factor levels that have no data
dcast(m2[!is.na(fruit)], r ~ fruit, value.var = "count", fill = 0L,
      drop = FALSE)
r apple orange banana pear grape
1: 1 2 0 0 1 0
2: 2 0 2 0 0 4
3: 3 0 0 1 0 0
这里使用 factor,是为了处理 df1 中存在、而 df2 中没有出现的额外水平(levels,即水果列)的情况。如果某一行的 fruit1 和 fruit2 都为空(NA),你需要自行决定如何扩展这种方法。