向数据框添加新行(其中嵌套数据框)时出现问题
Issue when adding new rows (with nested dataframes within) to a dataframe
我在 R 中处理复杂的 JSON 文件(通过使用 jsonlite 包),我需要将行附加到包含嵌套数据帧的数据帧。
下面是原始数据框 df 的结构(它只有一条观测,我想在它后面追加新行),以及我希望添加的新行 new_element 的结构。
> str(df)
$ :'data.frame': 1 obs. of 3 variables:
..$ created_at : chr "2017-10-20 08:55:13"
..$ status : chr "Done"
..$ validation_results:'data.frame': 1 obs. of 4 variables:
.. ..$ conversions : chr "OK"
.. ..$ boxes_shipped : chr "OK"
.. ..$ outlook_history: chr "OK"
.. ..$ write2db : chr "OK"
> str(new_element)
'data.frame': 1 obs. of 3 variables:
$ created_at : chr "2017-10-20 09:20:30"
$ status : chr "Pending"
$ validation_results:'data.frame': 1 obs. of 4 variables:
..$ conversions : chr "NEW"
..$ boxes_shipped : chr "NEW"
..$ outlook_history: chr "NEW"
..$ write2db : chr "NEW"
问题来了。当我尝试将 new_element 添加到 df 时:
df <- rbind(df, new_element)
我收到这个错误:
Error in `row.names<-.data.frame`(`*tmp*`, value = value) :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique values when setting 'row.names': ‘1’
至于名字:
> names(df)
[1] "created_at" "status" "validation_results"
> names(new_element)
[1] "created_at" "status" "validation_results"
有人知道实际发生了什么吗?
编辑:
应@Moody_Mudskipper 的要求,我添加了 dput 输出。
> dput(df)
structure(list(created_at = "2017-10-20 12:15:52", status = "Done",
validation_results = structure(list(conversions = "OK",
boxes_shipped = "OK", outlook_history = "OK", write2db = "OK"), .Names = c("conversions",
"boxes_shipped", "outlook_history", "write2db"), row.names = c(NA,
-1L), class = "data.frame")), .Names = c("created_at", "status",
"validation_results"), row.names = 1L, class = "data.frame")
> dput(new_element)
structure(list(created_at = "2017-10-20 12:16:12", status = "Pending",
validation_results = structure(list(conversions = "NEW",
boxes_shipped = "NEW", outlook_history = "NEW", write2db = "NEW"), .Names = c("conversions",
"boxes_shipped", "outlook_history", "write2db"), row.names = c(NA,
-1L), class = "data.frame")), .Names = c("created_at", "status",
"validation_results"), row.names = 1L, class = "data.frame")
编辑2:
这是我期待获得的结果数据帧输出,可以使用 jsonlite 包将其转换为 JSON 文件。
> str(result)
'data.frame': 2 obs. of 3 variables:
$ created_at : chr "2017-10-20 08:55:13" "2017-10-20 09:20:30"
$ status : chr "Done" "Pending"
$ validation_results:'data.frame': 2 obs. of 4 variables:
..$ conversions : chr "OK" "NEW"
..$ boxes_shipped : chr "OK" "NEW"
..$ outlook_history: chr "OK" "NEW"
..$ write2db : chr "OK" "NEW"
> dput(result)
structure(list(created_at = c("2017-10-20 08:55:13", "2017-10-20
09:20:30"), status = c("Done", "Pending"), validation_results = structure(list(
conversions = c("OK", "NEW"), boxes_shipped = c("OK", "NEW"
), outlook_history = c("OK", "NEW"), write2db =
c("OK", "NEW")), .Names = c("conversions", "boxes_shipped",
"outlook_history", "write2db"), class = "data.frame", row.names =
1:2)), .Names = c("created_at",
"status", "validation_results"), class = "data.frame", row.names =
1:2)
编辑3:
> result <- read_from_JSON('output.json')
> jsonlite::toJSON(result, pretty = TRUE)
[
{
"created_at": "2017-10-20 08:55:13",
"status": "Done",
"validation_results": {
"conversions": "OK",
"boxes_shipped": "OK",
"outlook_history": "OK",
"write2db": "OK"
}
},
{
"created_at": "2017-10-20 09:20:30",
"status": "Pending",
"validation_results": {
"conversions": "NEW",
"boxes_shipped": "NEW",
"outlook_history": "NEW",
"write2db": "NEW"
}
}
]
> str(result)
'data.frame': 2 obs. of 3 variables:
$ created_at : chr "2017-10-20 08:55:13" "2017-10-20 09:20:30"
$ status : chr "Done" "Pending"
$ validation_results:'data.frame': 2 obs. of 4 variables:
..$ conversions : chr "OK" "NEW"
..$ boxes_shipped : chr "OK" "NEW"
..$ outlook_history: chr "OK" "NEW"
..$ write2db : chr "OK" "NEW"
仔细检查 new_element 和 result[2,] 之间的区别之后:
# Set the row name of the nested validation_results data frame in new_element
# to 2 (presumably so it no longer clashes with row name 1 of df's nested data
# frame, which is what the "duplicate 'row.names'" error complains about).
new_element <- within(new_element,rownames(validation_results) <- 2)
# With the nested row name changed, the plain rbind() call is attempted again.
new_df <- rbind(df,new_element)
# Compare the combined data frame with the expected result as R objects.
identical(new_df,result)
# FALSE -> not all mysteries are solved
# Compare the JSON serializations instead (toJSON() is presumably
# jsonlite::toJSON, so jsonlite must be attached -- TODO confirm).
identical(toJSON(new_df,pretty=TRUE),toJSON(result,pretty=TRUE))
# TRUE -> but that should be good enough
编辑:
一个基于 @Parfait 回答的通用解决方案:它可以对任意数量、结构相同的 data.frame 执行 rbind,而且不需要事先知道各列的类:
library(dplyr)
library(purrr)
#' Row-bind data frames that contain nested data-frame columns.
#'
#' Columns that are not data frames are combined with a single rbind() call.
#' Each nested data-frame column is row-bound on its own and then attached to
#' the combined result, which keeps the column order of the first argument.
#'
#' @param ... Data frames sharing the same column names. The first one decides
#'   which columns are treated as nested data frames.
#' @return A data frame holding the rows of all inputs, in argument order.
rbind2 <- function(...) {
  frames <- list(...)
  if (length(frames) == 0L) {
    stop("rbind2() needs at least one data frame", call. = FALSE)
  }
  template <- frames[[1L]]
  # vapply() always returns a logical vector (sapply() returns list() for a
  # zero-column input); inherits() is the idiomatic class test.
  is_nested <- vapply(
    template,
    function(column) inherits(column, "data.frame"),
    logical(1L)
  )
  flat_names <- names(template)[!is_nested]
  nested_names <- names(template)[is_nested]
  # Columns that are not data frames: simple rbind. Selecting by name keeps
  # this correct even when later inputs order their columns differently.
  combined <- do.call(rbind, lapply(frames, `[`, flat_names))
  for (column_name in nested_names) {
    combined[[column_name]] <- do.call(rbind, lapply(frames, `[[`, column_name))
  }
  combined[names(template)]
}
new_df <- rbind2(df,new_element)
identical(new_df,result) # TRUE
你可以试试这个,例如:
rbind2(result,result)
旧答案:
源数据的格式有点别扭,所以我们先做一些数据整理,把 validation_results 列设置为 data.frame 的列表(list-column)。
library(dplyr)
library(purrr)
# Wrap the nested validation_results data frame of each input in a list, then
# stack the inputs so validation_results becomes a list-column.
new_df <- list(df, new_element) %>%
  map(function(frame) map_at(frame, "validation_results", list)) %>%
  bind_rows()
# created_at status validation_results
# <chr> <chr> <list>
# 1 2017-10-20 12:15:52 Done <data.frame [1 x 4]>
# 2 2017-10-20 12:16:12 Pending <data.frame [1 x 4]>
您可能想在某些时候使用 unnest
library(tidyr)
new_df %>% unnest(validation_results)
# A tibble: 2 x 6
# created_at status conversions boxes_shipped outlook_history write2db
# <chr> <chr> <chr> <chr> <chr> <chr>
# 1 2017-10-20 12:15:52 Done OK OK OK OK
# 2 2017-10-20 12:16:12 Pending NEW NEW NEW NEW
可以考虑使用 base R 的 data.frame() 构造函数:先把各个向量合并起来,再通过列赋值添加嵌套的数据框:
# Build the plain columns by concatenating the values of both data frames.
# The column is named created_at (not create_at) so the result matches the
# column names of the inputs.
desired_df <- data.frame(
  created_at = c(df$created_at, new_element$created_at),
  status = c(df$status, new_element$status),
  stringsAsFactors = FALSE
)
# Assign the row-bound nested data frames as a new column.
desired_df$validation_results <- rbind(
  df$validation_results,
  new_element$validation_results
)
str(desired_df)
# 'data.frame': 2 obs. of 3 variables:
# $ created_at : chr "2017-10-20 12:15:52" "2017-10-20 12:16:12"
# $ status : chr "Done" "Pending"
# $ validation_results:'data.frame': 2 obs. of 4 variables:
# ..$ conversions : chr "OK" "NEW"
# ..$ boxes_shipped : chr "OK" "NEW"
# ..$ outlook_history: chr "OK" "NEW"
# ..$ write2db : chr "OK" "NEW"
如果需要扁平的数据框(不含嵌套数据框),就不要单独给新列赋值,而是把 rbind() 调用移到 data.frame() 内部:
# Build a flat data frame: the row-bound nested data frames are passed
# unnamed to data.frame(), so their columns are spliced in next to the plain
# ones. The first column is named created_at (not create_at) to match the
# column names of the inputs.
desired_df <- data.frame(
  created_at = c(df$created_at, new_element$created_at),
  status = c(df$status, new_element$status),
  rbind(df$validation_results, new_element$validation_results),
  stringsAsFactors = FALSE
)
str(desired_df)
# 'data.frame': 2 obs. of 6 variables:
# $ created_at : chr "2017-10-20 12:15:52" "2017-10-20 12:16:12"
# $ status : chr "Done" "Pending"
# $ conversions : chr "OK" "NEW"
# $ boxes_shipped : chr "OK" "NEW"
# $ outlook_history: chr "OK" "NEW"
# $ write2db : chr "OK" "NEW"
我在 R 中处理复杂的 JSON 文件(通过使用 jsonlite 包),我需要将行附加到包含嵌套数据帧的数据帧。
下面是原始数据框 df 的结构(它只有一条观测,我想在它后面追加新行),以及我希望添加的新行 new_element 的结构。
> str(df)
$ :'data.frame': 1 obs. of 3 variables:
..$ created_at : chr "2017-10-20 08:55:13"
..$ status : chr "Done"
..$ validation_results:'data.frame': 1 obs. of 4 variables:
.. ..$ conversions : chr "OK"
.. ..$ boxes_shipped : chr "OK"
.. ..$ outlook_history: chr "OK"
.. ..$ write2db : chr "OK"
> str(new_element)
'data.frame': 1 obs. of 3 variables:
$ created_at : chr "2017-10-20 09:20:30"
$ status : chr "Pending"
$ validation_results:'data.frame': 1 obs. of 4 variables:
..$ conversions : chr "NEW"
..$ boxes_shipped : chr "NEW"
..$ outlook_history: chr "NEW"
..$ write2db : chr "NEW"
问题来了。当我尝试将 new_element 添加到 df 时:
df <- rbind(df, new_element)
我收到这个错误:
Error in `row.names<-.data.frame`(`*tmp*`, value = value) :
duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique values when setting 'row.names': ‘1’
至于名字:
> names(df)
[1] "created_at" "status" "validation_results"
> names(new_element)
[1] "created_at" "status" "validation_results"
有人知道实际发生了什么吗?
编辑:
应@Moody_Mudskipper 的要求,我添加了 dput 输出。
> dput(df)
structure(list(created_at = "2017-10-20 12:15:52", status = "Done",
validation_results = structure(list(conversions = "OK",
boxes_shipped = "OK", outlook_history = "OK", write2db = "OK"), .Names = c("conversions",
"boxes_shipped", "outlook_history", "write2db"), row.names = c(NA,
-1L), class = "data.frame")), .Names = c("created_at", "status",
"validation_results"), row.names = 1L, class = "data.frame")
> dput(new_element)
structure(list(created_at = "2017-10-20 12:16:12", status = "Pending",
validation_results = structure(list(conversions = "NEW",
boxes_shipped = "NEW", outlook_history = "NEW", write2db = "NEW"), .Names = c("conversions",
"boxes_shipped", "outlook_history", "write2db"), row.names = c(NA,
-1L), class = "data.frame")), .Names = c("created_at", "status",
"validation_results"), row.names = 1L, class = "data.frame")
编辑2:
这是我期待获得的结果数据帧输出,可以使用 jsonlite 包将其转换为 JSON 文件。
> str(result)
'data.frame': 2 obs. of 3 variables:
$ created_at : chr "2017-10-20 08:55:13" "2017-10-20 09:20:30"
$ status : chr "Done" "Pending"
$ validation_results:'data.frame': 2 obs. of 4 variables:
..$ conversions : chr "OK" "NEW"
..$ boxes_shipped : chr "OK" "NEW"
..$ outlook_history: chr "OK" "NEW"
..$ write2db : chr "OK" "NEW"
> dput(result)
structure(list(created_at = c("2017-10-20 08:55:13", "2017-10-20
09:20:30"), status = c("Done", "Pending"), validation_results = structure(list(
conversions = c("OK", "NEW"), boxes_shipped = c("OK", "NEW"
), outlook_history = c("OK", "NEW"), write2db =
c("OK", "NEW")), .Names = c("conversions", "boxes_shipped",
"outlook_history", "write2db"), class = "data.frame", row.names =
1:2)), .Names = c("created_at",
"status", "validation_results"), class = "data.frame", row.names =
1:2)
编辑3:
> result <- read_from_JSON('output.json')
> jsonlite::toJSON(result, pretty = TRUE)
[
{
"created_at": "2017-10-20 08:55:13",
"status": "Done",
"validation_results": {
"conversions": "OK",
"boxes_shipped": "OK",
"outlook_history": "OK",
"write2db": "OK"
}
},
{
"created_at": "2017-10-20 09:20:30",
"status": "Pending",
"validation_results": {
"conversions": "NEW",
"boxes_shipped": "NEW",
"outlook_history": "NEW",
"write2db": "NEW"
}
}
]
> str(result)
'data.frame': 2 obs. of 3 variables:
$ created_at : chr "2017-10-20 08:55:13" "2017-10-20 09:20:30"
$ status : chr "Done" "Pending"
$ validation_results:'data.frame': 2 obs. of 4 variables:
..$ conversions : chr "OK" "NEW"
..$ boxes_shipped : chr "OK" "NEW"
..$ outlook_history: chr "OK" "NEW"
..$ write2db : chr "OK" "NEW"
仔细检查 new_element 和 result[2,] 之间的区别之后:
# Set the row name of the nested validation_results data frame in new_element
# to 2 (presumably so it no longer clashes with row name 1 of df's nested data
# frame, which is what the "duplicate 'row.names'" error complains about).
new_element <- within(new_element,rownames(validation_results) <- 2)
# With the nested row name changed, the plain rbind() call is attempted again.
new_df <- rbind(df,new_element)
# Compare the combined data frame with the expected result as R objects.
identical(new_df,result)
# FALSE -> not all mysteries are solved
# Compare the JSON serializations instead (toJSON() is presumably
# jsonlite::toJSON, so jsonlite must be attached -- TODO confirm).
identical(toJSON(new_df,pretty=TRUE),toJSON(result,pretty=TRUE))
# TRUE -> but that should be good enough
编辑:
一个基于 @Parfait 回答的通用解决方案:它可以对任意数量、结构相同的 data.frame 执行 rbind,而且不需要事先知道各列的类:
library(dplyr)
library(purrr)
#' Row-bind data frames that contain nested data-frame columns.
#'
#' Columns that are not data frames are combined with a single rbind() call.
#' Each nested data-frame column is row-bound on its own and then attached to
#' the combined result, which keeps the column order of the first argument.
#'
#' @param ... Data frames sharing the same column names. The first one decides
#'   which columns are treated as nested data frames.
#' @return A data frame holding the rows of all inputs, in argument order.
rbind2 <- function(...) {
  frames <- list(...)
  if (length(frames) == 0L) {
    stop("rbind2() needs at least one data frame", call. = FALSE)
  }
  template <- frames[[1L]]
  # vapply() always returns a logical vector (sapply() returns list() for a
  # zero-column input); inherits() is the idiomatic class test.
  is_nested <- vapply(
    template,
    function(column) inherits(column, "data.frame"),
    logical(1L)
  )
  flat_names <- names(template)[!is_nested]
  nested_names <- names(template)[is_nested]
  # Columns that are not data frames: simple rbind. Selecting by name keeps
  # this correct even when later inputs order their columns differently.
  combined <- do.call(rbind, lapply(frames, `[`, flat_names))
  for (column_name in nested_names) {
    combined[[column_name]] <- do.call(rbind, lapply(frames, `[[`, column_name))
  }
  combined[names(template)]
}
new_df <- rbind2(df,new_element)
identical(new_df,result) # TRUE
你可以试试这个,例如:
rbind2(result,result)
旧答案:
源数据的格式有点别扭,所以我们先做一些数据整理,把 validation_results 列设置为 data.frame 的列表(list-column)。
library(dplyr)
library(purrr)
# Wrap the nested validation_results data frame of each input in a list, then
# stack the inputs so validation_results becomes a list-column.
new_df <- list(df, new_element) %>%
  map(function(frame) map_at(frame, "validation_results", list)) %>%
  bind_rows()
# created_at status validation_results
# <chr> <chr> <list>
# 1 2017-10-20 12:15:52 Done <data.frame [1 x 4]>
# 2 2017-10-20 12:16:12 Pending <data.frame [1 x 4]>
您可能想在某些时候使用 unnest
library(tidyr)
new_df %>% unnest(validation_results)
# A tibble: 2 x 6
# created_at status conversions boxes_shipped outlook_history write2db
# <chr> <chr> <chr> <chr> <chr> <chr>
# 1 2017-10-20 12:15:52 Done OK OK OK OK
# 2 2017-10-20 12:16:12 Pending NEW NEW NEW NEW
可以考虑使用 base R 的 data.frame() 构造函数:先把各个向量合并起来,再通过列赋值添加嵌套的数据框:
# Build the plain columns by concatenating the values of both data frames.
# The column is named created_at (not create_at) so the result matches the
# column names of the inputs.
desired_df <- data.frame(
  created_at = c(df$created_at, new_element$created_at),
  status = c(df$status, new_element$status),
  stringsAsFactors = FALSE
)
# Assign the row-bound nested data frames as a new column.
desired_df$validation_results <- rbind(
  df$validation_results,
  new_element$validation_results
)
str(desired_df)
# 'data.frame': 2 obs. of 3 variables:
# $ created_at : chr "2017-10-20 12:15:52" "2017-10-20 12:16:12"
# $ status : chr "Done" "Pending"
# $ validation_results:'data.frame': 2 obs. of 4 variables:
# ..$ conversions : chr "OK" "NEW"
# ..$ boxes_shipped : chr "OK" "NEW"
# ..$ outlook_history: chr "OK" "NEW"
# ..$ write2db : chr "OK" "NEW"
如果需要扁平的数据框(不含嵌套数据框),就不要单独给新列赋值,而是把 rbind() 调用移到 data.frame() 内部:
# Build a flat data frame: the row-bound nested data frames are passed
# unnamed to data.frame(), so their columns are spliced in next to the plain
# ones. The first column is named created_at (not create_at) to match the
# column names of the inputs.
desired_df <- data.frame(
  created_at = c(df$created_at, new_element$created_at),
  status = c(df$status, new_element$status),
  rbind(df$validation_results, new_element$validation_results),
  stringsAsFactors = FALSE
)
str(desired_df)
# 'data.frame': 2 obs. of 6 variables:
# $ created_at : chr "2017-10-20 12:15:52" "2017-10-20 12:16:12"
# $ status : chr "Done" "Pending"
# $ conversions : chr "OK" "NEW"
# $ boxes_shipped : chr "OK" "NEW"
# $ outlook_history: chr "OK" "NEW"
# $ write2db : chr "OK" "NEW"