向数据框添加新行(其中嵌套数据框)时出现问题

Issue when adding new rows (with nested dataframes within) to a dataframe

我在 R 中处理复杂的 JSON 文件(通过使用 jsonlite 包),我需要将行附加到包含嵌套数据帧的数据帧。

下面是原始数据框的结构(df,目前只有一条观测,我想在它上面追加新行),以及我希望添加的新行(new_element)的结构。

> str(df)
 $ :'data.frame':   1 obs. of  3 variables:
  ..$ created_at        : chr "2017-10-20 08:55:13"
  ..$ status            : chr "Done"
  ..$ validation_results:'data.frame':  1 obs. of  4 variables:
  .. ..$ conversions    : chr "OK"
  .. ..$ boxes_shipped  : chr "OK"
  .. ..$ outlook_history: chr "OK"
  .. ..$ write2db       : chr "OK"

> str(new_element)
'data.frame':   1 obs. of  3 variables:
 $ created_at        : chr "2017-10-20 09:20:30"
 $ status            : chr "Pending"
 $ validation_results:'data.frame': 1 obs. of  4 variables:
  ..$ conversions    : chr "NEW"
  ..$ boxes_shipped  : chr "NEW"
  ..$ outlook_history: chr "NEW"
  ..$ write2db       : chr "NEW"

问题来了。尝试将 new_element 添加到 df:

df <- rbind(df, new_element)

我收到这个错误:

Error in `row.names<-.data.frame`(`*tmp*`, value = value) : 
  duplicate 'row.names' are not allowed
In addition: Warning message:
non-unique values when setting 'row.names': ‘1’

至于名字:

> names(df)
[1] "created_at"         "status"             "validation_results"
> names(new_element)
[1] "created_at"         "status"             "validation_results"

有人知道实际发生了什么吗?

编辑:

应@Moody_Mudskipper 的要求,我添加了 dput 输出。

> dput(df)
structure(list(created_at = "2017-10-20 12:15:52", status = "Done", 
validation_results = structure(list(conversions = "OK", 
boxes_shipped = "OK", outlook_history = "OK", write2db = "OK"), .Names = c("conversions", 
"boxes_shipped", "outlook_history", "write2db"), row.names = c(NA, 
-1L), class = "data.frame")), .Names = c("created_at", "status", 
"validation_results"), row.names = 1L, class = "data.frame")

> dput(new_element)
structure(list(created_at = "2017-10-20 12:16:12", status = "Pending", 
validation_results = structure(list(conversions = "NEW", 
boxes_shipped = "NEW", outlook_history = "NEW", write2db = "NEW"), .Names = c("conversions", 
"boxes_shipped", "outlook_history", "write2db"), row.names = c(NA, 
-1L), class = "data.frame")), .Names = c("created_at", "status", 
 "validation_results"), row.names = 1L, class = "data.frame")

编辑2:

这是我期待获得的结果数据帧输出,可以使用 jsonlite 包将其转换为 JSON 文件。

> str(result)
'data.frame':   2 obs. of  3 variables:
 $ created_at        : chr  "2017-10-20 08:55:13" "2017-10-20 09:20:30"
 $ status            : chr  "Done" "Pending"
 $ validation_results:'data.frame': 2 obs. of  4 variables:
  ..$ conversions    : chr  "OK" "NEW"
  ..$ boxes_shipped  : chr  "OK" "NEW"
  ..$ outlook_history: chr  "OK" "NEW"
  ..$ write2db       : chr  "OK" "NEW"

> dput(result)
structure(list(created_at = c("2017-10-20 08:55:13", "2017-10-20 09:20:30"), 
status = c("Done", "Pending"), validation_results = structure(list(
conversions = c("OK", "NEW"), boxes_shipped = c("OK", "NEW"
), outlook_history = c("OK", "NEW"), write2db = 
c("OK", "NEW")), .Names = c("conversions", "boxes_shipped", 
"outlook_history", "write2db"), class = "data.frame", row.names = 
1:2)), .Names = c("created_at", 
"status", "validation_results"), class = "data.frame", row.names = 
1:2)

编辑3:

> result <- read_from_JSON('output.json')
> jsonlite::toJSON(result, pretty = TRUE)
[
  {
    "created_at": "2017-10-20 08:55:13",
    "status": "Done",
    "validation_results": {
    "conversions": "OK",
    "boxes_shipped": "OK",
    "outlook_history": "OK",
    "write2db": "OK"
    }
  },
  {
    "created_at": "2017-10-20 09:20:30",
    "status": "Pending",
    "validation_results": {
    "conversions": "NEW",
    "boxes_shipped": "NEW",
    "outlook_history": "NEW",
    "write2db": "NEW"
    }
  }
 ] 
> str(result)
'data.frame':   2 obs. of  3 variables:
 $ created_at        : chr  "2017-10-20 08:55:13" "2017-10-20 09:20:30"
 $ status            : chr  "Done" "Pending"
 $ validation_results:'data.frame': 2 obs. of  4 variables:
  ..$ conversions    : chr  "OK" "NEW"
  ..$ boxes_shipped  : chr  "OK" "NEW"
  ..$ outlook_history: chr  "OK" "NEW"
  ..$ write2db       : chr  "OK" "NEW"

仔细检查 new_element 和 result[2,] 之间的区别后:

# Give the nested validation_results frame of new_element the row name 2;
# presumably this avoids the duplicate row name '1' reported by rbind().
new_element <- within(new_element,rownames(validation_results) <- 2)
# Bind the two frames with plain rbind() now that the nested row names differ.
new_df <- rbind(df,new_element)
identical(new_df,result)
# FALSE -> not all mysteries are solved
# Compare the serialized JSON text instead of the R objects
# (toJSON is presumably jsonlite::toJSON, so jsonlite must be attached).
identical(toJSON(new_df,pretty=TRUE),toJSON(result,pretty=TRUE))
# TRUE -> but that should be good enough

编辑:

一个通用的解决方案,基于 @Parfait 的回答:它可以对任意数量、结构相同的 data.frame 执行 rbind,而无需事先知道各列的类:

library(dplyr)
library(purrr)
#' Row-bind data frames that share a structure, including nested data frames
#'
#' Binds the ordinary columns with `rbind()`, binds every column that is
#' itself a data frame separately (recursively, so deeper nesting is handled
#' too) and reassembles the pieces in the column order of the first input.
#' Only base R is used.
#'
#' @param ... Data frames with the same columns. The first one decides which
#'   columns are treated as nested data frames and fixes the column order.
#' @return A data frame holding the rows of all inputs, or `NULL` when called
#'   without arguments (mirroring `rbind()`).
rbind2 <- function(...) {
  dfs <- list(...)
  if (length(dfs) == 0L) {
    return(NULL)
  }
  template <- dfs[[1L]]
  # vapply() always yields a logical vector, even for zero-column input.
  is_nested <- vapply(template, inherits, logical(1), what = "data.frame")

  if (all(is_nested)) {
    # No ordinary column to rbind: start from an empty frame with enough rows
    # so the nested columns can be attached below.
    n_rows <- sum(vapply(dfs, nrow, integer(1)))
    new_df <- data.frame(row.names = seq_len(n_rows))
  } else {
    # Columns that are not data frames: simple rbind.
    new_df <- do.call(rbind, lapply(dfs, function(d) d[!is_nested]))
  }

  # Nested data-frame columns: bind each one on its own, recursing so that
  # data frames nested inside nested data frames are bound the same way.
  for (n in names(template)[is_nested]) {
    new_df[[n]] <- do.call(rbind2, lapply(dfs, `[[`, n))
  }
  new_df[names(template)]
}
# Combine the original frame and the new row with the helper, then compare
# the outcome with the expected `result` object.
new_df <- rbind2(df,new_element)
identical(new_df,result) # TRUE

你可以试试这个,例如:

# Stack the two-row `result` frame onto itself.
rbind2(result,result)

旧答案:

源数据的格式有点别扭,所以我们先做一些数据整理,把 validation_results 列变成由 data.frame 组成的列表。

library(dplyr)
library(purrr)
# Wrap each frame's validation_results column in a list, then stack the
# frames; the nested frames end up in a list-column (see the output below).
new_df <- bind_rows(
  map(
    list(df, new_element),
    function(d) map_at(d, "validation_results", list)
  )
)
#              created_at  status   validation_results
#                   <chr>   <chr>               <list>
#   1 2017-10-20 12:15:52    Done <data.frame [1 x 4]>
#   2 2017-10-20 12:16:12 Pending <data.frame [1 x 4]>

您可能想在某些时候使用 unnest

library(tidyr)
# Expand the validation_results list-column into ordinary columns.
unnest(new_df, validation_results)
# A tibble: 2 x 6
#              created_at  status conversions boxes_shipped outlook_history write2db
#                   <chr>   <chr>       <chr>         <chr>           <chr>    <chr>
#   1 2017-10-20 12:15:52    Done          OK            OK              OK       OK
#   2 2017-10-20 12:16:12 Pending         NEW           NEW             NEW      NEW

可以考虑使用 base R 的 data.frame() 构造函数合并各个向量,再通过列赋值加入嵌套的数据框:

# Build the ordinary columns by concatenating the vectors of both frames.
# The column is named created_at so it matches the source frames.
desired_df <- data.frame(
  created_at = c(df$created_at, new_element$created_at),
  status = c(df$status, new_element$status),
  stringsAsFactors = FALSE
)

# ASSIGN NESTED DF TO NEW COLUMN
desired_df$validation_results <- rbind(df$validation_results,
                                       new_element$validation_results)

str(desired_df)
# 'data.frame': 2 obs. of  3 variables:
#  $ created_at        : chr  "2017-10-20 12:15:52" "2017-10-20 12:16:12"
#  $ status            : chr  "Done" "Pending"
#  $ validation_results:'data.frame':   2 obs. of  4 variables:
#   ..$ conversions    : chr  "OK" "NEW"
#   ..$ boxes_shipped  : chr  "OK" "NEW"
#   ..$ outlook_history: chr  "OK" "NEW"
#   ..$ write2db       : chr  "OK" "NEW"

如果需要扁平的数据框(不含嵌套数据框),就不要另外给新列赋值,而是把 rbind() 调用直接放进 data.frame() 里:

# Flat variant: the row-bound nested frames are passed straight to
# data.frame(), so their columns become ordinary top-level columns.
# The column is named created_at so it matches the source frames.
desired_df <- data.frame(
  created_at = c(df$created_at, new_element$created_at),
  status = c(df$status, new_element$status),
  rbind(df$validation_results, new_element$validation_results),
  stringsAsFactors = FALSE
)

str(desired_df)
# 'data.frame': 2 obs. of  6 variables:
#  $ created_at     : chr  "2017-10-20 12:15:52" "2017-10-20 12:16:12"
#  $ status         : chr  "Done" "Pending"
#  $ conversions    : chr  "OK" "NEW"
#  $ boxes_shipped  : chr  "OK" "NEW"
#  $ outlook_history: chr  "OK" "NEW"
#  $ write2db       : chr  "OK" "NEW"