如果记录缺失，则创建带有标识符且 outcome 为 NA 的新行

Question

我有一个如下所示的数据框：

   id day  time trial outcome
1   1   1 15:10     2   FALSE
2   1   2 15:01     2    TRUE
3   1   2 20:10     3   FALSE
4   1   3 11:10     1   FALSE
5   1   4 15:10     2   FALSE
6   1   5 20:13     3   FALSE
7   1   6 11:10     1   FALSE
8   1   6 15:10     2   FALSE
9   1   7 11:10     1   FALSE
10  1   7 15:09     2    TRUE
11  1   7 20:00     3    TRUE
12  1   8 11:10     1   FALSE
13  1   8 15:01     2    TRUE
14  1   9 15:00     2    TRUE
15  1   9 20:06     3    TRUE
16  1  10 11:10     1   FALSE
17  1  11 11:10     1   FALSE
18  1  11 15:00     2    TRUE
19  1  12 20:00     3    TRUE
20  1  13 15:02     2    TRUE

参与者每天接受 3 次试验（trial）。outcome 表示他们是否查看了该次试验。如果他们没有查看，则不会记录任何数据行。我想做的是为那些缺失的试验创建行。

因此在上面的示例中，第 1 天需要添加试验 1 和 3：其中 id=1，day=1，trial=1 和 3，但 time 和 outcome 为 NA。第 2 天只需添加试验 1：其中 id=1，day=2，trial=1，time 和 outcome 同样为 NA。

我一直在尝试，但一直未能找到可靠的解决方案。

可重现的数据集：

# dput() representation of the example data frame (20 rows).
# Columns: id (factor with 96 levels; every row here is level "1"),
# day (numeric), time (character "HH:MM"), trial (numeric) and
# outcome (factor with levels "FALSE"/"TRUE").
structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1", 
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13", 
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24", 
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35", 
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46", 
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57", 
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68", 
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79", 
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90", 
"91", "92", "93", "94", "95", "96"), class = "factor"), day = c(1, 
2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13), 
    time = c("15:10", "15:01", "20:10", "11:10", "15:10", "20:13", 
    "11:10", "15:10", "11:10", "15:09", "20:00", "11:10", "15:01", 
    "15:00", "20:06", "11:10", "11:10", "15:00", "20:00", "15:02"
    ), trial = c(2, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 3, 
    1, 1, 2, 3, 2), outcome = structure(c(1L, 2L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L
    ), .Label = c("FALSE", "TRUE"), class = "factor")), .Names = c("id", 
"day", "time", "trial", "outcome"), row.names = c(NA, 20L), class = "data.frame")

Answer 1

这是一个使用 merge 的 base R 解决方案。

# Skeleton data frame used as a template: one row for every (day, trial)
# pair from 1 up to the largest day and trial present in df.
# seq_len() is used instead of 1:n so that a zero count yields an empty
# sequence rather than c(1, 0).
# NOTE: id is not part of the skeleton, so the rows added by the merge
# have NA in id as well as in time and outcome. To carry id onto the
# added rows, include id in the skeleton and in `by`.
n_days <- max(df$day)
n_trials <- max(df$trial)
df2 <- data.frame(
  day = rep(seq_len(n_days), each = n_trials),
  trial = rep(seq_len(n_trials), times = n_days)
)

# Full outer join on (day, trial): combinations that exist only in the
# skeleton are kept, with NA in every column that comes from df.
df <- merge(df, df2, by = c("day", "trial"), all = TRUE)
df
#   day trial   id  time outcome
#1    1     1 <NA>  <NA>    <NA>
#2    1     2    1 15:10   FALSE
#3    1     3 <NA>  <NA>    <NA>
#4    2     1 <NA>  <NA>    <NA>
#5    2     2    1 15:01    TRUE
#6    2     3    1 20:10   FALSE
#7    3     1    1 11:10   FALSE
#8    3     2 <NA>  <NA>    <NA>
#9    3     3 <NA>  <NA>    <NA>
#10   4     1 <NA>  <NA>    <NA>
#11   4     2    1 15:10   FALSE
#12   4     3 <NA>  <NA>    <NA>
#13   5     1 <NA>  <NA>    <NA>
#14   5     2 <NA>  <NA>    <NA>
#15   5     3    1 20:13   FALSE
#16   6     1    1 11:10   FALSE
#17   6     2    1 15:10   FALSE
#18   6     3 <NA>  <NA>    <NA>
#19   7     1    1 11:10   FALSE
#20   7     2    1 15:09    TRUE
#21   7     3    1 20:00    TRUE
#22   8     1    1 11:10   FALSE
#23   8     2    1 15:01    TRUE
#24   8     3 <NA>  <NA>    <NA>
#25   9     1 <NA>  <NA>    <NA>
#26   9     2    1 15:00    TRUE
#27   9     3    1 20:06    TRUE
#28  10     1    1 11:10   FALSE
#29  10     2 <NA>  <NA>    <NA>
#30  10     3 <NA>  <NA>    <NA>
#31  11     1    1 11:10   FALSE
#32  11     2    1 15:00    TRUE
#33  11     3 <NA>  <NA>    <NA>
#34  12     1 <NA>  <NA>    <NA>
#35  12     2 <NA>  <NA>    <NA>
#36  12     3    1 20:00    TRUE
#37  13     1 <NA>  <NA>    <NA>
#38  13     2    1 15:02    TRUE
#39  13     3 <NA>  <NA>    <NA>

说明：根据最大天数和最大试验次数构建骨架（模板）dataframe，然后使用 merge(..., by = c("day", "trial"), all = TRUE) 将骨架与原始 dataframe 合并，缺失的条目会被标记为 NA。

示例数据

# Sample data: the example data frame from the question (20 rows).
# Columns: id (factor with 96 levels; every row here is level "1"),
# day (numeric), time (character "HH:MM"), trial (numeric) and
# outcome (factor with levels "FALSE"/"TRUE").
df <- structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96"), class = "factor"), day = c(1,
2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13),
    time = c("15:10", "15:01", "20:10", "11:10", "15:10", "20:13",
    "11:10", "15:10", "11:10", "15:09", "20:00", "11:10", "15:01",
    "15:00", "20:06", "11:10", "11:10", "15:00", "20:00", "15:02"
    ), trial = c(2, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 3,
    1, 1, 2, 3, 2), outcome = structure(c(1L, 2L, 1L, 1L, 1L,
    1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L
    ), .Label = c("FALSE", "TRUE"), class = "factor")), .Names = c("id",
"day", "time", "trial", "outcome"), row.names = c(NA, 20L), class = "data.frame")

Answer 2

您可以使用 expand.grid() 生成所有可能的组合，然后用 merge() 将它们与原始 data.frame 合并。

# Example data matching the question: a single participant (id "1") with
# at most three trials per day over days 1-13. Trials that were not
# viewed have no row at all.
dat <- data.frame(
  id = rep("1", 20),
  day = c(1, 2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13),
  time = c("15:10", "15:01", "20:10", "11:10", "15:10", "20:13",
           "11:10", "15:10", "11:10", "15:09", "20:00", "11:10", "15:01",
           "15:00", "20:06", "11:10", "11:10", "15:00", "20:00", "15:02"),
  trial = c(2, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 3, 1, 1, 2, 3, 2),
  outcome = structure(
    c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
      2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L),
    .Label = c("FALSE", "TRUE"), class = "factor"
  )
)

# Every (id, day, trial) combination that should be present in the result.
vals <- expand.grid(
  id = unique(dat$id),
  day = unique(dat$day),
  trial = unique(dat$trial)
)

# Join on the identifying columns only. The grid must not carry its own
# time/outcome columns: merge() would then also match on them, nothing
# would match, and every observed trial would appear twice (once with
# data, once as an all-NA row). Combinations without an observation get
# NA in time and outcome.
dat2 <- merge(vals, dat, by = c("id", "day", "trial"), all = TRUE)

Answer 3

这正是 tidyr 中 complete() 的用途：

library(tidyr)

# Expand df so that every trial value appears for each (id, day) pair
# already present in the data; nesting() restricts the expansion to the
# id/day combinations that actually occur. Columns not named here are
# filled with NA on the added rows.
df %>%
  complete(trial, nesting(id, day))

如果记录缺失，则创建带有标识符且 outcome 为 NA 的新行

Create new row with identifier and NA on outcome if missing

r

dplyr

tidyverse

示例数据