如果数据行缺失,则创建带有标识符且 outcome 为 NA 的新行
Create new row with identifier and NA on outcome if missing
我有一个如下所示的数据框:
id day time trial outcome
1 1 1 15:10 2 FALSE
2 1 2 15:01 2 TRUE
3 1 2 20:10 3 FALSE
4 1 3 11:10 1 FALSE
5 1 4 15:10 2 FALSE
6 1 5 20:13 3 FALSE
7 1 6 11:10 1 FALSE
8 1 6 15:10 2 FALSE
9 1 7 11:10 1 FALSE
10 1 7 15:09 2 TRUE
11 1 7 20:00 3 TRUE
12 1 8 11:10 1 FALSE
13 1 8 15:01 2 TRUE
14 1 9 15:00 2 TRUE
15 1 9 20:06 3 TRUE
16 1 10 11:10 1 FALSE
17 1 11 11:10 1 FALSE
18 1 11 15:00 2 TRUE
19 1 12 20:00 3 TRUE
20 1 13 15:02 2 TRUE
参与者每天接受 3 次试验。outcome 表示他们是否查看了该次试验。如果他们没有查看,则不会记录任何数据行。我想做的是为那些缺失的试验创建行。
因此在上面的示例中,第 1 天需要添加试验 1 和 3,即 id=1、day=1、trial=1 和 3 的行,其 time 和 outcome 为 NA。第 2 天只需添加试验 1,即 id=1、day=2、trial=1 的行,time 和 outcome 同样为 NA。
我一直在尝试,但一直未能找到可靠的解决方案。
可重现的数据集:
# Example data from the question (looks like dput() output; not assigned to a
# name here). A data frame with 20 rows and the columns:
#   id      factor with 96 levels ("1".."96"); every row here is level "1"
#   day     numeric day index (1-13)
#   time    character clock time, "HH:MM"
#   trial   numeric trial number within the day (1-3)
#   outcome factor with levels "FALSE"/"TRUE"
# Trials without a recorded observation have no row at all.
structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96"), class = "factor"), day = c(1,
2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13),
time = c("15:10", "15:01", "20:10", "11:10", "15:10", "20:13",
"11:10", "15:10", "11:10", "15:09", "20:00", "11:10", "15:01",
"15:00", "20:06", "11:10", "11:10", "15:00", "20:00", "15:02"
), trial = c(2, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 3,
1, 1, 2, 3, 2), outcome = structure(c(1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L
), .Label = c("FALSE", "TRUE"), class = "factor")), .Names = c("id",
"day", "time", "trial", "outcome"), row.names = c(NA, 20L), class = "data.frame")
这是一个使用 merge 的 base R 解决方案。
# Skeleton data frame as template: one row for every participant x day x trial
# combination that should exist (days 1..max(day), trials 1..max(trial)).
# `id` is part of the skeleton so that the rows created for missing trials
# carry the participant id instead of NA.
df2 <- expand.grid(
  trial = seq_len(max(df$trial)),
  day = seq_len(max(df$day)),
  id = unique(df$id),
  KEEP.OUT.ATTRS = FALSE,
  stringsAsFactors = FALSE
)

# Full outer join on the key columns: observed rows keep their time/outcome,
# skeleton-only rows (the missing trials) get NA in those columns.
df <- merge(df, df2, by = c("id", "day", "trial"), all = TRUE)
df
# id day trial time outcome
#1 1 1 1 <NA> <NA>
#2 1 1 2 15:10 FALSE
#3 1 1 3 <NA> <NA>
#4 1 2 1 <NA> <NA>
#5 1 2 2 15:01 TRUE
#6 1 2 3 20:10 FALSE
#7 1 3 1 11:10 FALSE
#8 1 3 2 <NA> <NA>
#9 1 3 3 <NA> <NA>
#10 1 4 1 <NA> <NA>
#11 1 4 2 15:10 FALSE
#12 1 4 3 <NA> <NA>
#13 1 5 1 <NA> <NA>
#14 1 5 2 <NA> <NA>
#15 1 5 3 20:13 FALSE
#16 1 6 1 11:10 FALSE
#17 1 6 2 15:10 FALSE
#18 1 6 3 <NA> <NA>
#19 1 7 1 11:10 FALSE
#20 1 7 2 15:09 TRUE
#21 1 7 3 20:00 TRUE
#22 1 8 1 11:10 FALSE
#23 1 8 2 15:01 TRUE
#24 1 8 3 <NA> <NA>
#25 1 9 1 <NA> <NA>
#26 1 9 2 15:00 TRUE
#27 1 9 3 20:06 TRUE
#28 1 10 1 11:10 FALSE
#29 1 10 2 <NA> <NA>
#30 1 10 3 <NA> <NA>
#31 1 11 1 11:10 FALSE
#32 1 11 2 15:00 TRUE
#33 1 11 3 <NA> <NA>
#34 1 12 1 <NA> <NA>
#35 1 12 2 <NA> <NA>
#36 1 12 3 20:00 TRUE
#37 1 13 1 <NA> <NA>
#38 1 13 2 15:02 TRUE
#39 1 13 3 <NA> <NA>
说明:根据最大天数和最大试验次数构建一个骨架(模板)数据框,然后用 merge(..., by = ..., all = TRUE) 按键列(包括 day 和 trial)将骨架与原始数据框合并,缺失的条目会被填充为 NA。
示例数据
# Example data as a data frame `df` with 20 rows and the columns:
#   id      factor with 96 levels ("1".."96"); every row here is level "1"
#   day     numeric day index (1-13)
#   time    character clock time, "HH:MM"
#   trial   numeric trial number within the day (1-3)
#   outcome factor with levels "FALSE"/"TRUE"
# Trials without a recorded observation have no row at all.
df <- structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96"), class = "factor"), day = c(1,
2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13),
time = c("15:10", "15:01", "20:10", "11:10", "15:10", "20:13",
"11:10", "15:10", "11:10", "15:09", "20:00", "11:10", "15:01",
"15:00", "20:06", "11:10", "11:10", "15:00", "20:00", "15:02"
), trial = c(2, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 3,
1, 1, 2, 3, 2), outcome = structure(c(1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L
), .Label = c("FALSE", "TRUE"), class = "factor")), .Names = c("id",
"day", "time", "trial", "outcome"), row.names = c(NA, 20L), class = "data.frame")
您可以使用 expand.grid() 获得所有可能的组合,然后将它们与原始 data.frame 进行 merge。
# Example data: one participant (id "1") observed on 13 days with up to three
# trials per day. A trial the participant skipped has no row at all.
dat <- data.frame(
  # Every observation belongs to participant "1"; the numbers 1..20 in the
  # question's printout are row names, not participant ids.
  id = rep("1", 20),
  day = c(1, 2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13),
  time = c("15:10", "15:01", "20:10", "11:10", "15:10", "20:13",
           "11:10", "15:10", "11:10", "15:09", "20:00", "11:10", "15:01",
           "15:00", "20:06", "11:10", "11:10", "15:00", "20:00", "15:02"),
  trial = c(2, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 3, 1, 1, 2, 3, 2),
  outcome = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
                        2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L),
                      .Label = c("FALSE", "TRUE"), class = "factor"),
  stringsAsFactors = FALSE
)

# Every id x day x trial combination that should be present in the result.
vals <- expand.grid(
  id = unique(dat$id),
  day = unique(dat$day),
  trial = unique(dat$trial),
  KEEP.OUT.ATTRS = FALSE,
  stringsAsFactors = FALSE
)

# Left-join the observations onto the full grid using only the key columns.
# Do not add NA `time`/`outcome` columns to `vals` beforehand: merge() would
# treat them as join keys as well, nothing would match, and each observed
# trial would appear twice (once with data, once as an all-NA grid row).
dat2 <- merge(vals, dat, by = c("id", "day", "trial"), all.x = TRUE)
这正是 tidyr 包中 complete 函数的用武之地。
# tidyr::complete() turns implicitly missing rows into explicit ones.
library(tidyr)

# Cross every `trial` value with the (id, day) pairs that occur in `df`;
# nesting() restricts id/day to the pairs actually observed. Combinations
# without an observation get NA in the remaining columns.
complete(data = df, trial, nesting(id, day))
我有一个如下所示的数据框:
id day time trial outcome
1 1 1 15:10 2 FALSE
2 1 2 15:01 2 TRUE
3 1 2 20:10 3 FALSE
4 1 3 11:10 1 FALSE
5 1 4 15:10 2 FALSE
6 1 5 20:13 3 FALSE
7 1 6 11:10 1 FALSE
8 1 6 15:10 2 FALSE
9 1 7 11:10 1 FALSE
10 1 7 15:09 2 TRUE
11 1 7 20:00 3 TRUE
12 1 8 11:10 1 FALSE
13 1 8 15:01 2 TRUE
14 1 9 15:00 2 TRUE
15 1 9 20:06 3 TRUE
16 1 10 11:10 1 FALSE
17 1 11 11:10 1 FALSE
18 1 11 15:00 2 TRUE
19 1 12 20:00 3 TRUE
20 1 13 15:02 2 TRUE
参与者每天接受 3 次试验。outcome 表示他们是否查看了该次试验。如果他们没有查看,则不会记录任何数据行。我想做的是为那些缺失的试验创建行。
因此在上面的示例中,第 1 天需要添加试验 1 和 3,即 id=1、day=1、trial=1 和 3 的行,其 time 和 outcome 为 NA。第 2 天只需添加试验 1,即 id=1、day=2、trial=1 的行,time 和 outcome 同样为 NA。
我一直在尝试,但一直未能找到可靠的解决方案。
可重现的数据集:
# Example data from the question (looks like dput() output; not assigned to a
# name here). A data frame with 20 rows and the columns:
#   id      factor with 96 levels ("1".."96"); every row here is level "1"
#   day     numeric day index (1-13)
#   time    character clock time, "HH:MM"
#   trial   numeric trial number within the day (1-3)
#   outcome factor with levels "FALSE"/"TRUE"
# Trials without a recorded observation have no row at all.
structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96"), class = "factor"), day = c(1,
2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13),
time = c("15:10", "15:01", "20:10", "11:10", "15:10", "20:13",
"11:10", "15:10", "11:10", "15:09", "20:00", "11:10", "15:01",
"15:00", "20:06", "11:10", "11:10", "15:00", "20:00", "15:02"
), trial = c(2, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 3,
1, 1, 2, 3, 2), outcome = structure(c(1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L
), .Label = c("FALSE", "TRUE"), class = "factor")), .Names = c("id",
"day", "time", "trial", "outcome"), row.names = c(NA, 20L), class = "data.frame")
这是一个使用 merge 的 base R 解决方案。
# Skeleton data frame as template: one row for every participant x day x trial
# combination that should exist (days 1..max(day), trials 1..max(trial)).
# `id` is part of the skeleton so that the rows created for missing trials
# carry the participant id instead of NA.
df2 <- expand.grid(
  trial = seq_len(max(df$trial)),
  day = seq_len(max(df$day)),
  id = unique(df$id),
  KEEP.OUT.ATTRS = FALSE,
  stringsAsFactors = FALSE
)

# Full outer join on the key columns: observed rows keep their time/outcome,
# skeleton-only rows (the missing trials) get NA in those columns.
df <- merge(df, df2, by = c("id", "day", "trial"), all = TRUE)
df
# id day trial time outcome
#1 1 1 1 <NA> <NA>
#2 1 1 2 15:10 FALSE
#3 1 1 3 <NA> <NA>
#4 1 2 1 <NA> <NA>
#5 1 2 2 15:01 TRUE
#6 1 2 3 20:10 FALSE
#7 1 3 1 11:10 FALSE
#8 1 3 2 <NA> <NA>
#9 1 3 3 <NA> <NA>
#10 1 4 1 <NA> <NA>
#11 1 4 2 15:10 FALSE
#12 1 4 3 <NA> <NA>
#13 1 5 1 <NA> <NA>
#14 1 5 2 <NA> <NA>
#15 1 5 3 20:13 FALSE
#16 1 6 1 11:10 FALSE
#17 1 6 2 15:10 FALSE
#18 1 6 3 <NA> <NA>
#19 1 7 1 11:10 FALSE
#20 1 7 2 15:09 TRUE
#21 1 7 3 20:00 TRUE
#22 1 8 1 11:10 FALSE
#23 1 8 2 15:01 TRUE
#24 1 8 3 <NA> <NA>
#25 1 9 1 <NA> <NA>
#26 1 9 2 15:00 TRUE
#27 1 9 3 20:06 TRUE
#28 1 10 1 11:10 FALSE
#29 1 10 2 <NA> <NA>
#30 1 10 3 <NA> <NA>
#31 1 11 1 11:10 FALSE
#32 1 11 2 15:00 TRUE
#33 1 11 3 <NA> <NA>
#34 1 12 1 <NA> <NA>
#35 1 12 2 <NA> <NA>
#36 1 12 3 20:00 TRUE
#37 1 13 1 <NA> <NA>
#38 1 13 2 15:02 TRUE
#39 1 13 3 <NA> <NA>
说明:根据最大天数和最大试验次数构建一个骨架(模板)数据框,然后用 merge(..., by = ..., all = TRUE) 按键列(包括 day 和 trial)将骨架与原始数据框合并,缺失的条目会被填充为 NA。
示例数据
# Example data as a data frame `df` with 20 rows and the columns:
#   id      factor with 96 levels ("1".."96"); every row here is level "1"
#   day     numeric day index (1-13)
#   time    character clock time, "HH:MM"
#   trial   numeric trial number within the day (1-3)
#   outcome factor with levels "FALSE"/"TRUE"
# Trials without a recorded observation have no row at all.
df <- structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
"14", "15", "16", "17", "18", "19", "20", "21", "22", "23", "24",
"25", "26", "27", "28", "29", "30", "31", "32", "33", "34", "35",
"36", "37", "38", "39", "40", "41", "42", "43", "44", "45", "46",
"47", "48", "49", "50", "51", "52", "53", "54", "55", "56", "57",
"58", "59", "60", "61", "62", "63", "64", "65", "66", "67", "68",
"69", "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
"80", "81", "82", "83", "84", "85", "86", "87", "88", "89", "90",
"91", "92", "93", "94", "95", "96"), class = "factor"), day = c(1,
2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13),
time = c("15:10", "15:01", "20:10", "11:10", "15:10", "20:13",
"11:10", "15:10", "11:10", "15:09", "20:00", "11:10", "15:01",
"15:00", "20:06", "11:10", "11:10", "15:00", "20:00", "15:02"
), trial = c(2, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 3,
1, 1, 2, 3, 2), outcome = structure(c(1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L
), .Label = c("FALSE", "TRUE"), class = "factor")), .Names = c("id",
"day", "time", "trial", "outcome"), row.names = c(NA, 20L), class = "data.frame")
您可以使用 expand.grid() 获得所有可能的组合,然后将它们与原始 data.frame 进行 merge。
# Example data: one participant (id "1") observed on 13 days with up to three
# trials per day. A trial the participant skipped has no row at all.
dat <- data.frame(
  # Every observation belongs to participant "1"; the numbers 1..20 in the
  # question's printout are row names, not participant ids.
  id = rep("1", 20),
  day = c(1, 2, 2, 3, 4, 5, 6, 6, 7, 7, 7, 8, 8, 9, 9, 10, 11, 11, 12, 13),
  time = c("15:10", "15:01", "20:10", "11:10", "15:10", "20:13",
           "11:10", "15:10", "11:10", "15:09", "20:00", "11:10", "15:01",
           "15:00", "20:06", "11:10", "11:10", "15:00", "20:00", "15:02"),
  trial = c(2, 2, 3, 1, 2, 3, 1, 2, 1, 2, 3, 1, 2, 2, 3, 1, 1, 2, 3, 2),
  outcome = structure(c(1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
                        2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 2L),
                      .Label = c("FALSE", "TRUE"), class = "factor"),
  stringsAsFactors = FALSE
)

# Every id x day x trial combination that should be present in the result.
vals <- expand.grid(
  id = unique(dat$id),
  day = unique(dat$day),
  trial = unique(dat$trial),
  KEEP.OUT.ATTRS = FALSE,
  stringsAsFactors = FALSE
)

# Left-join the observations onto the full grid using only the key columns.
# Do not add NA `time`/`outcome` columns to `vals` beforehand: merge() would
# treat them as join keys as well, nothing would match, and each observed
# trial would appear twice (once with data, once as an all-NA grid row).
dat2 <- merge(vals, dat, by = c("id", "day", "trial"), all.x = TRUE)
这正是 tidyr 包中 complete 函数的用武之地。
# tidyr::complete() turns implicitly missing rows into explicit ones.
library(tidyr)

# Cross every `trial` value with the (id, day) pairs that occur in `df`;
# nesting() restricts id/day to the pairs actually observed. Combinations
# without an observation get NA in the remaining columns.
complete(data = df, trial, nesting(id, day))