根据另一个数据框筛选由两部分组成的回答数据,并连接两个数据框
Filter for two-part response data from another dataframe and join two dataframes
我有一个调查问题,格式为:"Do you prefer a rose or a tulip? Imagine that the rose has colors V1 and V2, and the tulip has colors V3 and V4"
实际颜色是从一个数据框中包含的组合中提取的:
数据帧 1 (df1):
# Literal data for df1: 16 rows (one per colour combination) and four factor
# columns. V1 and V2 are the rose colours, V3 and V4 the tulip colours.
# The expression is not assigned here; bind it with `df1 <- structure(...)`
# before running code that refers to df1.
structure(list(V1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("red", "ruby"), class = "factor"),
V2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("blue", "violet"), class = "factor"),
V3 = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L), .Label = c("green", "turqoise"), class = "factor"),
V4 = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L), .Label = c("black", "yellow"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4"), class = "data.frame", row.names = c(NA, -16L
))
在此数据框 (df1) 中,前两列(V1 和 V2)对应于 "rose",最后两列(V3 和 V4)对应于 "tulip"。例如,可以向受访者显示 df1 第一行的组合 1,即 "red blue green yellow"。这意味着受访者可以选择 "rose that is red and blue" 或 "tulip that is green and yellow".
受访者做出的选择保存在另一个数据框 (df2) 中。df2 为每一种颜色组合设有一列。如果向受访者 1 展示了 df1 中的第一个组合 ("red blue green yellow"),而他选择了郁金香(即绿色和黄色),则该选择在 df2 的第一行中记为“2”(代表郁金香,即第二朵花)。如果向受访者 2 展示了 df1 中的第二个组合 ("red blue green black"),而他选择了玫瑰(即红色和蓝色),则该选择在 df2 的第二行中记为“1”(代表玫瑰,即第一朵花)。换句话说,“2”表示 "tulip chosen, rose not chosen",“1”表示 "rose chosen, tulip not chosen"。
数据帧 2 (df2):
# Literal data for df2: respondentID 1..16 plus one column per colour
# combination (v1..v16). A non-NA entry records the respondent's choice for
# that combination: 1 = rose (first flower), 2 = tulip (second flower).
# The expression is not assigned here; bind it with `df2 <- structure(...)`
# before running code that refers to df2.
structure(list(respondentID = 1:16, v1 = c(2L, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v2 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v3 = c(NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA),
v4 = c(NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA, 1L, 2L, NA,
NA, NA, NA), v5 = c(NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), v6 = c(NA, 2L, NA, NA, NA, NA, NA,
NA, NA, 1L, NA, NA, NA, NA, NA, NA), v7 = c(NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v8 = c(NA,
NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), v9 = c(NA, NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA, NA,
NA, NA, NA, NA), v10 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), v11 = c(NA, NA, NA, NA,
NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), v12 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA
), v13 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 1L, NA, NA), v14 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), v15 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v16 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L
)), .Names = c("respondentID", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16"), class = "data.frame", row.names = c(NA, -16L))
如果我只想知道选择了哪朵花和颜色,我可以使用:
# Lookup table of colour combinations: the four colour columns of df1 are
# renamed color1..color4 and every row is tagged with the name of the df2
# column ("v1", "v2", ...) that stores the answers for that combination.
df1_with_id <- df1 %>%
  setNames(paste0("color", 1:4)) %>%
  mutate(combo = paste0("v", row_number()))

# One row per recorded answer: respondent, combination shown (`combo`), the
# recorded choice (`val`) and the four colours of that combination.
result_df <- df2 %>%
  # pivot_longer() replaces the superseded gather(); values_drop_na = TRUE
  # replaces the separate filter(!is.na(val)) step.
  pivot_longer(-respondentID, names_to = "combo", values_to = "val",
               values_drop_na = TRUE) %>%
  left_join(df1_with_id, by = "combo") %>%
  arrange(respondentID) %>%
  # Keep the plain data.frame class that the gather()-based version returned.
  as.data.frame()
但这并没有给出我需要的格式。我需要为每个受访者在单独的行中分别列出两个选项(即 "rose that is V1 and V2" 和 "tulip that is V3 and V4")的信息,并附加一个变量来表示在两个选项之间所做的选择,如下所示:
Desired result
(图中,choice变量中的“1”表示受访者选择的选项,“0”表示未选择的选项。)
我不太明白如何编写代码以这种方式组织数据。有什么建议吗?
这里的主要问题是 df1 中的每一列都包含两条信息:花的类型和颜色编号。因此,先将这些列重命名,使列名同时包含这两条信息,再把它们收集 (gather) 到一列中,将键列拆分 (separate) 为 flower 和 color 两列,然后展开 (spread) color 列。之后只需转换 val:如果它与 flower 列匹配则为 1,否则为 0。
# Build one row per respondent and flower, holding that flower's two colours
# and `choice` = 1 if the respondent picked that flower, 0 otherwise.
# Requires df1_with_id (colour lookup keyed by `combo`) to exist already.
df2 %>%
  # Long format: one row per recorded answer. pivot_longer() replaces the
  # superseded gather(), and values_drop_na = TRUE replaces the NA filter.
  pivot_longer(-respondentID, names_to = "combo", values_to = "val",
               values_drop_na = TRUE) %>%
  left_join(df1_with_id, by = "combo") %>%
  arrange(respondentID) %>%
  # Encode both facts (flower and colour slot) in the column names.
  rename(rose_color1 = color1, rose_color2 = color2,
         tulip_color1 = color3, tulip_color2 = color4) %>%
  # Split "<flower>_<colorN>" names: the prefix becomes the `flower` column,
  # the suffix (".value") names the output columns color1 / color2. This does
  # the work of gather() + separate() + spread() in one step, without the
  # "attributes are not identical" warning gather() raises on these factors.
  pivot_longer(rose_color1:tulip_color2,
               names_to = c("flower", ".value"), names_sep = "_") %>%
  mutate(
    # The colour factors have different level sets; return plain strings,
    # as the gather()-based version did.
    color1 = as.character(color1),
    color2 = as.character(color2),
    # val is 1 for rose; any other recorded value is treated as tulip.
    val = if_else(val == 1, "rose", "tulip"),
    choice = if_else(val == flower, 1, 0)
  ) %>%
  select(respondentID, flower, color1, color2, choice) %>%
  # Keep the plain data.frame class that the spread()-based version returned.
  as.data.frame()
我有一个调查问题,格式为:"Do you prefer a rose or a tulip? Imagine that the rose has colors V1 and V2, and the tulip has colors V3 and V4"
实际颜色是从一个数据框中包含的组合中提取的:
数据帧 1 (df1):
# Literal data for df1: 16 rows (one per colour combination) and four factor
# columns. V1 and V2 are the rose colours, V3 and V4 the tulip colours.
# The expression is not assigned here; bind it with `df1 <- structure(...)`
# before running code that refers to df1.
structure(list(V1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("red", "ruby"), class = "factor"),
V2 = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L), .Label = c("blue", "violet"), class = "factor"),
V3 = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L,
2L, 2L, 1L, 1L, 2L, 2L), .Label = c("green", "turqoise"), class = "factor"),
V4 = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L,
2L, 1L, 2L, 1L, 2L, 1L), .Label = c("black", "yellow"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4"), class = "data.frame", row.names = c(NA, -16L
))
在此数据框 (df1) 中,前两列(V1 和 V2)对应于 "rose",最后两列(V3 和 V4)对应于 "tulip"。例如,可以向受访者显示 df1 第一行的组合 1,即 "red blue green yellow"。这意味着受访者可以选择 "rose that is red and blue" 或 "tulip that is green and yellow".
受访者做出的选择保存在另一个数据框 (df2) 中。df2 为每一种颜色组合设有一列。如果向受访者 1 展示了 df1 中的第一个组合 ("red blue green yellow"),而他选择了郁金香(即绿色和黄色),则该选择在 df2 的第一行中记为“2”(代表郁金香,即第二朵花)。如果向受访者 2 展示了 df1 中的第二个组合 ("red blue green black"),而他选择了玫瑰(即红色和蓝色),则该选择在 df2 的第二行中记为“1”(代表玫瑰,即第一朵花)。换句话说,“2”表示 "tulip chosen, rose not chosen",“1”表示 "rose chosen, tulip not chosen"。
数据帧 2 (df2):
# Literal data for df2: respondentID 1..16 plus one column per colour
# combination (v1..v16). A non-NA entry records the respondent's choice for
# that combination: 1 = rose (first flower), 2 = tulip (second flower).
# The expression is not assigned here; bind it with `df2 <- structure(...)`
# before running code that refers to df2.
structure(list(respondentID = 1:16, v1 = c(2L, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v2 = c(NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v3 = c(NA,
NA, NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA),
v4 = c(NA, NA, NA, 2L, NA, NA, NA, NA, NA, NA, 1L, 2L, NA,
NA, NA, NA), v5 = c(NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), v6 = c(NA, 2L, NA, NA, NA, NA, NA,
NA, NA, 1L, NA, NA, NA, NA, NA, NA), v7 = c(NA, NA, NA, NA,
1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v8 = c(NA,
NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA
), v9 = c(NA, NA, NA, NA, NA, NA, NA, 2L, NA, NA, NA, NA,
NA, NA, NA, NA), v10 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), v11 = c(NA, NA, NA, NA,
NA, NA, NA, NA, 1L, NA, NA, NA, NA, NA, NA, NA), v12 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1L, NA, NA, NA
), v13 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 1L, NA, NA), v14 = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), v15 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), v16 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2L
)), .Names = c("respondentID", "v1", "v2", "v3", "v4", "v5",
"v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
"v16"), class = "data.frame", row.names = c(NA, -16L))
如果我只想知道选择了哪朵花和颜色,我可以使用:
# Lookup table of colour combinations: the four colour columns of df1 are
# renamed color1..color4 and every row is tagged with the name of the df2
# column ("v1", "v2", ...) that stores the answers for that combination.
df1_with_id <- df1 %>%
  setNames(paste0("color", 1:4)) %>%
  mutate(combo = paste0("v", row_number()))

# One row per recorded answer: respondent, combination shown (`combo`), the
# recorded choice (`val`) and the four colours of that combination.
result_df <- df2 %>%
  # pivot_longer() replaces the superseded gather(); values_drop_na = TRUE
  # replaces the separate filter(!is.na(val)) step.
  pivot_longer(-respondentID, names_to = "combo", values_to = "val",
               values_drop_na = TRUE) %>%
  left_join(df1_with_id, by = "combo") %>%
  arrange(respondentID) %>%
  # Keep the plain data.frame class that the gather()-based version returned.
  as.data.frame()
但这并没有给出我需要的格式。我需要为每个受访者在单独的行中分别列出两个选项(即 "rose that is V1 and V2" 和 "tulip that is V3 and V4")的信息,并附加一个变量来表示在两个选项之间所做的选择,如下所示: Desired result
(图中,choice变量中的“1”表示受访者选择的选项,“0”表示未选择的选项。)
我不太明白如何编写代码以这种方式组织数据。有什么建议吗?
这里的主要问题是 df1 中的每一列都包含两条信息:花的类型和颜色编号。因此,先将这些列重命名,使列名同时包含这两条信息,再把它们收集 (gather) 到一列中,将键列拆分 (separate) 为 flower 和 color 两列,然后展开 (spread) color 列。之后只需转换 val:如果它与 flower 列匹配则为 1,否则为 0。
# Build one row per respondent and flower, holding that flower's two colours
# and `choice` = 1 if the respondent picked that flower, 0 otherwise.
# Requires df1_with_id (colour lookup keyed by `combo`) to exist already.
df2 %>%
  # Long format: one row per recorded answer. pivot_longer() replaces the
  # superseded gather(), and values_drop_na = TRUE replaces the NA filter.
  pivot_longer(-respondentID, names_to = "combo", values_to = "val",
               values_drop_na = TRUE) %>%
  left_join(df1_with_id, by = "combo") %>%
  arrange(respondentID) %>%
  # Encode both facts (flower and colour slot) in the column names.
  rename(rose_color1 = color1, rose_color2 = color2,
         tulip_color1 = color3, tulip_color2 = color4) %>%
  # Split "<flower>_<colorN>" names: the prefix becomes the `flower` column,
  # the suffix (".value") names the output columns color1 / color2. This does
  # the work of gather() + separate() + spread() in one step, without the
  # "attributes are not identical" warning gather() raises on these factors.
  pivot_longer(rose_color1:tulip_color2,
               names_to = c("flower", ".value"), names_sep = "_") %>%
  mutate(
    # The colour factors have different level sets; return plain strings,
    # as the gather()-based version did.
    color1 = as.character(color1),
    color2 = as.character(color2),
    # val is 1 for rose; any other recorded value is treated as tulip.
    val = if_else(val == 1, "rose", "tulip"),
    choice = if_else(val == flower, 1, 0)
  ) %>%
  select(respondentID, flower, color1, color2, choice) %>%
  # Keep the plain data.frame class that the spread()-based version returned.
  as.data.frame()