R - 使用多个标识符匹配值(当查找 ID 的顺序是随机的时)
R - Match values using multiple identifiers (when the order of lookup IDs are random)
我的问题是此 的后续问题。我在这里提出一个新问题 - 因为这与上一个问题非常不同。
假设我有以下两个数据集:
df1 = data.frame(PersonId1=c(1,2,3,4,5,6,7,8,9,10,1),PersonId2=c(11,12,13,14,15,16,17,18,19,20,11),
Played_together = c(1,0,0,1,1,0,0,0,1,0,1),
Event=c(1,1,1,1,2,2,2,2,2,2,2),
Utility=c(20,-2,-5,10,30,2,1,.5,50,-1,60))
这看起来像:
PersonId1 PersonId2 Played_together Event Utility
1 1 11 1 1 20.0
2 2 12 0 1 -2.0
3 3 13 0 1 -5.0
4 4 14 1 1 10.0
5 5 15 1 2 30.0
6 6 16 0 2 2.0
7 7 17 0 2 1.0
8 8 18 0 2 0.5
9 9 19 1 2 50.0
10 10 20 0 2 -1.0
11 1 11 1 2 60.0
.
df2 = data.frame(PersonId1=c(11,15,9,1),PersonId2=c(1,5,19,11),
Played_together = c(1,1,1,1),
Event=c(1,2,2,2),Utility=c(25,36,51,64))
这看起来像:
PersonId1 PersonId2 Played_together Event Utility
1 11 1 1 1 25
2 15 5 1 2 36
3 9 19 1 2 51
4 1 11 1 2 64
我想执行以下操作:在 df2[ 中查找每一对(每个事件中的 和 played_together == 1) =37=] 并将其与 df1 中的观察结果相匹配。如果匹配,则在 df1 中创建一个名为 'Utility from df2' 的新列。不是,填0。
对我来说挑战来自于人的顺序在 df1 和 df2 中不一致。例如,在 df1 第 1 行中,对于 event== 1 和 played_together == 1,我们看到:personid1 = 1 和 personid2 = 11 而在 df2 中,在第 1 行中我有 personid1=11 和 personid2 =1,对于事件== 1 和 played_together==1。因此两者是相同的。我想从 df2 中获取实用程序的值并将其放入 df1 中的新列中。如果没有匹配,则输入 0.
最终数据框应如下所示:
PersonId1 PersonId2 Played_together Event Utility Utility_from_df2
1 1 11 1 1 20.0 25
2 2 12 0 1 -2.0 0
3 3 13 0 1 -5.0 0
4 4 14 1 1 10.0 0
5 5 15 1 2 30.0 36
6 6 16 0 2 2.0 0
7 7 17 0 2 1.0 0
8 8 18 0 2 0.5 0
9 9 19 1 2 50.0 51
10 10 20 0 2 -1.0 0
11 1 11 1 2 60.0 64
非常感谢。
使用 dplyr
和 data.table
:
df2 = data.frame(PersonId1=c(11,15,9,1),PersonId2=c(1,5,19,11),
Played_together = c(1,1,1,1),
Event=c(1,2,2,2),
Utility=c(25,36,51,64)) # you had missed adding Utility in your ques
library(data.table)
library(dplyr)
df3 <- copy(df2)
colnames(df2) <- c("PersonId2", "PersonId1", "Played_together", "Event", "Utility")
setDT(df2)
df2 <- df2[, c("PersonId2", "PersonId1", "Utility", "Event")]
df3 <- df3[, c("PersonId2", "PersonId1", "Utility", "Event")]
df <- left_join(df1, df2, c("PersonId2", "PersonId1", "Event"))
df <- left_join(df, df3, by = c("PersonId2", "PersonId1", "Event"))
setDT(df)
df[, Utility_from_df2 := ifelse(is.na(Utility), Utility.y, ifelse(is.na(Utility.y), Utility, 0))]
df[is.na(df)] <- 0
df[, c("Utility.y", "Utility") := NULL]
setnames(df, "Utility.x", "Utility")
期望的输出:
PersonId1 PersonId2 Played_together Event Utility Utility_from_df2
1: 1 11 1 1 20.0 25
2: 2 12 0 1 -2.0 0
3: 3 13 0 1 -5.0 0
4: 4 14 1 1 10.0 0
5: 5 15 1 2 30.0 36
6: 6 16 0 2 2.0 0
7: 7 17 0 2 1.0 0
8: 8 18 0 2 0.5 0
9: 9 19 1 2 50.0 51
10: 10 20 0 2 -1.0 0
11: 1 11 1 2 60.0 64
我的问题是此
假设我有以下两个数据集:
df1 = data.frame(PersonId1=c(1,2,3,4,5,6,7,8,9,10,1),PersonId2=c(11,12,13,14,15,16,17,18,19,20,11),
Played_together = c(1,0,0,1,1,0,0,0,1,0,1),
Event=c(1,1,1,1,2,2,2,2,2,2,2),
Utility=c(20,-2,-5,10,30,2,1,.5,50,-1,60))
这看起来像:
PersonId1 PersonId2 Played_together Event Utility
1 1 11 1 1 20.0
2 2 12 0 1 -2.0
3 3 13 0 1 -5.0
4 4 14 1 1 10.0
5 5 15 1 2 30.0
6 6 16 0 2 2.0
7 7 17 0 2 1.0
8 8 18 0 2 0.5
9 9 19 1 2 50.0
10 10 20 0 2 -1.0
11 1 11 1 2 60.0
.
df2 = data.frame(PersonId1=c(11,15,9,1),PersonId2=c(1,5,19,11),
Played_together = c(1,1,1,1),
Event=c(1,2,2,2),Utility=c(25,36,51,64))
这看起来像:
PersonId1 PersonId2 Played_together Event Utility
1 11 1 1 1 25
2 15 5 1 2 36
3 9 19 1 2 51
4 1 11 1 2 64
我想执行以下操作:在 df2[ 中查找每一对(每个事件中的 和 played_together == 1) =37=] 并将其与 df1 中的观察结果相匹配。如果匹配,则在 df1 中创建一个名为 'Utility from df2' 的新列。不是,填0。
对我来说挑战来自于人的顺序在 df1 和 df2 中不一致。例如,在 df1 第 1 行中,对于 event== 1 和 played_together == 1,我们看到:personid1 = 1 和 personid2 = 11 而在 df2 中,在第 1 行中我有 personid1=11 和 personid2 =1,对于事件== 1 和 played_together==1。因此两者是相同的。我想从 df2 中获取实用程序的值并将其放入 df1 中的新列中。如果没有匹配,则输入 0.
最终数据框应如下所示:
PersonId1 PersonId2 Played_together Event Utility Utility_from_df2
1 1 11 1 1 20.0 25
2 2 12 0 1 -2.0 0
3 3 13 0 1 -5.0 0
4 4 14 1 1 10.0 0
5 5 15 1 2 30.0 36
6 6 16 0 2 2.0 0
7 7 17 0 2 1.0 0
8 8 18 0 2 0.5 0
9 9 19 1 2 50.0 51
10 10 20 0 2 -1.0 0
11 1 11 1 2 60.0 64
非常感谢。
使用 dplyr
和 data.table
:
df2 = data.frame(PersonId1=c(11,15,9,1),PersonId2=c(1,5,19,11),
Played_together = c(1,1,1,1),
Event=c(1,2,2,2),
Utility=c(25,36,51,64)) # you had missed adding Utility in your ques
library(data.table)
library(dplyr)
df3 <- copy(df2)
colnames(df2) <- c("PersonId2", "PersonId1", "Played_together", "Event", "Utility")
setDT(df2)
df2 <- df2[, c("PersonId2", "PersonId1", "Utility", "Event")]
df3 <- df3[, c("PersonId2", "PersonId1", "Utility", "Event")]
df <- left_join(df1, df2, c("PersonId2", "PersonId1", "Event"))
df <- left_join(df, df3, by = c("PersonId2", "PersonId1", "Event"))
setDT(df)
df[, Utility_from_df2 := ifelse(is.na(Utility), Utility.y, ifelse(is.na(Utility.y), Utility, 0))]
df[is.na(df)] <- 0
df[, c("Utility.y", "Utility") := NULL]
setnames(df, "Utility.x", "Utility")
期望的输出:
PersonId1 PersonId2 Played_together Event Utility Utility_from_df2
1: 1 11 1 1 20.0 25
2: 2 12 0 1 -2.0 0
3: 3 13 0 1 -5.0 0
4: 4 14 1 1 10.0 0
5: 5 15 1 2 30.0 36
6: 6 16 0 2 2.0 0
7: 7 17 0 2 1.0 0
8: 8 18 0 2 0.5 0
9: 9 19 1 2 50.0 51
10: 10 20 0 2 -1.0 0
11: 1 11 1 2 60.0 64