在 R 中,合并 2 个数据帧,同时保持第一个数据帧的行顺序
In R, merge 2 dataframes while maintaining the row order of the first dataframe
当我了解到在 merge() 中默认为 TRUE 的附加参数 'sort' 时,我真的以为我找到了解决这个问题的方法。但是,将其设置为 false 并没有帮助。下面是我的代码演示,以及我得到的结果和我想要的结果:
df2 = structure(list(player = c("Marvin Williams", "Spencer Hawes",
"Jeremy Lin", "Kemba Walker", "P.J. Hairston", "Rudy Gay", "Rajon Rondo",
"DeMarcus Cousins", "Ben McLemore", "Willie Cauley-Stein"), global.player.id = c(263884L,
329824L, 340730L, 462980L, 609567L, 266358L, 262882L, 509450L,
604898L, 699950L), team.name = c("Hornets", "Hornets", "Hornets",
"Hornets", "Grizzlies", "Kings", "Kings", "Kings", "Kings", "Kings"
)), .Names = c("player", "global.player.id", "team.name"), class = "data.frame", row.names = c(47L,
48L, 52L, 53L, 225L, 389L, 390L, 395L, 398L, 401L))
df1 = structure(list(global.player.id = c(-1L, 262882L, 266358L, 509450L,
604898L, 699950L, 263884L, 329824L, 340730L, 462980L, 609567L,
-1L, 262882L, 266358L, 509450L, 604898L, 699950L, 263884L, 329824L,
340730L, 462980L, 609567L, -1L, 262882L, 266358L), x_loc = c(47.17753,
13.57165, 46.45843, 26.68803, 52.16717, 47.20201, 60.097, 47.20201,
52.16717, 65.1302, 46.45843, 47.19141, 13.61702, 46.5355, 26.71856,
52.25433, 47.27324, 60.08215, 47.27324, 52.25433, 65.11267, 46.5355,
46.82163, 13.66478, 46.57545), y_loc = c(26.44326, 25.18298,
18.46573, 25.48557, 33.09177, 31.09372, 22.79717, 31.09372, 33.09177,
26.39671, 18.46573, 26.5187, 25.17431, 18.42014, 25.53807, 33.11185,
31.01197, 22.76307, 31.01197, 33.11185, 26.40227, 18.42014, 26.72834,
25.17784, 18.35961), order = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)), .Names = c("global.player.id",
"x_loc", "y_loc", "order"), row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25"), class = "data.frame")
以上是我正在使用的数据帧。当我将 df2 合并到它时,我想保留 df1 的顺序。我在这里处理时间序列数据,所以数据帧的顺序很重要。 df1中的order列只是为了测试df1是否被洗牌(我不想在合并后使用额外的代码来排序)。
这是我试过的方法:
merge(df1, df2, by = 'global.player.id', all.x = TRUE)
global.player.id x_loc y_loc order player team.name
1 -1 47.17753 26.44326 1 <NA> <NA>
2 -1 46.82163 26.72834 23 <NA> <NA>
3 -1 47.19141 26.51870 12 <NA> <NA>
4 262882 13.57165 25.18298 2 Rajon Rondo Kings
5 262882 13.61702 25.17431 13 Rajon Rondo Kings
6 262882 13.66478 25.17784 24 Rajon Rondo Kings
7 263884 60.08215 22.76307 18 Marvin Williams Hornets
8 263884 60.09700 22.79717 7 Marvin Williams Hornets
9 266358 46.53550 18.42014 14 Rudy Gay Kings
10 266358 46.45843 18.46573 3 Rudy Gay Kings
11 266358 46.57545 18.35961 25 Rudy Gay Kings
12 329824 47.27324 31.01197 19 Spencer Hawes Hornets
13 329824 47.20201 31.09372 8 Spencer Hawes Hornets
14 340730 52.16717 33.09177 9 Jeremy Lin Hornets
15 340730 52.25433 33.11185 20 Jeremy Lin Hornets
16 462980 65.13020 26.39671 10 Kemba Walker Hornets
17 462980 65.11267 26.40227 21 Kemba Walker Hornets
18 509450 26.71856 25.53807 15 DeMarcus Cousins Kings
19 509450 26.68803 25.48557 4 DeMarcus Cousins Kings
20 604898 52.16717 33.09177 5 Ben McLemore Kings
21 604898 52.25433 33.11185 16 Ben McLemore Kings
22 609567 46.53550 18.42014 22 P.J. Hairston Grizzlies
23 609567 46.45843 18.46573 11 P.J. Hairston Grizzlies
24 699950 47.20201 31.09372 6 Willie Cauley-Stein Kings
25 699950 47.27324 31.01197 17 Willie Cauley-Stein Kings
原来在df1里顺序是1-25,现在全乱了。很明显,df1 以一种我不希望的方式被洗牌。这是我将 sort = FALSE 传递给合并函数时的输出:
merge(df1, df2, by = 'global.player.id', all.x = TRUE, sort = FALSE)
global.player.idx_locy_loc命令玩家team.name
1 262882 13.57165 25.18298 2 Rajon Rondo Kings
2 262882 13.61702 25.17431 13 Rajon Rondo Kings
3 262882 13.66478 25.17784 24 Rajon Rondo Kings
4 266358 46.53550 18.42014 14 Rudy Gay Kings
5 266358 46.45843 18.46573 3 Rudy Gay Kings
6 266358 46.57545 18.35961 25 Rudy Gay Kings
7 509450 26.71856 25.53807 15 DeMarcus Cousins Kings
8 509450 26.68803 25.48557 4 DeMarcus Cousins Kings
9 604898 52.16717 33.09177 5 Ben McLemore Kings
10 604898 52.25433 33.11185 16 Ben McLemore Kings
11 699950 47.20201 31.09372 6 Willie Cauley-Stein Kings
12 699950 47.27324 31.01197 17 Willie Cauley-Stein Kings
13 263884 60.08215 22.76307 18 Marvin Williams Hornets
14 263884 60.09700 22.79717 7 Marvin Williams Hornets
15 329824 47.27324 31.01197 19 Spencer Hawes Hornets
16 329824 47.20201 31.09372 8 Spencer Hawes Hornets
17 340730 52.16717 33.09177 9 Jeremy Lin Hornets
18 340730 52.25433 33.11185 20 Jeremy Lin Hornets
19 462980 65.13020 26.39671 10 Kemba Walker Hornets
20 462980 65.11267 26.40227 21 Kemba Walker Hornets
21 609567 46.53550 18.42014 22 P.J. Hairston Grizzlies
22 609567 46.45843 18.46573 11 P.J. Hairston Grizzlies
23 -1 47.17753 26.44326 1 <NA> <NA>
24 -1 46.82163 26.72834 23 <NA> <NA>
25 -1 47.19141 26.51870 12 <NA> <NA>
也不是我想要的,因为订单又乱了。
是否可以在不完全改组传递的第一个数据帧参数的情况下调用合并函数,或者我完全不走运。如果是这样的话,这似乎是 merge() 函数的一个重大缺陷。谢谢!
您可以使用 plyr
中的 join
library(plyr)
plyr::join(df1,df2, by='global.player.id')
结果未排序。
当我了解到在 merge() 中默认为 TRUE 的附加参数 'sort' 时,我真的以为我找到了解决这个问题的方法。但是,将其设置为 false 并没有帮助。下面是我的代码演示,以及我得到的结果和我想要的结果:
df2 = structure(list(player = c("Marvin Williams", "Spencer Hawes",
"Jeremy Lin", "Kemba Walker", "P.J. Hairston", "Rudy Gay", "Rajon Rondo",
"DeMarcus Cousins", "Ben McLemore", "Willie Cauley-Stein"), global.player.id = c(263884L,
329824L, 340730L, 462980L, 609567L, 266358L, 262882L, 509450L,
604898L, 699950L), team.name = c("Hornets", "Hornets", "Hornets",
"Hornets", "Grizzlies", "Kings", "Kings", "Kings", "Kings", "Kings"
)), .Names = c("player", "global.player.id", "team.name"), class = "data.frame", row.names = c(47L,
48L, 52L, 53L, 225L, 389L, 390L, 395L, 398L, 401L))
df1 = structure(list(global.player.id = c(-1L, 262882L, 266358L, 509450L,
604898L, 699950L, 263884L, 329824L, 340730L, 462980L, 609567L,
-1L, 262882L, 266358L, 509450L, 604898L, 699950L, 263884L, 329824L,
340730L, 462980L, 609567L, -1L, 262882L, 266358L), x_loc = c(47.17753,
13.57165, 46.45843, 26.68803, 52.16717, 47.20201, 60.097, 47.20201,
52.16717, 65.1302, 46.45843, 47.19141, 13.61702, 46.5355, 26.71856,
52.25433, 47.27324, 60.08215, 47.27324, 52.25433, 65.11267, 46.5355,
46.82163, 13.66478, 46.57545), y_loc = c(26.44326, 25.18298,
18.46573, 25.48557, 33.09177, 31.09372, 22.79717, 31.09372, 33.09177,
26.39671, 18.46573, 26.5187, 25.17431, 18.42014, 25.53807, 33.11185,
31.01197, 22.76307, 31.01197, 33.11185, 26.40227, 18.42014, 26.72834,
25.17784, 18.35961), order = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)), .Names = c("global.player.id",
"x_loc", "y_loc", "order"), row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15",
"16", "17", "18", "19", "20", "21", "22", "23", "24", "25"), class = "data.frame")
以上是我正在使用的数据帧。当我将 df2 合并到它时,我想保留 df1 的顺序。我在这里处理时间序列数据,所以数据帧的顺序很重要。 df1中的order列只是为了测试df1是否被洗牌(我不想在合并后使用额外的代码来排序)。
这是我试过的方法:
merge(df1, df2, by = 'global.player.id', all.x = TRUE)
global.player.id x_loc y_loc order player team.name
1 -1 47.17753 26.44326 1 <NA> <NA>
2 -1 46.82163 26.72834 23 <NA> <NA>
3 -1 47.19141 26.51870 12 <NA> <NA>
4 262882 13.57165 25.18298 2 Rajon Rondo Kings
5 262882 13.61702 25.17431 13 Rajon Rondo Kings
6 262882 13.66478 25.17784 24 Rajon Rondo Kings
7 263884 60.08215 22.76307 18 Marvin Williams Hornets
8 263884 60.09700 22.79717 7 Marvin Williams Hornets
9 266358 46.53550 18.42014 14 Rudy Gay Kings
10 266358 46.45843 18.46573 3 Rudy Gay Kings
11 266358 46.57545 18.35961 25 Rudy Gay Kings
12 329824 47.27324 31.01197 19 Spencer Hawes Hornets
13 329824 47.20201 31.09372 8 Spencer Hawes Hornets
14 340730 52.16717 33.09177 9 Jeremy Lin Hornets
15 340730 52.25433 33.11185 20 Jeremy Lin Hornets
16 462980 65.13020 26.39671 10 Kemba Walker Hornets
17 462980 65.11267 26.40227 21 Kemba Walker Hornets
18 509450 26.71856 25.53807 15 DeMarcus Cousins Kings
19 509450 26.68803 25.48557 4 DeMarcus Cousins Kings
20 604898 52.16717 33.09177 5 Ben McLemore Kings
21 604898 52.25433 33.11185 16 Ben McLemore Kings
22 609567 46.53550 18.42014 22 P.J. Hairston Grizzlies
23 609567 46.45843 18.46573 11 P.J. Hairston Grizzlies
24 699950 47.20201 31.09372 6 Willie Cauley-Stein Kings
25 699950 47.27324 31.01197 17 Willie Cauley-Stein Kings
原来在df1里顺序是1-25,现在全乱了。很明显,df1 以一种我不希望的方式被洗牌。这是我将 sort = FALSE 传递给合并函数时的输出:
merge(df1, df2, by = 'global.player.id', all.x = TRUE, sort = FALSE)
global.player.idx_locy_loc命令玩家team.name
1 262882 13.57165 25.18298 2 Rajon Rondo Kings
2 262882 13.61702 25.17431 13 Rajon Rondo Kings
3 262882 13.66478 25.17784 24 Rajon Rondo Kings
4 266358 46.53550 18.42014 14 Rudy Gay Kings
5 266358 46.45843 18.46573 3 Rudy Gay Kings
6 266358 46.57545 18.35961 25 Rudy Gay Kings
7 509450 26.71856 25.53807 15 DeMarcus Cousins Kings
8 509450 26.68803 25.48557 4 DeMarcus Cousins Kings
9 604898 52.16717 33.09177 5 Ben McLemore Kings
10 604898 52.25433 33.11185 16 Ben McLemore Kings
11 699950 47.20201 31.09372 6 Willie Cauley-Stein Kings
12 699950 47.27324 31.01197 17 Willie Cauley-Stein Kings
13 263884 60.08215 22.76307 18 Marvin Williams Hornets
14 263884 60.09700 22.79717 7 Marvin Williams Hornets
15 329824 47.27324 31.01197 19 Spencer Hawes Hornets
16 329824 47.20201 31.09372 8 Spencer Hawes Hornets
17 340730 52.16717 33.09177 9 Jeremy Lin Hornets
18 340730 52.25433 33.11185 20 Jeremy Lin Hornets
19 462980 65.13020 26.39671 10 Kemba Walker Hornets
20 462980 65.11267 26.40227 21 Kemba Walker Hornets
21 609567 46.53550 18.42014 22 P.J. Hairston Grizzlies
22 609567 46.45843 18.46573 11 P.J. Hairston Grizzlies
23 -1 47.17753 26.44326 1 <NA> <NA>
24 -1 46.82163 26.72834 23 <NA> <NA>
25 -1 47.19141 26.51870 12 <NA> <NA>
也不是我想要的,因为订单又乱了。
是否可以在不完全改组传递的第一个数据帧参数的情况下调用合并函数,或者我完全不走运。如果是这样的话,这似乎是 merge() 函数的一个重大缺陷。谢谢!
您可以使用 plyr
join
library(plyr)
plyr::join(df1,df2, by='global.player.id')
结果未排序。