通过匹配日期合并 2 个数据框
Merge 2 dataframes by matching dates
我有两个数据框:
id dates
MUM-1 2015-07-10
MUM-1 2015-07-11
MUM-1 2015-07-12
MUM-2 2014-01-14
MUM-2 2014-01-15
MUM-2 2014-01-16
MUM-2 2014-01-17
和:
id dates field1 field2
MUM-1 2015-07-10 1 0
MUM-1 2015-07-12 2 1
MUM-2 2014-01-14 4 3
MUM-2 2014-01-17 0 1
合并数据:
id dates field1 field2
MUM-1 2015-07-10 1 0
MUM-1 2015-07-11 na na
MUM-1 2015-07-12 2 1
MUM-2 2014-01-14 4 3
MUM-2 2014-01-15 na na
MUM-2 2014-01-16 na na
MUM-2 2014-01-17 0 1
代码:merge(x= df1, y= df2, by= 'id', all.x= T)
我正在使用合并,但由于两个数据帧的大小太大,处理时间太长。除了合并功能还有其他选择吗?也许在 dplyr?这样它处理起来比较快。两个数据框都有超过 900K 行。
我认为目前对于这些情况(巨大的数据集)最快的解决方案是 data.table 先设置键后合并。
你也可以使用dplyr
的left_join
和data.frames,如前所述,但比较一下将 data.frames 转换为 data.tables 后执行相同的命令。换句话说,在后台使用 dplyr
和 data.table 结构。
作为示例,我将创建两个数据集,然后将它们保存为 data.frame、带密钥的 data.table 和不带密钥的 data.table。
然后我会执行各种合并并计算时间:
# Benchmark several merge strategies on two 500k-row datasets:
# base-R merge on data.frames, data.table merge (keyed and unkeyed),
# the data.table keyed bracket join, and dplyr left_join on both
# data.frames and data.tables.
library(data.table)
library(dplyr)

n <- 500000L

# First dataset, stored three ways: keyed data.table, unkeyed data.table,
# plain data.frame. The join column is named "id" here — the original used
# "list", which masks base::list().
id <- seq_len(n)
random_number <- rnorm(n, 10, 5)
dataT11 <- data.table(id, random_number, key = "id")  # data.table with a key
dataT12 <- data.table(id, random_number)              # data.table without key
dataF1  <- data.frame(id, random_number)

# Second dataset (same ids, fresh random values), same three storage forms.
random_number <- rnorm(n, 10, 5)
dataT21 <- data.table(id, random_number, key = "id")
dataT22 <- data.table(id, random_number)
dataF2  <- data.frame(id, random_number)

# List the data.tables currently in memory (shows which ones carry keys).
tables()

# 1) base-R merge on data.frames
ptm <- proc.time()
dataF3 <- merge(dataF1, dataF2, all.x = TRUE)
proc.time() - ptm

# 2) data.table merge, join column supplied at call time (no pre-set key)
ptm <- proc.time()
dataT3 <- merge(dataT12, dataT22, all.x = TRUE, by = "id")
proc.time() - ptm

# 3) data.table merge on the keys the tables already have
ptm <- proc.time()
dataT3 <- merge(dataT11, dataT21, all.x = TRUE)
proc.time() - ptm

# 4) data.table keyed bracket join (equivalent alternative to 3)
ptm <- proc.time()
dataT3 <- dataT11[dataT21]
proc.time() - ptm

# 5) dplyr left_join on data.frames
ptm <- proc.time()
dataT3 <- dataF1 %>% left_join(dataF2, by = "id")
proc.time() - ptm

# 6) dplyr left_join on keyed data.tables (dplyr syntax, data.table backend)
ptm <- proc.time()
dataT3 <- dataT11 %>% left_join(dataT21, by = "id")
proc.time() - ptm
可以将两个数据帧都转换为数据tables,然后进行合并:
library(data.table)
# Convert both data.frames to data.tables in place (no copy is made).
setDT(df1); setDT(df2)
# Merge on id; allow.cartesian permits the result to grow larger than the
# bigger input when id values repeat on both sides.
# NOTE(review): this joins on id alone, not (id, dates) — the dates columns
# of the two tables will be suffixed instead of matched; verify intent.
merge(df1, df2, by = "id", allow.cartesian = TRUE)
当任一合并表的键中存在重复值时,allow.cartesian 参数允许合并继续进行(即允许结果表的行数超过原始表中较大者,详见 ?data.table)。
除了使用 merge
和 data.table
,您还可以简单地加入如下:
# Convert both data.frames to data.tables in place.
setDT(df1)
setDT(df2)
# Join: for every row of df1, look up the matching (id, dates) row in df2;
# rows of df1 with no match get NA in df2's value columns.
df2[df1, on = c('id','dates')]
这给出:
> df2[df1, on = c('id','dates')]
id dates field1 field2
1: MUM-1 2015-07-10 1 0
2: MUM-1 2015-07-11 NA NA
3: MUM-1 2015-07-12 2 1
4: MUM-2 2014-01-14 4 3
5: MUM-2 2014-01-15 NA NA
6: MUM-2 2014-01-16 NA NA
7: MUM-2 2014-01-17 0 1
用 dplyr
做这个:
library(dplyr)
# Keep every row of df1; pull field1/field2 from df2 where (id, dates) match,
# filling NA where there is no match.
dplr <- left_join(df1, df2, by=c("id","dates"))
正如@Arun 在评论中提到的那样,基准测试对于七行的小型数据集意义不大。因此,让我们创建一些更大的数据集:
# dt1: 2 ids x 730 consecutive dates each = 1460 rows (the "left" table).
dt1 <- data.table(id=gl(2, 730, labels = c("MUM-1", "MUM-2")),
dates=c(seq(as.Date("2010-01-01"), as.Date("2011-12-31"), by="days"),
seq(as.Date("2013-01-01"), as.Date("2014-12-31"), by="days")))
# dt2: the same 1460 (id, dates) rows plus two random value columns.
# NOTE(review): field1/field2 are length 730 against 1460 rows, so they are
# recycled once to fill the table — confirm this is intended.
dt2 <- data.table(id=gl(2, 730, labels = c("MUM-1", "MUM-2")),
dates=c(seq(as.Date("2010-01-01"), as.Date("2011-12-31"), by="days"),
seq(as.Date("2013-01-01"), as.Date("2014-12-31"), by="days")),
field1=sample(c(0,1,2,3,4), size=730, replace = TRUE),
field2=sample(c(0,1,2,3,4), size=730, replace = TRUE))
# Keep a random subset of 800 rows so some dt1 rows have no match in dt2.
dt2 <- dt2[sample(nrow(dt2), 800)]
可以看出,@Arun的做法稍微快一点:
library(rbenchmark)
# Time the four proposed approaches, 10 replications each.
# Fixed the 'fiedl1' typo from the original, which silently created a
# column literally named "fiedl1" instead of updating "field1".
benchmark(replications = 10, order = "elapsed",
          columns = c("test", "elapsed", "relative"),
          jaap = dt2[dt1, on = c("id", "dates")],
          pavo = merge(dt1, dt2, by = "id", allow.cartesian = TRUE),
          dplr = left_join(dt1, dt2, by = c("id", "dates")),
          arun = dt1[dt2, c("field1", "field2") := .(field1, field2),
                     on = c("id", "dates")])
test elapsed relative
4 arun 0.015 1.000
1 jaap 0.016 1.067
3 dplr 0.037 2.467
2 pavo 1.033 68.867
有关更大数据集上的比较,请参阅下一个答案中的基准测试。
我会通过参考直接更新 df1
如下:
library(data.table) # v1.9.5+
# Update df1 by reference: add field1/field2 from df2 on matching
# (id, dates) rows; unmatched rows get NA. No copy of df1 is made.
# Fixed the 'fiedl1' typo from the original (it created a misnamed column)
# and switched require() to library() so a missing package errors loudly.
setDT(df1)[df2, c("field1", "field2") :=
           .(field1, field2), on = c("id", "dates")]
> df1
# id dates field1 field2
# 1: MUM-1 2015-07-10 1 0
# 2: MUM-1 2015-07-11 NA NA
# 3: MUM-1 2015-07-12 2 1
# 4: MUM-2 2014-01-14 4 3
# 5: MUM-2 2014-01-15 NA NA
# 6: MUM-2 2014-01-16 NA NA
# 7: MUM-2 2014-01-17 0 1
这会非常节省内存(而且速度更快),因为它不会复制整个对象只是为了添加两列,而是就地更新。
更新后的数据集比@Jaap 的更新基准稍大:
# Larger benchmark: dt1 is the full cross join (1e4 ids x 1e3 = 1e7 rows);
# dt2 is a random 1e5-row subset with two sampled value columns.
set.seed(1L)
dt1 = CJ(id1 = paste("MUM", 1:1e4, sep = "-"), id2 = sample(1e3L))
dt2 = dt1[sample(nrow(dt1), 1e5L)][, c("field1", "field2") := lapply(c(1e3L, 1e4L), sample, 1e5L, TRUE)][]
# @Jaap's answers: bracket join, then dplyr left_join on data.frames
system.time(ans1 <- setDT(dt2)[dt1, on = c('id1','id2')])
# user system elapsed
# 0.209 0.067 0.277
system.time(ans2 <- left_join(setDF(dt1), setDF(dt2), by = c("id1", "id2")))
# user system elapsed
# 119.911 0.530 120.749
# this answer: update by reference — adds two columns without copying dt1
system.time(ans3 <- setDT(dt1)[dt2, c("field1", "field2") := list(field1, field2), on = c("id1", "id2")])
# user system elapsed
# 0.087 0.013 0.100
sessionInfo()
# R version 3.2.1 (2015-06-18)
# Platform: x86_64-apple-darwin13.4.0 (64-bit)
# Running under: OS X 10.10.4 (Yosemite)
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
# other attached packages:
# [1] data.table_1.9.5 dplyr_0.4.2
# loaded via a namespace (and not attached):
# [1] magrittr_1.5 R6_2.1.0 assertthat_0.1 parallel_3.2.1 DBI_0.3.1
# [6] tools_3.2.1 Rcpp_0.12.0 chron_2.3-45
虽然预计 dplyr
会慢 ~1200 倍。
我有两个数据框:
id dates
MUM-1 2015-07-10
MUM-1 2015-07-11
MUM-1 2015-07-12
MUM-2 2014-01-14
MUM-2 2014-01-15
MUM-2 2014-01-16
MUM-2 2014-01-17
和:
id dates field1 field2
MUM-1 2015-07-10 1 0
MUM-1 2015-07-12 2 1
MUM-2 2014-01-14 4 3
MUM-2 2014-01-17 0 1
合并数据:
id dates field1 field2
MUM-1 2015-07-10 1 0
MUM-1 2015-07-11 na na
MUM-1 2015-07-12 2 1
MUM-2 2014-01-14 4 3
MUM-2 2014-01-15 na na
MUM-2 2014-01-16 na na
MUM-2 2014-01-17 0 1
代码:merge(x= df1, y= df2, by= 'id', all.x= T)
我正在使用合并,但由于两个数据帧的大小太大,处理时间太长。除了合并功能还有其他选择吗?也许在 dplyr?这样它处理起来比较快。两个数据框都有超过 900K 行。
我认为目前对于这些情况(巨大的数据集)最快的解决方案是 data.table 先设置键后合并。
你也可以使用dplyr
的left_join
和data.frames,如前所述,但比较一下将 data.frames 转换为 data.tables 后执行相同的命令。换句话说,在后台使用 dplyr
和 data.table 结构。
作为示例,我将创建两个数据集,然后将它们保存为 data.frame、带密钥的 data.table 和不带密钥的 data.table。 然后我会执行各种合并并计算时间:
# Benchmark several merge strategies on two 500k-row datasets:
# base-R merge on data.frames, data.table merge (keyed and unkeyed),
# the data.table keyed bracket join, and dplyr left_join on both
# data.frames and data.tables.
library(data.table)
library(dplyr)

n <- 500000L

# First dataset, stored three ways: keyed data.table, unkeyed data.table,
# plain data.frame. The join column is named "id" here — the original used
# "list", which masks base::list().
id <- seq_len(n)
random_number <- rnorm(n, 10, 5)
dataT11 <- data.table(id, random_number, key = "id")  # data.table with a key
dataT12 <- data.table(id, random_number)              # data.table without key
dataF1  <- data.frame(id, random_number)

# Second dataset (same ids, fresh random values), same three storage forms.
random_number <- rnorm(n, 10, 5)
dataT21 <- data.table(id, random_number, key = "id")
dataT22 <- data.table(id, random_number)
dataF2  <- data.frame(id, random_number)

# List the data.tables currently in memory (shows which ones carry keys).
tables()

# 1) base-R merge on data.frames
ptm <- proc.time()
dataF3 <- merge(dataF1, dataF2, all.x = TRUE)
proc.time() - ptm

# 2) data.table merge, join column supplied at call time (no pre-set key)
ptm <- proc.time()
dataT3 <- merge(dataT12, dataT22, all.x = TRUE, by = "id")
proc.time() - ptm

# 3) data.table merge on the keys the tables already have
ptm <- proc.time()
dataT3 <- merge(dataT11, dataT21, all.x = TRUE)
proc.time() - ptm

# 4) data.table keyed bracket join (equivalent alternative to 3)
ptm <- proc.time()
dataT3 <- dataT11[dataT21]
proc.time() - ptm

# 5) dplyr left_join on data.frames
ptm <- proc.time()
dataT3 <- dataF1 %>% left_join(dataF2, by = "id")
proc.time() - ptm

# 6) dplyr left_join on keyed data.tables (dplyr syntax, data.table backend)
ptm <- proc.time()
dataT3 <- dataT11 %>% left_join(dataT21, by = "id")
proc.time() - ptm
可以将两个数据帧都转换为数据tables,然后进行合并:
library(data.table)
# Convert both data.frames to data.tables in place (no copy is made).
setDT(df1); setDT(df2)
# Merge on id; allow.cartesian permits the result to grow larger than the
# bigger input when id values repeat on both sides.
# NOTE(review): this joins on id alone, not (id, dates) — the dates columns
# of the two tables will be suffixed instead of matched; verify intent.
merge(df1, df2, by = "id", allow.cartesian = TRUE)
当任一合并表的键中存在重复值时,allow.cartesian 参数允许合并继续进行(即允许结果表的行数超过原始表中较大者,详见 ?data.table)。
除了使用 merge
和 data.table
,您还可以简单地加入如下:
# Convert both data.frames to data.tables in place.
setDT(df1)
setDT(df2)
# Join: for every row of df1, look up the matching (id, dates) row in df2;
# rows of df1 with no match get NA in df2's value columns.
df2[df1, on = c('id','dates')]
这给出:
> df2[df1, on = c('id','dates')]
id dates field1 field2
1: MUM-1 2015-07-10 1 0
2: MUM-1 2015-07-11 NA NA
3: MUM-1 2015-07-12 2 1
4: MUM-2 2014-01-14 4 3
5: MUM-2 2014-01-15 NA NA
6: MUM-2 2014-01-16 NA NA
7: MUM-2 2014-01-17 0 1
用 dplyr
做这个:
library(dplyr)
# Keep every row of df1; pull field1/field2 from df2 where (id, dates) match,
# filling NA where there is no match.
dplr <- left_join(df1, df2, by=c("id","dates"))
正如@Arun 在评论中提到的那样,基准测试对于七行的小型数据集意义不大。因此,让我们创建一些更大的数据集:
# dt1: 2 ids x 730 consecutive dates each = 1460 rows (the "left" table).
dt1 <- data.table(id=gl(2, 730, labels = c("MUM-1", "MUM-2")),
dates=c(seq(as.Date("2010-01-01"), as.Date("2011-12-31"), by="days"),
seq(as.Date("2013-01-01"), as.Date("2014-12-31"), by="days")))
# dt2: the same 1460 (id, dates) rows plus two random value columns.
# NOTE(review): field1/field2 are length 730 against 1460 rows, so they are
# recycled once to fill the table — confirm this is intended.
dt2 <- data.table(id=gl(2, 730, labels = c("MUM-1", "MUM-2")),
dates=c(seq(as.Date("2010-01-01"), as.Date("2011-12-31"), by="days"),
seq(as.Date("2013-01-01"), as.Date("2014-12-31"), by="days")),
field1=sample(c(0,1,2,3,4), size=730, replace = TRUE),
field2=sample(c(0,1,2,3,4), size=730, replace = TRUE))
# Keep a random subset of 800 rows so some dt1 rows have no match in dt2.
dt2 <- dt2[sample(nrow(dt2), 800)]
可以看出,@Arun的做法稍微快一点:
library(rbenchmark)
# Time the four proposed approaches, 10 replications each.
# Fixed the 'fiedl1' typo from the original, which silently created a
# column literally named "fiedl1" instead of updating "field1".
benchmark(replications = 10, order = "elapsed",
          columns = c("test", "elapsed", "relative"),
          jaap = dt2[dt1, on = c("id", "dates")],
          pavo = merge(dt1, dt2, by = "id", allow.cartesian = TRUE),
          dplr = left_join(dt1, dt2, by = c("id", "dates")),
          arun = dt1[dt2, c("field1", "field2") := .(field1, field2),
                     on = c("id", "dates")])
test elapsed relative
4 arun 0.015 1.000
1 jaap 0.016 1.067
3 dplr 0.037 2.467
2 pavo 1.033 68.867
有关更大数据集上的比较,请参阅下一个答案中的基准测试。
我会通过参考直接更新 df1
如下:
library(data.table) # v1.9.5+
# Update df1 by reference: add field1/field2 from df2 on matching
# (id, dates) rows; unmatched rows get NA. No copy of df1 is made.
# Fixed the 'fiedl1' typo from the original (it created a misnamed column)
# and switched require() to library() so a missing package errors loudly.
setDT(df1)[df2, c("field1", "field2") :=
           .(field1, field2), on = c("id", "dates")]
> df1
# id dates field1 field2
# 1: MUM-1 2015-07-10 1 0
# 2: MUM-1 2015-07-11 NA NA
# 3: MUM-1 2015-07-12 2 1
# 4: MUM-2 2014-01-14 4 3
# 5: MUM-2 2014-01-15 NA NA
# 6: MUM-2 2014-01-16 NA NA
# 7: MUM-2 2014-01-17 0 1
这会非常节省内存(而且速度更快),因为它不会复制整个对象只是为了添加两列,而是就地更新。
更新后的数据集比@Jaap 的更新基准稍大:
# Larger benchmark: dt1 is the full cross join (1e4 ids x 1e3 = 1e7 rows);
# dt2 is a random 1e5-row subset with two sampled value columns.
set.seed(1L)
dt1 = CJ(id1 = paste("MUM", 1:1e4, sep = "-"), id2 = sample(1e3L))
dt2 = dt1[sample(nrow(dt1), 1e5L)][, c("field1", "field2") := lapply(c(1e3L, 1e4L), sample, 1e5L, TRUE)][]
# @Jaap's answers: bracket join, then dplyr left_join on data.frames
system.time(ans1 <- setDT(dt2)[dt1, on = c('id1','id2')])
# user system elapsed
# 0.209 0.067 0.277
system.time(ans2 <- left_join(setDF(dt1), setDF(dt2), by = c("id1", "id2")))
# user system elapsed
# 119.911 0.530 120.749
# this answer: update by reference — adds two columns without copying dt1
system.time(ans3 <- setDT(dt1)[dt2, c("field1", "field2") := list(field1, field2), on = c("id1", "id2")])
# user system elapsed
# 0.087 0.013 0.100
sessionInfo()
# R version 3.2.1 (2015-06-18)
# Platform: x86_64-apple-darwin13.4.0 (64-bit)
# Running under: OS X 10.10.4 (Yosemite)
# locale:
# [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
# attached base packages:
# [1] stats graphics grDevices utils datasets methods base
# other attached packages:
# [1] data.table_1.9.5 dplyr_0.4.2
# loaded via a namespace (and not attached):
# [1] magrittr_1.5 R6_2.1.0 assertthat_0.1 parallel_3.2.1 DBI_0.3.1
# [6] tools_3.2.1 Rcpp_0.12.0 chron_2.3-45
虽然预计 dplyr
会慢 ~1200 倍。