R:避免循环或行应用功能
R: Avoid loop or row apply function
我关注了两个数据框
df_sales 和 df_supply.
我想将销售合并为供应,以便我的 df_sales table 在以下条件下从 df_supply 获得 DATE_SUPPLY 和 QNT_SUPPLY
*条件:DATE_SUPPLY应该是对应"STORE"对应"ITEM"的最近DATE_SUPPLY,即DATE_SALE <- max(df_supply[df_supply$DATE_SUPPLY <= df_sales$DATE_SALE & df_supply$STORE == df_sales$STORE & df_supply$ITEM == df_sales$ITEM,]$DATE_SUPPLY)*
可以使用行应用函数或简单地通过编写循环来实现。但我有庞大的数据集,所以不想循环。
df_sales <- data.frame("STORE"=c(1001,1001,1001,1001,1001,1002,1002,1002,1002,1002),"ITEM"=c(13048, 13057, 13082, 13048, 13057, 13145, 13166, 13229, 13057, 13048),"DATE_SALE"=as.Date(c("1/1/2014","1/1/2014","1/2/2014","1/2/2014","1/2/2014","1/2/2014","1/3/2014","1/3/2014","1/3/2014","1/4/2014"),"%m/%d/%Y"),"QNT_SALE"=c(1,1,1,1,1,1,1,1,1,1))
df_sales
STORE ITEM DATE_SALE QNT_SALE
1 1001 13048 2014-01-01 1
2 1001 13057 2014-01-01 1
3 1001 13082 2014-01-02 1
4 1001 13048 2014-01-02 1
5 1001 13057 2014-01-02 1
6 1002 13145 2014-01-02 1
7 1002 13166 2014-01-03 1
8 1002 13229 2014-01-03 1
9 1002 13057 2014-01-03 1
10 1002 13048 2014-01-04 1
df_supply <- data.frame("STORE"=c(1001,1002,1001,1001,1002,1002,1002,1002,1002,1002),"ITEM"=c(13048,13229,13057,13082,13145,13166,13229,13057,13048,13048),"DATE_SUPPLY"=as.Date(c("1/31/2013","1/31/2013","1/31/2013","1/1/2014","1/2/2014","1/2/2014","1/2/2014","1/2/2014","1/3/2014","2/1/2014"),"%m/%d/%Y"),"QNT_SUPPLY"=c(2,1,2,1,1,1,2,3,1,2))
df_supply
STORE ITEM DATE_SUPPLY CUM_QNT_SUPPLY
1 1001 13048 2013-01-31 2
2 1002 13229 2013-01-31 1
3 1001 13057 2013-01-31 2
4 1001 13082 2014-01-01 1
5 1002 13145 2014-01-02 1
6 1002 13166 2014-01-02 1
7 1002 13229 2014-01-02 2
8 1002 13057 2014-01-02 3
9 1002 13048 2014-01-03 1
10 1002 13048 2014-02-01 2
Output Required:
Sales Vs Supply
STORE ITEM DATE_SALE QNT_SALE DATE_SUPPLY QNT_SUPPLY
1 1001 13048 2014-01-01 1 2013-01-31 2
2 1001 13057 2014-01-01 1 2013-01-31 2
3 1001 13082 2014-01-02 1 2014-01-01 1
4 1001 13048 2014-01-02 1 2013-01-31 2
5 1001 13057 2014-01-02 1 2013-01-31 2
6 1002 13145 2014-01-03 1 2014-01-02 1
7 1002 13166 2014-01-03 1 2014-01-02 1
8 1002 13229 2014-01-03 1 2014-01-02 2
9 1002 13057 2014-01-03 1 2014-01-02 3
10 1002 13048 2014-01-04 1 2014-01-03 1
您可以尝试使用 merge
和相关顺序 (order
):
# order the data.frames
df_sales <- df_sales[order(-df_sales$STORE, -df_sales$ITEM, df_sales$DATE_SALE, decreasing=T), ]
df_supply <- df_supply[order(-df_supply$STORE, -df_supply$ITEM, df_supply$DATE_SUPPLY, decreasing=T), ]
# merge the data.frames
res <- merge(df_sales, df_supply, by=c("STORE","ITEM"), all=T)
# keep only records with DATE_SUPPLY anterior to DATE_SALE
res <- res[with(res, DATE_SUPPLY <= DATE_SALE), ]
# remove duplicates (based on STORE, ITEM and DATE_SALE)
res <- res[!duplicated(res[, 1:3]), ]
res
# STORE ITEM DATE_SALE QNT_SALE DATE_SUPPLY QNT_SUPPLY
# 1 1001 13048 2014-01-02 1 2013-01-31 2
# 2 1001 13048 2014-01-01 1 2013-01-31 2
# 3 1001 13057 2014-01-02 1 2013-01-31 2
# 4 1001 13057 2014-01-01 1 2013-01-31 2
# 5 1001 13082 2014-01-02 1 2014-01-01 1
# 7 1002 13048 2014-01-04 1 2014-01-03 1
# 8 1002 13057 2014-01-03 1 2014-01-02 3
# 9 1002 13145 2014-01-02 1 2014-01-02 1
# 10 1002 13166 2014-01-03 1 2014-01-02 1
# 11 1002 13229 2014-01-03 1 2014-01-02 2
使用 滚动连接 来自 data.table
:
require(data.table)
setkey(setDT(df_supply), STORE, ITEM, DATE_SUPPLY)
idx = df_supply[df_sales, roll=Inf, which=TRUE]
cbind(df_sales, df_supply[idx, 3:4])
# STORE ITEM DATE_SALE QNT_SALE DATE_SUPPLY QNT_SUPPLY
# 1 1001 13048 2014-01-01 1 2013-01-31 2
# 2 1001 13057 2014-01-01 1 2013-01-31 2
# 3 1001 13082 2014-01-02 1 2014-01-01 1
# 4 1001 13048 2014-01-02 1 2013-01-31 2
# 5 1001 13057 2014-01-02 1 2013-01-31 2
# 6 1002 13145 2014-01-02 1 2014-01-02 1
# 7 1002 13166 2014-01-03 1 2014-01-02 1
# 8 1002 13229 2014-01-03 1 2014-01-02 2
# 9 1002 13057 2014-01-03 1 2014-01-02 3
# 10 1002 13048 2014-01-04 1 2014-01-03 1
cbind
returns 一个全新的对象。相反,如果您想通过引用 添加新列 到 df_sales
,请改用 :=
。在 SO 上有许多使用它的示例,并且还在 new HTML vignettes.
下进行了解释
我关注了两个数据框 df_sales 和 df_supply.
我想将销售合并为供应,以便我的 df_sales table 在以下条件下从 df_supply 获得 DATE_SUPPLY 和 QNT_SUPPLY
*条件:DATE_SUPPLY应该是对应"STORE"对应"ITEM"的最近DATE_SUPPLY,即DATE_SALE <- max(df_supply[df_supply$DATE_SUPPLY <= df_sales$DATE_SALE & df_supply$STORE == df_sales$STORE & df_supply$ITEM == df_sales$ITEM,]$DATE_SUPPLY)*
可以使用行应用函数或简单地通过编写循环来实现。但我有庞大的数据集,所以不想循环。
df_sales <- data.frame("STORE"=c(1001,1001,1001,1001,1001,1002,1002,1002,1002,1002),"ITEM"=c(13048, 13057, 13082, 13048, 13057, 13145, 13166, 13229, 13057, 13048),"DATE_SALE"=as.Date(c("1/1/2014","1/1/2014","1/2/2014","1/2/2014","1/2/2014","1/2/2014","1/3/2014","1/3/2014","1/3/2014","1/4/2014"),"%m/%d/%Y"),"QNT_SALE"=c(1,1,1,1,1,1,1,1,1,1))
df_sales
STORE ITEM DATE_SALE QNT_SALE
1 1001 13048 2014-01-01 1
2 1001 13057 2014-01-01 1
3 1001 13082 2014-01-02 1
4 1001 13048 2014-01-02 1
5 1001 13057 2014-01-02 1
6 1002 13145 2014-01-02 1
7 1002 13166 2014-01-03 1
8 1002 13229 2014-01-03 1
9 1002 13057 2014-01-03 1
10 1002 13048 2014-01-04 1
df_supply <- data.frame("STORE"=c(1001,1002,1001,1001,1002,1002,1002,1002,1002,1002),"ITEM"=c(13048,13229,13057,13082,13145,13166,13229,13057,13048,13048),"DATE_SUPPLY"=as.Date(c("1/31/2013","1/31/2013","1/31/2013","1/1/2014","1/2/2014","1/2/2014","1/2/2014","1/2/2014","1/3/2014","2/1/2014"),"%m/%d/%Y"),"QNT_SUPPLY"=c(2,1,2,1,1,1,2,3,1,2))
df_supply
STORE ITEM DATE_SUPPLY CUM_QNT_SUPPLY
1 1001 13048 2013-01-31 2
2 1002 13229 2013-01-31 1
3 1001 13057 2013-01-31 2
4 1001 13082 2014-01-01 1
5 1002 13145 2014-01-02 1
6 1002 13166 2014-01-02 1
7 1002 13229 2014-01-02 2
8 1002 13057 2014-01-02 3
9 1002 13048 2014-01-03 1
10 1002 13048 2014-02-01 2
Output Required:
Sales Vs Supply
STORE ITEM DATE_SALE QNT_SALE DATE_SUPPLY QNT_SUPPLY
1 1001 13048 2014-01-01 1 2013-01-31 2
2 1001 13057 2014-01-01 1 2013-01-31 2
3 1001 13082 2014-01-02 1 2014-01-01 1
4 1001 13048 2014-01-02 1 2013-01-31 2
5 1001 13057 2014-01-02 1 2013-01-31 2
6 1002 13145 2014-01-03 1 2014-01-02 1
7 1002 13166 2014-01-03 1 2014-01-02 1
8 1002 13229 2014-01-03 1 2014-01-02 2
9 1002 13057 2014-01-03 1 2014-01-02 3
10 1002 13048 2014-01-04 1 2014-01-03 1
您可以尝试使用 merge
和相关顺序 (order
):
# order the data.frames
df_sales <- df_sales[order(-df_sales$STORE, -df_sales$ITEM, df_sales$DATE_SALE, decreasing=T), ]
df_supply <- df_supply[order(-df_supply$STORE, -df_supply$ITEM, df_supply$DATE_SUPPLY, decreasing=T), ]
# merge the data.frames
res <- merge(df_sales, df_supply, by=c("STORE","ITEM"), all=T)
# keep only records with DATE_SUPPLY anterior to DATE_SALE
res <- res[with(res, DATE_SUPPLY <= DATE_SALE), ]
# remove duplicates (based on STORE, ITEM and DATE_SALE)
res <- res[!duplicated(res[, 1:3]), ]
res
# STORE ITEM DATE_SALE QNT_SALE DATE_SUPPLY QNT_SUPPLY
# 1 1001 13048 2014-01-02 1 2013-01-31 2
# 2 1001 13048 2014-01-01 1 2013-01-31 2
# 3 1001 13057 2014-01-02 1 2013-01-31 2
# 4 1001 13057 2014-01-01 1 2013-01-31 2
# 5 1001 13082 2014-01-02 1 2014-01-01 1
# 7 1002 13048 2014-01-04 1 2014-01-03 1
# 8 1002 13057 2014-01-03 1 2014-01-02 3
# 9 1002 13145 2014-01-02 1 2014-01-02 1
# 10 1002 13166 2014-01-03 1 2014-01-02 1
# 11 1002 13229 2014-01-03 1 2014-01-02 2
使用 滚动连接 来自 data.table
:
require(data.table)
setkey(setDT(df_supply), STORE, ITEM, DATE_SUPPLY)
idx = df_supply[df_sales, roll=Inf, which=TRUE]
cbind(df_sales, df_supply[idx, 3:4])
# STORE ITEM DATE_SALE QNT_SALE DATE_SUPPLY QNT_SUPPLY
# 1 1001 13048 2014-01-01 1 2013-01-31 2
# 2 1001 13057 2014-01-01 1 2013-01-31 2
# 3 1001 13082 2014-01-02 1 2014-01-01 1
# 4 1001 13048 2014-01-02 1 2013-01-31 2
# 5 1001 13057 2014-01-02 1 2013-01-31 2
# 6 1002 13145 2014-01-02 1 2014-01-02 1
# 7 1002 13166 2014-01-03 1 2014-01-02 1
# 8 1002 13229 2014-01-03 1 2014-01-02 2
# 9 1002 13057 2014-01-03 1 2014-01-02 3
# 10 1002 13048 2014-01-04 1 2014-01-03 1
cbind
returns 一个全新的对象。相反,如果您想通过引用 添加新列 到 df_sales
,请改用 :=
。在 SO 上有许多使用它的示例,并且还在 new HTML vignettes.