R：循环到子集，追加数据，输出单个数据帧

Question

我有两个独立的数据集 df1 和 df2（行数各不相同），需要先将它们匹配，再把 df2 中特定值所在的列追加到匹配结果上。我的数据集很大，现在卡在如何写一个循环：把 df2 的每一行与 df1 进行匹配，然后追加所需的值……

df1 中的数据根据日期与 df2 匹配。由于 df1 中与 df2 匹配的记录数不同，每次迭代返回的行数也不同。然后我需要从 df2 中取出包含所需值的相关列，并把它们追加到按日期匹配的输出上。下面是一个手动示例（它可以运行，并能得到我需要的结果）：

# Manual example: match df1 records to the first two trips in df2 by date and
# append the df2 values for the matching trip to every matched record.
#
# NOTE: Date, trip.start and trip.end are stored as "mm-dd-yyyy" text, so
# `>=` / `<=` compare them alphabetically rather than chronologically. That is
# harmless for the two trips used here (each starts and ends on the same day),
# but convert with as.Date(x, format = "%m-%d-%Y") before relying on ranges.

# Step 1: Get the desired values from df2 that need to be appended to the matching output
  df2_values = df2[, c("trip.start", "trip.end", "b", "ln_b", "c", "ln_c")]

# Step 2: Find the df1 records inclusive of the dates for the first record in df2
  df1_rec = subset(df1, Date >= df2$trip.start[1] & Date <= df2$trip.end[1])

# Step 3: Append the date matching output and the df2 values
# (the single df2 row is repeated for every matching df1 record)
  output1 = cbind(df1_rec, df2_values[1,])

# Doing this again for df2, second date
  df1_rec = subset(df1, Date >= df2$trip.start[2] & Date <= df2$trip.end[2]) 
  output2 = cbind(df1_rec, df2_values[2,])

我需要对 df2 的 40 行重复此过程，把所有输出用 rbind 合并在一起，然后再对 737 个单独的事件重复整个过程。以下是我到目前为止做过的尝试，但都没有成功：

    # Attempt #1
    # Build one cbind()-ed piece per df2 row with foreach and let
    # .combine=rbind stack the pieces into a single data frame.
       library(foreach)
       output = foreach (j=1:nrow(df2), .combine=rbind) %do% {
       # cbind() is called even when subset() finds no df1 record for trip j;
       # the reported error ("differing number of rows: 0, 1") is consistent
       # with an empty subset being paired with the one-row df2_values[j,].
       cbind(subset(df1, Date >= df2$trip.start[j] & Date <= df2$trip.end[j]), df2_values[j,]) }

    Error in { : 
  task 7 failed - "arguments imply differing number of rows: 0, 1"

# Attempt #2
# Plain for loop over the rows of df2.
       for (j in 1:nrow(df2)) {
       # df1 records whose Date lies within trip j (inclusive on both ends).
       df1_rec = subset(df1, Date >= df2$trip.start[j] & Date <= df2$trip.end[j])
       # `output` is reassigned on every pass, so earlier results are not
       # accumulated; cbind() also runs when df1_rec has zero rows, which is
       # consistent with the "differing number of rows: 0, 1" error.
       output = cbind(df1_rec, df2_values[j,]) }

    Error in data.frame(..., check.names = FALSE) : 
  arguments imply differing number of rows: 0, 1

# Attempt #3
# foreach wrapped around an inner for loop.
   library(foreach)
   # The foreach index `i` is never used in the body: every task re-runs the
   # complete inner loop over j.
   output = foreach(i=1:nrow(df2), .combine=rbind) %do% {
            for (j in 1:nrow(df2)) {
            df1_rec = subset(df1, Date >= df2$trip.start[j] & Date <= df2$trip.end[j])
            df2_rec = df2_values[j,]
             # The cbind() result is not stored anywhere, and a `for` loop
             # itself evaluates to NULL, so nothing would reach .combine.
             cbind(df1_rec, df2_rec)} 
             }
Error in { : 
  task 1 failed - "arguments imply differing number of rows: 0, 1"

这是我的可用数据：

> dput(df1)
structure(list(Date = c("01-06-2008", "01-12-2008", "01-12-2008", 
"03-29-2008", "03-29-2008", "03-30-2008", "03-30-2008", "03-30-2008", 
"04-03-2008", "04-03-2008", "04-22-2008", "04-22-2008", "04-25-2008", 
"04-25-2008", "04-26-2008", "04-26-2008", "06-01-2010", "06-02-2010", 
"06-02-2010", "06-02-2010", "06-11-2010", "06-13-2010", "06-19-2010", 
"06-25-2010", "06-26-2010", "09-06-2010", "09-10-2010", "09-11-2010", 
"09-11-2010", "09-11-2010"), LONGITUDE = c(-83.93867, -84.11667, 
-84.158, -83.89933, -83.90333, -84.03533, -84.038, -84.01267, 
-84.14267, -84.18933, -84.01267, -84.02733, -83.49867, -83.98133, 
-84.194, -84.148, -83.99933, -83.75733, -83.76333, -83.802, -83.79333, 
-83.836, -83.78733, -83.92333, -84.23267, -83.79867, -84.15, 
-84.07533, -84.082, -84.10333), LATITUDE = c(29.38467, 29.262, 
29.19333, 28.89467, 28.866, 29.05933, 29.1, 29.074, 29.056, 29.01267, 
29.52467, 29.488, 29.62267, 29.05867, 29.17067, 29.208, 29.29933, 
29.08333, 29.05, 29.05267, 29.114, 29.09933, 29.18533, 29.04333, 
28.83533, 29.16133, 29.256, 29.254, 29.30867, 29.32667)), .Names = c("Date", 
"LONGITUDE", "LATITUDE"), row.names = c(1L, 20L, 21L, 211L, 212L, 
213L, 214L, 215L, 249L, 250L, 339L, 340L, 367L, 368L, 369L, 370L, 
72196L, 72197L, 72198L, 72199L, 72229L, 72237L, 72261L, 72302L, 
72303L, 72456L, 72493L, 72494L, 72495L, 72496L), class = "data.frame")


> dput(df2)
structure(list(trip.start = c("01-06-2008", "01-12-2008", "03-29-2008", 
"04-03-2008", "04-21-2008", "04-25-2008", "05-30-2008", "06-12-2008", 
"06-27-2008", "12-19-2008", "12-20-2008", "12-28-2008", "01-02-2009", 
"05-08-2009", "05-30-2009", "06-05-2009", "06-11-2009", "06-14-2009", 
"06-19-2009", "07-11-2009"), trip.end = c("01-06-2008", "01-12-2008", 
"03-30-2008", "04-04-2008", "04-22-2008", "04-26-2008", "05-31-2008", 
"06-13-2008", "06-28-2008", "12-19-2008", "12-20-2008", "12-28-2008", 
"01-03-2009", "05-09-2009", "05-30-2009", "06-06-2009", "06-12-2009", 
"06-16-2009", "06-20-2009", "07-11-2009"), b = c(29, 80.2, 50.3589743589744, 
34.6666666666667, 46.4117647058824, 44.0555555555556, 28.6511627906977, 
34.7179487179487, 35.4871794871795, 37.3846153846154, 34.5555555555556, 
103.764705882353, 25.6875, 25.9333333333333, 34, 40.3529411764706, 
28.3448275862069, 25.1764705882353, 30.6896551724138, 47.3333333333333
), ln_b = c(3.36729582998647, 4.38452351487247, 3.91917684278476, 
3.54577861047326, 3.83755297678966, 3.78545146373868, 3.3551940283999, 
3.54725680734257, 3.56917149004797, 3.62125926643896, 3.54256833484301, 
4.64212589251052, 3.24600449225645, 3.25552914251624, 3.52636052461616, 
3.69766428366967, 3.34444456506971, 3.22590985152558, 3.42392563273971, 
3.85721476893315), c = c(680.8023262, 2656.955794, 7593.859146, 
4124.485734, 5422.506164, 5251.09887, 3739.569761, 3577.070057, 
3572.918367, 1604.339982, 2090.724546, 5934.445367, 2302.743139, 
2574.448603, 1095.623765, 3377.839478, 2109.856165, 3065.508244, 
2118.826924, 1424.613609), ln_c = c(6.52327199411584, 7.88493630806025, 
8.93509519255547, 8.32469662045131, 8.59831337944543, 8.56619264225512, 
8.22672584662552, 8.18229932447399, 8.18113801057517, 7.38046772481922, 
7.64526595760902, 8.68852885145536, 7.74185636038285, 7.85339065541043, 
6.99907912846437, 8.12499157640442, 7.65437505590011, 8.02796865648309, 
7.65861787678239, 7.26165590435557)), .Names = c("trip.start", 
"trip.end", "b", "ln_b", "c", "ln_c"), row.names = c(16699L, 
16724L, 17055L, 17081L, 17162L, 17187L, 17383L, 17461L, 17558L, 
18176L, 18191L, 18223L, 18255L, 18732L, 18832L, 18888L, 18941L, 
18979L, 19015L, 19124L), class = "data.frame")

Answer 1

您可以使用 apply:

# Match every trip (row) in df2 against the records in df1 and stack the
# matches into one data frame, `df12`.
#
# Two pitfalls are avoided here:
#   * The dates are stored as "mm-dd-yyyy" text. Comparing them with
#     `>=` / `<=` is alphabetical, so e.g. "06-11-2010" would wrongly fall
#     between "06-11-2009" and "06-12-2009". Parse them to Date first.
#   * apply() on a data frame first converts it to a character matrix, which
#     turns the numeric columns of df2 into formatted strings. Looping over
#     row indices with lapply() keeps every column's type.
rec_dates <- as.Date(df1$Date, format = "%m-%d-%Y")
trip_start <- as.Date(df2$trip.start, format = "%m-%d-%Y")
trip_end <- as.Date(df2$trip.end, format = "%m-%d-%Y")

df12 <- lapply(seq_len(nrow(df2)), function(i) {
  # which() drops NA comparisons, so unparseable dates never select rows.
  matches <- which(rec_dates >= trip_start[i] & rec_dates <= trip_end[i])
  if (length(matches) == 0L) {
    # No df1 record falls inside this trip; rbind() below ignores NULLs.
    return(NULL)
  }
  # The single df2 row is repeated for every matching df1 record.
  cbind(df1[matches, , drop = FALSE], df2[i, , drop = FALSE])
})

# Now bind all the per-trip data frames in the list into a single data.frame.
df12 <- do.call(rbind, df12)

使用 dplyr 库有更好的方法可以做到这一点（非常值得学习！），但如果你想坚持使用基础 R，那么以上就是一种可行的做法。

R：循环到子集，追加数据，输出单个数据帧

R: loop to subset, append data, and output single data frame

loops

r

append

subset