函数在 ddply 外部运行良好,但在 ddply 内部抛出错误
function works perfectly fine outside ddply but throws an error inside ddply
我正在尝试对数据框进行逐行操作(按组)。当我 运行 函数本身仅用于一组时,它 运行 非常好。但是,当我将 ddply 中的函数放入所有组的 运行 时,它会抛出一个错误 - 参数长度为零。
当运行单独放在一个数据框上时的函数'test':
for (i in 1:(nrow(test) - 5)) {
if (i <= 5) {
test[i, "MPPALERT"] <- 0
}
FIRSTMPP <- test[i, "TAGMPPSEARCHCOUNT"]
LASTMPP <- test[i+5, "TAGMPPSEARCHCOUNT"]
if ((LASTMPP - FIRSTMPP) >= 10) {
test[i+5, "MPPALERT"] <- 1
} else {
test[i+5, "MPPALERT"] <- 0
}
}
ddply 中的上述函数抛出错误:
Error in if (LASTMPP - FIRSTMPP >= 10) { : argument is of length zero
下面是ddply代码:
mpp_fn <- function(x) {
for (i in 1:(nrow(x) - 5)) {
if (i <= 5) {
x[i, "MPPALERT"] <- 0
}
FIRSTMPP <- x[i, "TAGMPPSEARCHCOUNT"]
LASTMPP <- x[i+5, "TAGMPPSEARCHCOUNT"]
if (LASTMPP - FIRSTMPP >= 10) {
x[i+5, "MPPALERT"] <- 1
} else {
x[i+5, "MPPALERT"] <- 0
}
}
}
result <- ddply(data, c("SHELTERID", "INVERTERID"), mpp_fn(x))
在上面的代码中,FIRSTMPP 和 LASTMPP 的值解析为 NULL,因此会出现错误,但为什么会发生这种情况(当它 运行 在 ddply 之外完全正常时)?
更新:这是 dput(data) 的输出:
structure(list(SHELTERID = c("SH02", "SH02", "SH02", "SH02",
"SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02",
"SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02",
"SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02",
"SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02",
"SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03",
"SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03",
"SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03",
"SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03",
"SH03", "SH03", "SH03", "SH03"), INVERTERID = c("I1", "I1", "I1",
"I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1",
"I1", "I1", "I1", "I1", "I2", "I2", "I2", "I2", "I2", "I2", "I2",
"I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2",
"I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1",
"I1", "I1", "I1", "I1", "I1", "I1", "I1", "I2", "I2", "I2", "I2",
"I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2",
"I2", "I2", "I2"), TAGMPPSEARCHCOUNT = c(0, 0, 0, 0, 0, 0, 0,
2, 0, 0, 3, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 4, 0, 0, 4, 0, 4, 0, 0, 5, 0, 0,
5, 0)), .Names = c("SHELTERID", "INVERTERID", "TAGMPPSEARCHCOUNT"
), row.names = c(350L, 351L, 352L, 353L, 354L, 355L, 356L, 357L,
358L, 359L, 360L, 361L, 362L, 363L, 364L, 365L, 366L, 367L, 494L,
495L, 496L, 497L, 498L, 499L, 500L, 501L, 502L, 503L, 504L, 505L,
506L, 507L, 508L, 509L, 510L, 511L, 638L, 639L, 640L, 641L, 642L,
643L, 644L, 645L, 646L, 647L, 648L, 649L, 650L, 651L, 652L, 653L,
654L, 655L, 782L, 783L, 784L, 785L, 786L, 787L, 788L, 789L, 790L,
791L, 792L, 793L, 794L, 795L, 796L, 797L, 798L, 799L), class = "data.frame")
这是一个dplyr
解决方案。它不需要显式循环
library(dplyr)
data %>%
group_by(SHELTERID, INVERTERID) %>%
mutate(
First = lag(TAGMPPSEARCHCOUNT, 5),
MPPALERT = ifelse(
is.na(First),
0,
ifelse(
TAGMPPSEARCHCOUNT - First > 10,
1,
0
)
)
)
我正在尝试对数据框进行逐行操作(按组)。当我 运行 函数本身仅用于一组时,它 运行 非常好。但是,当我将 ddply 中的函数放入所有组的 运行 时,它会抛出一个错误 - 参数长度为零。
当运行单独放在一个数据框上时的函数'test':
for (i in 1:(nrow(test) - 5)) {
if (i <= 5) {
test[i, "MPPALERT"] <- 0
}
FIRSTMPP <- test[i, "TAGMPPSEARCHCOUNT"]
LASTMPP <- test[i+5, "TAGMPPSEARCHCOUNT"]
if ((LASTMPP - FIRSTMPP) >= 10) {
test[i+5, "MPPALERT"] <- 1
} else {
test[i+5, "MPPALERT"] <- 0
}
}
ddply 中的上述函数抛出错误:
Error in if (LASTMPP - FIRSTMPP >= 10) { : argument is of length zero
下面是ddply代码:
mpp_fn <- function(x) {
for (i in 1:(nrow(x) - 5)) {
if (i <= 5) {
x[i, "MPPALERT"] <- 0
}
FIRSTMPP <- x[i, "TAGMPPSEARCHCOUNT"]
LASTMPP <- x[i+5, "TAGMPPSEARCHCOUNT"]
if (LASTMPP - FIRSTMPP >= 10) {
x[i+5, "MPPALERT"] <- 1
} else {
x[i+5, "MPPALERT"] <- 0
}
}
}
result <- ddply(data, c("SHELTERID", "INVERTERID"), mpp_fn(x))
在上面的代码中,FIRSTMPP 和 LASTMPP 的值解析为 NULL,因此会出现错误,但为什么会发生这种情况(当它 运行 在 ddply 之外完全正常时)?
更新:这是 dput(data) 的输出:
structure(list(SHELTERID = c("SH02", "SH02", "SH02", "SH02",
"SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02",
"SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02",
"SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02",
"SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02", "SH02",
"SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03",
"SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03",
"SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03",
"SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03", "SH03",
"SH03", "SH03", "SH03", "SH03"), INVERTERID = c("I1", "I1", "I1",
"I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1",
"I1", "I1", "I1", "I1", "I2", "I2", "I2", "I2", "I2", "I2", "I2",
"I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2",
"I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1", "I1",
"I1", "I1", "I1", "I1", "I1", "I1", "I1", "I2", "I2", "I2", "I2",
"I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2", "I2",
"I2", "I2", "I2"), TAGMPPSEARCHCOUNT = c(0, 0, 0, 0, 0, 0, 0,
2, 0, 0, 3, 0, 0, 3, 0, 0, 3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 3, 0, 3, 0, 0, 4, 0, 0, 4, 0, 4, 0, 0, 5, 0, 0,
5, 0)), .Names = c("SHELTERID", "INVERTERID", "TAGMPPSEARCHCOUNT"
), row.names = c(350L, 351L, 352L, 353L, 354L, 355L, 356L, 357L,
358L, 359L, 360L, 361L, 362L, 363L, 364L, 365L, 366L, 367L, 494L,
495L, 496L, 497L, 498L, 499L, 500L, 501L, 502L, 503L, 504L, 505L,
506L, 507L, 508L, 509L, 510L, 511L, 638L, 639L, 640L, 641L, 642L,
643L, 644L, 645L, 646L, 647L, 648L, 649L, 650L, 651L, 652L, 653L,
654L, 655L, 782L, 783L, 784L, 785L, 786L, 787L, 788L, 789L, 790L,
791L, 792L, 793L, 794L, 795L, 796L, 797L, 798L, 799L), class = "data.frame")
这是一个dplyr
解决方案。它不需要显式循环
library(dplyr)
data %>%
group_by(SHELTERID, INVERTERID) %>%
mutate(
First = lag(TAGMPPSEARCHCOUNT, 5),
MPPALERT = ifelse(
is.na(First),
0,
ifelse(
TAGMPPSEARCHCOUNT - First > 10,
1,
0
)
)
)