用 R 中的 apply 函数替换 for 循环
Replacing for loop with apply function in R
我正在尝试计算主队和客队的平均进球数,但是 "todays" 比赛。
数据可以在这里找到:http://www.football-data.co.uk/mmz4281/1415/E0.csv
我的代码
pl <- pl[,2:6]
pl$Date <- as.Date(pl$Date, "%d/%m/%y")
pl$HomeTeam <- as.character(pl$HomeTeam)
pl$AwayTeam <- as.character(pl$AwayTeam)
pl.func <- function(tf){
tf$avg.ht <- rep(NA,nrow(tf))
tf$avg.at <- rep(NA,nrow(tf))
for(i in 1:nrow(tf)){
tf$avg.ht[i] <- (sum(tf$FTHG[tf$HomeTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i]]) + sum(tf$FTAG[tf$AwayTeam == tf$HomeTeam[i] & tf$Date <tf$Date[i]])) / sum(tf$HomeTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i] | tf$AwayTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i])
tf$avg.at[i] <- (sum(tf$FTHG[tf$HomeTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i]]) + sum(tf$FTAG[tf$AwayTeam == tf$AwayTeam[i] & tf$Date <tf$Date[i]])) / sum(tf$HomeTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i] | tf$AwayTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i])
}
return(tf)
}
pl <- pl.func(pl)
我需要 "match" 加入团队,而且日期要早一些。上面的代码有效,但速度很慢,因为我要计算数百次计算。任何人都可以提示或展示我如何使用某种应用功能来做到这一点吗?我无法成功,因为我不知道以正确的方式替换循环中的 [i] 参数。
您真正需要的是 运行 条件平均值。最近,我回答了一个类似的问题 ,其中 OP 需要 运行 每组每 15 分钟的平均值,而你需要 运行 团队过去每场比赛的平均值。
因此请考虑以下 sapply()
方法,该方法使用示例数据和 运行 您的代码,returns 等效输出。性能可能会更符合您的需求:
pl$runavgHT <- sapply(1:nrow(pl),
function(i) {
(sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("HomeTeam")] == pl$HomeTeam[i]))
* pl[1:i,]$FTHG) +
sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("AwayTeam")] == pl$HomeTeam[i]))
* pl[1:i,]$FTAG)) /
sum(((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("HomeTeam")] == pl$HomeTeam[i]))
|((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("AwayTeam")] == pl$HomeTeam[i])))
}
)
pl$runavgAT <- sapply(1:nrow(pl),
function(i) {
(sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("HomeTeam")] == pl$AwayTeam[i]))
* pl[1:i,]$FTHG) +
sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("AwayTeam")] == pl$AwayTeam[i]))
* pl[1:i,]$FTAG)) /
sum(((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("HomeTeam")] == pl$AwayTeam[i]))
|((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("AwayTeam")] == pl$AwayTeam[i])))
}
)
以下是一些可能的改进(和最终基准):
1) 这是您的函数的修改版本,仅对循环进行了一些改进:
pl.func2 <- function(DF){
DF$avg.ht <- rep(NA,nrow(DF))
DF$avg.at <- rep(NA,nrow(DF))
for(i in 1:nrow(DF)){
currDate <- DF$Date[i]
currHT <- DF$HomeTeam[i]
currAT <- DF$AwayTeam[i]
prevHT.eq.HT <- which(DF$HomeTeam == currHT & DF$Date < currDate)
prevHT.eq.AT <- which(DF$HomeTeam == currAT & DF$Date < currDate)
prevAT.eq.HT <- which(DF$AwayTeam == currHT & DF$Date < currDate)
prevAT.eq.AT <- which(DF$AwayTeam == currAT & DF$Date < currDate)
DF$avg.ht[i] <- (sum(DF$FTHG[prevHT.eq.HT]) + sum(tf$FTAG[prevAT.eq.HT])) / (length(prevHT.eq.HT) + length(prevAT.eq.HT))
DF$avg.at[i] <- (sum(DF$FTHG[prevHT.eq.AT]) + sum(tf$FTAG[prevAT.eq.AT])) / (length(prevHT.eq.AT) + length(prevAT.eq.AT))
}
return(DF)
}
2) 这是您的函数的另一个修改版本,它使用累积信息来避免对所有前几天进行子集化和求和(N.B。这需要 data.frame 按日期排序):
pl.func3 <- function(DF){
DF$avg.ht <- rep(NA,nrow(DF))
DF$avg.at <- rep(NA,nrow(DF))
teams <- unique(c(DF$HomeTeam,DF$AwayTeam))
cumul.info <- t(sapply(teams,FUN=function(team) c(cumulFTG=0,cumulMatches=0)))
# store column indexes to reuse them
cumulFTG <- 1
cumulMatches <- 2
for(i in 1:nrow(DF)){
currHT <- DF$HomeTeam[i]
currAT <- DF$AwayTeam[i]
DF$avg.ht[i] <- cumul.info[currHT,cumulFTG] / cumul.info[currHT,cumulMatches]
DF$avg.at[i] <- cumul.info[currAT,cumulFTG] / cumul.info[currAT,cumulMatches]
cumul.info[currHT,cumulFTG] = cumul.info[currHT,cumulFTG] + DF$FTHG[i]
cumul.info[currHT,cumulMatches] = cumul.info[currHT,cumulMatches] + 1
cumul.info[currAT,cumulFTG] = cumul.info[currAT,cumulFTG] + DF$FTAG[i]
cumul.info[currAT,cumulMatches] = cumul.info[currAT,cumulMatches] + 1
}
return(DF)
}
检查和基准测试:
# this is necessary for pl.func3
pl <- pl[order(pl$Date),]
# are the results identical ? -> TRUE
identical(pl.func(pl),pl.func2(pl)) && identical(pl.func(pl),pl.func3(pl))
# benchmark
library(microbenchmark)
microbenchmark(pl.func(pl),pl.func2(pl),pl.func3(pl))
Unit: milliseconds
expr min lq mean median uq max neval cld
pl.func(pl) 184.36644 186.10643 188.38130 187.16322 188.80065 255.2101 100 c
pl.func2(pl) 84.95047 85.80966 89.27945 87.41589 88.33845 159.6284 100 b
pl.func3(pl) 30.72683 31.05515 32.02944 31.41211 33.22858 35.8644 100 a
我正在尝试计算主队和客队的平均进球数,但是 "todays" 比赛。
数据可以在这里找到:http://www.football-data.co.uk/mmz4281/1415/E0.csv
我的代码
pl <- pl[,2:6]
pl$Date <- as.Date(pl$Date, "%d/%m/%y")
pl$HomeTeam <- as.character(pl$HomeTeam)
pl$AwayTeam <- as.character(pl$AwayTeam)
pl.func <- function(tf){
tf$avg.ht <- rep(NA,nrow(tf))
tf$avg.at <- rep(NA,nrow(tf))
for(i in 1:nrow(tf)){
tf$avg.ht[i] <- (sum(tf$FTHG[tf$HomeTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i]]) + sum(tf$FTAG[tf$AwayTeam == tf$HomeTeam[i] & tf$Date <tf$Date[i]])) / sum(tf$HomeTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i] | tf$AwayTeam == tf$HomeTeam[i] & tf$Date < tf$Date[i])
tf$avg.at[i] <- (sum(tf$FTHG[tf$HomeTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i]]) + sum(tf$FTAG[tf$AwayTeam == tf$AwayTeam[i] & tf$Date <tf$Date[i]])) / sum(tf$HomeTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i] | tf$AwayTeam == tf$AwayTeam[i] & tf$Date < tf$Date[i])
}
return(tf)
}
pl <- pl.func(pl)
我需要 "match" 加入团队,而且日期要早一些。上面的代码有效,但速度很慢,因为我要计算数百次计算。任何人都可以提示或展示我如何使用某种应用功能来做到这一点吗?我无法成功,因为我不知道以正确的方式替换循环中的 [i] 参数。
您真正需要的是 运行 条件平均值。最近,我回答了一个类似的问题
因此请考虑以下 sapply()
方法,该方法使用示例数据和 运行 您的代码,returns 等效输出。性能可能会更符合您的需求:
pl$runavgHT <- sapply(1:nrow(pl),
function(i) {
(sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("HomeTeam")] == pl$HomeTeam[i]))
* pl[1:i,]$FTHG) +
sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("AwayTeam")] == pl$HomeTeam[i]))
* pl[1:i,]$FTAG)) /
sum(((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("HomeTeam")] == pl$HomeTeam[i]))
|((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("AwayTeam")] == pl$HomeTeam[i])))
}
)
pl$runavgAT <- sapply(1:nrow(pl),
function(i) {
(sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("HomeTeam")] == pl$AwayTeam[i]))
* pl[1:i,]$FTHG) +
sum(((pl[1:i, c("Date")] < (pl$Date[i]))
& (pl[1:i, c("AwayTeam")] == pl$AwayTeam[i]))
* pl[1:i,]$FTAG)) /
sum(((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("HomeTeam")] == pl$AwayTeam[i]))
|((pl[1:i, c("Date")] < (pl$Date[i])) &
(pl[1:i, c("AwayTeam")] == pl$AwayTeam[i])))
}
)
以下是一些可能的改进(和最终基准):
1) 这是您的函数的修改版本,仅对循环进行了一些改进:
pl.func2 <- function(DF){
DF$avg.ht <- rep(NA,nrow(DF))
DF$avg.at <- rep(NA,nrow(DF))
for(i in 1:nrow(DF)){
currDate <- DF$Date[i]
currHT <- DF$HomeTeam[i]
currAT <- DF$AwayTeam[i]
prevHT.eq.HT <- which(DF$HomeTeam == currHT & DF$Date < currDate)
prevHT.eq.AT <- which(DF$HomeTeam == currAT & DF$Date < currDate)
prevAT.eq.HT <- which(DF$AwayTeam == currHT & DF$Date < currDate)
prevAT.eq.AT <- which(DF$AwayTeam == currAT & DF$Date < currDate)
DF$avg.ht[i] <- (sum(DF$FTHG[prevHT.eq.HT]) + sum(tf$FTAG[prevAT.eq.HT])) / (length(prevHT.eq.HT) + length(prevAT.eq.HT))
DF$avg.at[i] <- (sum(DF$FTHG[prevHT.eq.AT]) + sum(tf$FTAG[prevAT.eq.AT])) / (length(prevHT.eq.AT) + length(prevAT.eq.AT))
}
return(DF)
}
2) 这是您的函数的另一个修改版本,它使用累积信息来避免对所有前几天进行子集化和求和(N.B。这需要 data.frame 按日期排序):
pl.func3 <- function(DF){
DF$avg.ht <- rep(NA,nrow(DF))
DF$avg.at <- rep(NA,nrow(DF))
teams <- unique(c(DF$HomeTeam,DF$AwayTeam))
cumul.info <- t(sapply(teams,FUN=function(team) c(cumulFTG=0,cumulMatches=0)))
# store column indexes to reuse them
cumulFTG <- 1
cumulMatches <- 2
for(i in 1:nrow(DF)){
currHT <- DF$HomeTeam[i]
currAT <- DF$AwayTeam[i]
DF$avg.ht[i] <- cumul.info[currHT,cumulFTG] / cumul.info[currHT,cumulMatches]
DF$avg.at[i] <- cumul.info[currAT,cumulFTG] / cumul.info[currAT,cumulMatches]
cumul.info[currHT,cumulFTG] = cumul.info[currHT,cumulFTG] + DF$FTHG[i]
cumul.info[currHT,cumulMatches] = cumul.info[currHT,cumulMatches] + 1
cumul.info[currAT,cumulFTG] = cumul.info[currAT,cumulFTG] + DF$FTAG[i]
cumul.info[currAT,cumulMatches] = cumul.info[currAT,cumulMatches] + 1
}
return(DF)
}
检查和基准测试:
# this is necessary for pl.func3
pl <- pl[order(pl$Date),]
# are the results identical ? -> TRUE
identical(pl.func(pl),pl.func2(pl)) && identical(pl.func(pl),pl.func3(pl))
# benchmark
library(microbenchmark)
microbenchmark(pl.func(pl),pl.func2(pl),pl.func3(pl))
Unit: milliseconds
expr min lq mean median uq max neval cld
pl.func(pl) 184.36644 186.10643 188.38130 187.16322 188.80065 255.2101 100 c
pl.func2(pl) 84.95047 85.80966 89.27945 87.41589 88.33845 159.6284 100 b
pl.func3(pl) 30.72683 31.05515 32.02944 31.41211 33.22858 35.8644 100 a