加速 R 中的嵌套循环
Speeding up a nested loop in R
从我的用户名可以看出,我是一个完全的初学者。我已经搜索过这个问题的答案,但一直找不到一个与我的需求类似、可以直接运行的示例。这段代码目前可以运行,但看起来可能需要超过 64 小时才能跑完。
此代码用于帮助我确定在 5 年期间的任何一天有多少份有效合同。我现在的做法是:对 2015-01-01 到 2020-10-05 之间的每一天进行循环,再循环遍历 data.frame 的每一行,检查当前循环到的日期是否落在该合同的开始日期和结束日期之间。
# Six sample contracts. The *_date columns hold the dates as text; the *_N
# columns hold the same dates as whole-day counts (16876 is 2016-03-16
# counted from 1970-01-01). Row labels keep the "1:" ... "6:" form of the
# original listing.
big_data_new <- data.frame(
  end_date = c(
    "2017-03-16", "2015-07-31", "2016-08-02",
    "2017-07-18", "2016-10-28", "2017-08-01"
  ),
  start_date = c(
    "2016-03-16", "2015-07-08", "2016-07-06",
    "2017-05-31", "2016-01-07", "2017-06-29"
  ),
  start_date_N = c(16876L, 16624L, 16988L, 17317L, 16807L, 17346L),
  end_date_N = c(17241L, 16647L, 17015L, 17365L, 17102L, 17379L),
  row.names = c("1:", "2:", "3:", "4:", "5:", "6:")
)
library(data.table)
library(lubridate)
library(dplyr)
library(tidyverse)
# Count, for every day from 2015-01-01 to 2020-10-05, how many contracts are
# active on that day, and collect the (day, count) pairs in `df`.
#
# NOTE(review): `big_data_new2` is not defined anywhere in the visible source;
# this assignment overwrites whatever `big_data_new` held before.
big_data_new <- big_data_new2
# Parse both date columns with the order day / month name / 4-digit year.
# NOTE(review): the sample rows shown in this file are written
# year-month-day -- confirm the real input actually matches "d-b-Y".
big_data_new$start_date <- lubridate:: parse_date_time(big_data_new$start_date, orders = "d-b-Y")
big_data_new$end_date <- lubridate:: parse_date_time(big_data_new$end_date, orders = "d-b-Y")
# Keep only the two date columns; any other column of big_data_new
# (including start_date_N / end_date_N) is dropped here.
big_data_new = select(big_data_new, end_date, start_date)
# NOTE(review): in strptime-style formats %M is minutes (month is %m), and
# the columns are already date-times after parse_date_time() -- confirm this
# format string has the intended effect.
big_data_new$start_date <- as.Date(big_data_new$start_date, "%d-%M-%Y")
big_data_new$end_date <- as.Date(big_data_new$end_date, "%d-%M-%Y")
head(big_data_new)
# First and last day of the reporting window.
StartDate<-"2015-01-01"
EndDate<- "2020-10-05"
# One POSIXct value per day of the window.
dates<-seq(as.POSIXct(StartDate, format="%Y-%m-%d"), as.POSIXct(EndDate, format="%Y-%m-%d"), by='days')
# Result accumulator; starts empty and gets one row appended per day.
df = NULL
for (f in as.list(dates)){
# The day currently being counted, converted to Date.
Between22 <- as.Date(f, format="%Y-%m-%d")
# Inside a loop this value is not printed, so the call has no visible effect.
class(Between22)
count=0
# NOTE(review): this loop reads `big_data`, not the `big_data_new` prepared
# above; `big_data` is not defined in the visible source. Also, 1:nrow(x)
# iterates over c(1, 0) when x has zero rows.
for(i in 1:nrow(big_data)){
# Turn the day-count columns into Dates (origin 1970-01-01); this is redone
# for every row on every day.
date_start <- as.Date(big_data$start_date_N[i], origin="1970-01-01")
date_end <- as.Date(big_data$end_date_N[i], origin="1970-01-01")
# Count the contract when the current day lies within its start/end range.
if (between(Between22, date_start, date_end) == TRUE){
count=count +1
}
}
# Append this day's row; the accumulator is rebuilt via rbind() on every
# iteration, so it grows one row at a time.
df = rbind(df, data.frame(Between22, count))
}
library(tibble)
library(dplyr)
library(lubridate)
# Sample contracts: start/end dates as text plus the same dates as day
# numbers (the *_N columns); only the *_N columns are used for counting.
big_data_new <- tibble::tribble(
  ~end_date,    ~start_date,  ~start_date_N, ~end_date_N,
  "2017-03-16", "2016-03-16", 16876,         17241,
  "2015-07-31", "2015-07-08", 16624,         16647,
  "2016-08-02", "2016-07-06", 16988,         17015,
  "2017-07-18", "2017-05-31", 17317,         17365,
  "2016-10-28", "2016-01-07", 16807,         17102,
  "2017-08-01", "2017-06-29", 17346,         17379
)

# The earliest contract start and the latest contract end bound the days
# that have to be examined.
date_range_min <- min(big_data_new$start_date_N)
date_range_max <- max(big_data_new$end_date_N)

# Every day number inside that span.
date_vector <- date_range_min:date_range_max

# Allocate the full-length result once, so it is never grown inside the loop.
result_vector <- integer(length(date_vector))

for (d in seq_along(date_vector)) {
  # A contract is active on a day when the day is on or after its start and
  # on or before its end. The comparison runs over all rows at once and
  # sum() counts the TRUE entries.
  result_vector[d] <- sum(
    date_vector[d] >= big_data_new$start_date_N &
      date_vector[d] <= big_data_new$end_date_N
  )
}

result_data_frame <- tibble::tibble(
  date = date_vector,
  number_of_contracts = result_vector
)

# Display the counts with the day numbers shown as calendar dates; the
# converted table is not assigned, so result_data_frame keeps numeric days.
result_data_frame %>%
  dplyr::mutate(date = lubridate::as_date(date))
一个问题是您逐行循环遍历数据框,这在这种情况下是不必要的。另一个方面是,应当预先创建一个足够大、能够容纳所有结果的结果对象。如果您的数据集(非常)大,而您又在循环中不断"增长"(growing)结果对象,速度就会明显下降。我强烈推荐《Hands-On Programming with R》一书,这两点书中都有说明。顺便说一句:如果不更改变量名,我无法正确执行您的代码。此外,我尝试运行时,df 数据框中的结果只有 count = 0。
编辑:当我设法运行您的代码时,耗时(不含加载包的时间)为:
Time difference of 17.04095 secs
而运行我的解决方案时,耗时(同样不含加载包的时间,且使用与您相同的日期范围)为:
Time difference of 0.916553 secs
希望这能带来您所需要的速度提升。如果确实如此,我很高兴能帮上忙!
从我的用户名可以看出,我是一个完全的初学者。我已经搜索过这个问题的答案,但一直找不到一个与我的需求类似、可以直接运行的示例。这段代码目前可以运行,但看起来可能需要超过 64 小时才能跑完。
此代码用于帮助我确定在 5 年期间的任何一天有多少份有效合同。我现在的做法是:对 2015-01-01 到 2020-10-05 之间的每一天进行循环,再循环遍历 data.frame 的每一行,检查当前循环到的日期是否落在该合同的开始日期和结束日期之间。
# Six sample contracts. The *_date columns hold the dates as text; the *_N
# columns hold the same dates as whole-day counts (16876 is 2016-03-16
# counted from 1970-01-01). Row labels keep the "1:" ... "6:" form of the
# original listing.
big_data_new <- data.frame(
  end_date = c(
    "2017-03-16", "2015-07-31", "2016-08-02",
    "2017-07-18", "2016-10-28", "2017-08-01"
  ),
  start_date = c(
    "2016-03-16", "2015-07-08", "2016-07-06",
    "2017-05-31", "2016-01-07", "2017-06-29"
  ),
  start_date_N = c(16876L, 16624L, 16988L, 17317L, 16807L, 17346L),
  end_date_N = c(17241L, 16647L, 17015L, 17365L, 17102L, 17379L),
  row.names = c("1:", "2:", "3:", "4:", "5:", "6:")
)
library(data.table)
library(lubridate)
library(dplyr)
library(tidyverse)
# Count, for every day from 2015-01-01 to 2020-10-05, how many contracts are
# active on that day, and collect the (day, count) pairs in `df`.
#
# NOTE(review): `big_data_new2` is not defined anywhere in the visible source;
# this assignment overwrites whatever `big_data_new` held before.
big_data_new <- big_data_new2
# Parse both date columns with the order day / month name / 4-digit year.
# NOTE(review): the sample rows shown in this file are written
# year-month-day -- confirm the real input actually matches "d-b-Y".
big_data_new$start_date <- lubridate:: parse_date_time(big_data_new$start_date, orders = "d-b-Y")
big_data_new$end_date <- lubridate:: parse_date_time(big_data_new$end_date, orders = "d-b-Y")
# Keep only the two date columns; any other column of big_data_new
# (including start_date_N / end_date_N) is dropped here.
big_data_new = select(big_data_new, end_date, start_date)
# NOTE(review): in strptime-style formats %M is minutes (month is %m), and
# the columns are already date-times after parse_date_time() -- confirm this
# format string has the intended effect.
big_data_new$start_date <- as.Date(big_data_new$start_date, "%d-%M-%Y")
big_data_new$end_date <- as.Date(big_data_new$end_date, "%d-%M-%Y")
head(big_data_new)
# First and last day of the reporting window.
StartDate<-"2015-01-01"
EndDate<- "2020-10-05"
# One POSIXct value per day of the window.
dates<-seq(as.POSIXct(StartDate, format="%Y-%m-%d"), as.POSIXct(EndDate, format="%Y-%m-%d"), by='days')
# Result accumulator; starts empty and gets one row appended per day.
df = NULL
for (f in as.list(dates)){
# The day currently being counted, converted to Date.
Between22 <- as.Date(f, format="%Y-%m-%d")
# Inside a loop this value is not printed, so the call has no visible effect.
class(Between22)
count=0
# NOTE(review): this loop reads `big_data`, not the `big_data_new` prepared
# above; `big_data` is not defined in the visible source. Also, 1:nrow(x)
# iterates over c(1, 0) when x has zero rows.
for(i in 1:nrow(big_data)){
# Turn the day-count columns into Dates (origin 1970-01-01); this is redone
# for every row on every day.
date_start <- as.Date(big_data$start_date_N[i], origin="1970-01-01")
date_end <- as.Date(big_data$end_date_N[i], origin="1970-01-01")
# Count the contract when the current day lies within its start/end range.
if (between(Between22, date_start, date_end) == TRUE){
count=count +1
}
}
# Append this day's row; the accumulator is rebuilt via rbind() on every
# iteration, so it grows one row at a time.
df = rbind(df, data.frame(Between22, count))
}
library(tibble)
library(dplyr)
library(lubridate)
# Sample contracts: start/end dates as text plus the same dates as day
# numbers (the *_N columns); only the *_N columns are used for counting.
big_data_new <- tibble::tribble(
  ~end_date,    ~start_date,  ~start_date_N, ~end_date_N,
  "2017-03-16", "2016-03-16", 16876,         17241,
  "2015-07-31", "2015-07-08", 16624,         16647,
  "2016-08-02", "2016-07-06", 16988,         17015,
  "2017-07-18", "2017-05-31", 17317,         17365,
  "2016-10-28", "2016-01-07", 16807,         17102,
  "2017-08-01", "2017-06-29", 17346,         17379
)

# The earliest contract start and the latest contract end bound the days
# that have to be examined.
date_range_min <- min(big_data_new$start_date_N)
date_range_max <- max(big_data_new$end_date_N)

# Every day number inside that span.
date_vector <- date_range_min:date_range_max

# Allocate the full-length result once, so it is never grown inside the loop.
result_vector <- integer(length(date_vector))

for (d in seq_along(date_vector)) {
  # A contract is active on a day when the day is on or after its start and
  # on or before its end. The comparison runs over all rows at once and
  # sum() counts the TRUE entries.
  result_vector[d] <- sum(
    date_vector[d] >= big_data_new$start_date_N &
      date_vector[d] <= big_data_new$end_date_N
  )
}

result_data_frame <- tibble::tibble(
  date = date_vector,
  number_of_contracts = result_vector
)

# Display the counts with the day numbers shown as calendar dates; the
# converted table is not assigned, so result_data_frame keeps numeric days.
result_data_frame %>%
  dplyr::mutate(date = lubridate::as_date(date))
一个问题是您逐行循环遍历数据框,这在这种情况下是不必要的。另一个方面是,应当预先创建一个足够大、能够容纳所有结果的结果对象。如果您的数据集(非常)大,而您又在循环中不断"增长"(growing)结果对象,速度就会明显下降。我强烈推荐《Hands-On Programming with R》一书,这两点书中都有说明。顺便说一句:如果不更改变量名,我无法正确执行您的代码。此外,我尝试运行时,df 数据框中的结果只有 count = 0。
编辑:当我设法运行您的代码时,耗时(不含加载包的时间)为:
Time difference of 17.04095 secs
而运行我的解决方案时,耗时(同样不含加载包的时间,且使用与您相同的日期范围)为:
Time difference of 0.916553 secs
希望这能带来您所需要的速度提升。如果确实如此,我很高兴能帮上忙!