加速 R 中的嵌套循环

Speeding up a nested loop in R

从我的用户名可以看出,我是一个完全的初学者。我已经搜索了这个问题的答案,但是我正在努力寻找一个可以执行与我需要的类似的工作示例。此代码目前有效,但看起来 运行.

可能需要超过 64 小时

此代码用于帮助我确定在 5 年期间的任何一天有多少活跃合同。我现在的流程是在2015-01-01到2020-10-05之间每天循环,然后循环遍历data.frame的每一行,看看合同的起止日期是否在日期内正在循环。

big_data_new <- read.table(header=TRUE, text="
     end_date start_date start_date_N end_date_N
1: 2017-03-16 2016-03-16        16876      17241
2: 2015-07-31 2015-07-08        16624      16647
3: 2016-08-02 2016-07-06        16988      17015
4: 2017-07-18 2017-05-31        17317      17365
5: 2016-10-28 2016-01-07        16807      17102
6: 2017-08-01 2017-06-29        17346      17379")

library(data.table)
library(lubridate)
library(dplyr)
library(tidyverse)

big_data_new <- big_data_new2

big_data_new$start_date <- lubridate:: parse_date_time(big_data_new$start_date, orders =  "d-b-Y")
big_data_new$end_date <- lubridate:: parse_date_time(big_data_new$end_date, orders =  "d-b-Y")



big_data_new = select(big_data_new, end_date, start_date)
big_data_new$start_date <- as.Date(big_data_new$start_date, "%d-%M-%Y")
big_data_new$end_date <- as.Date(big_data_new$end_date, "%d-%M-%Y")
head(big_data_new)


StartDate<-"2015-01-01"
EndDate<- "2020-10-05"
dates<-seq(as.POSIXct(StartDate, format="%Y-%m-%d"), as.POSIXct(EndDate, format="%Y-%m-%d"), by='days')
df = NULL



for (f in as.list(dates)){
 Between22 <- as.Date(f, format="%Y-%m-%d")
 class(Between22)
 
 count=0
 for(i in 1:nrow(big_data)){
   date_start <- as.Date(big_data$start_date_N[i], origin="1970-01-01")
   date_end <- as.Date(big_data$end_date_N[i], origin="1970-01-01")
   
   if (between(Between22, date_start, date_end) == TRUE){ 
     count=count +1
   }
   
 }
 
 df = rbind(df, data.frame(Between22, count))
}
library(tibble)
library(dplyr)
library(lubridate)


big_data_new <- tibble::tribble(
     ~end_date, ~start_date, ~start_date_N, ~end_date_N,
"2017-03-16", "2016-03-16",        16876, 17241,
"2015-07-31", "2015-07-08",       16624, 16647,
"2016-08-02", "2016-07-06",        16988, 17015,
"2017-07-18", "2017-05-31",        17317, 17365,
"2016-10-28", "2016-01-07",       16807, 17102,
"2017-08-01", "2017-06-29",       17346, 17379)

## what are min starting and max ending date
date_range_min <- min(big_data_new$start_date_N)
date_range_max <- max(big_data_new$end_date_N)


## this definese the range of dates we are going through
date_vector <-   date_range_min:date_range_max
  
## create storage object which is large enough to contain all results !!
result_vector <- vector(mode = "integer", length = length(date_vector))


for (d  in seq_along(date_vector)) {
  
  ## logical tests: is date between any starting or ending date? 
  ## if so, each row  for which this is true will return TRUE 
  ## sum(TRUE) equals 1, 0 otherwise
  
  result_vector[[d]]   <- 
    sum(big_data_new$start_date_N <= date_vector[[d]] & big_data_new$end_date_N >= date_vector[[d]])
  
}

result_data_frame <- tibble::tibble(date = date_vector, number_of_contracts = result_vector)
result_data_frame %>% dplyr::mutate(date = lubridate::as_date(date))

一个问题是您循环遍历数据框的行,这在这种情况下是不必要的。另一个方面是您创建了一个足够大以包含所有结果的结果对象。如果你有一个(非常)大的数据集并且你继续使用 'growing' 你的结果对象,这将导致速度降低。我强烈推荐 Hands-on Programming in R 两点都说明了。顺便说一句:如果不更改变量名,我将无法正确执行您的方法。此外,df 数据框在我尝试时只有 count = 0 作为结果。

编辑:当我设法执行你的代码时,它花了

Time difference of 17.04095 secs

执行(不加载包)。

当我执行我的解决方案时,它花费了:

Time difference of 0.916553 secs

不加载包(与您的时间范围相同)。

希望这会提供您要求的速度提升。如果是这样,我很乐意提供帮助!