简化for循环:读取多个文件并移除特定数据
Simplify for loop: read multiple files and remove specific data
我在一个包含多年降雨数据的文件夹中有 38 个 csv 文件,如下所示:
Precp_1980.csv
Precp_1981.csv
Precp_1982.csv
Precp_1983.csv
.
.
.
Precp_2017.csv
闰年文件如下所示:
# Example leap-year table: 11299 locations (rows) x 371 columns.
# Columns are laid out as described in the text: four location identifiers
# (ID1-ID4), then `year`, then 366 daily rainfall columns d_1 ... d_366.
Precp_1980 <- data.frame(
  ID1 = seq_len(11299),
  ID2 = seq(0.0, 1.1, length.out = 11299),
  ID3 = seq(1, 10, length.out = 11299),
  ID4 = seq(100, 200, length.out = 11299),
  year = 1980,
  # random daily rainfall in [0, 4]; column names come from the dimnames
  matrix(runif(366 * 11299, min = 0, max = 4), nrow = 11299, ncol = 366,
         dimnames = list(NULL, paste0("d_", 1:366)))
)
每个 csv 文件包含 11299 个位置(11299 行)。前四列是各位置特有的信息(ID1、ID2、ID3、ID4),第五列是 year,第六列到第 371 列是每日降雨数据(即第 1 天到第 366 天)。
非闰年文件完全相同,只是因为只有 365 天而少了一列:
# Example non-leap-year table: 11299 locations (rows) x 370 columns.
# Columns are laid out as described in the text: four location identifiers
# (ID1-ID4), then `year`, then 365 daily rainfall columns d_1 ... d_365.
Precp_1981 <- data.frame(
  ID1 = seq_len(11299),
  ID2 = seq(0.0, 1.1, length.out = 11299),
  ID3 = seq(1, 10, length.out = 11299),
  ID4 = seq(100, 200, length.out = 11299),
  year = 1981,
  # random daily rainfall in [0, 4]; column names come from the dimnames
  matrix(runif(365 * 11299, min = 0, max = 4), nrow = 11299, ncol = 365,
         dimnames = list(NULL, paste0("d_", 1:365)))
)
我的目标:
1) 从所有闰年文件中删除一年中的第 60 天(doy 60),使所有文件都只有 365 天。
2) 将单个文件从宽格式转换为长格式
3) 将所有文件合并为一个文件
我所做的是:
library(data.table)
library(dplyr)
library(reshape2)
# Read every yearly file, drop day 60 from leap years, reshape each file to
# long format, and stack all years into one table (`stack.rain`).
yr.list <- 1980:2017
leap.yr <- c(1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016) # vector of leap years
year.list <- vector("list", length(yr.list)) # preallocated list to save the outputs
id.cols <- c("ID1", "ID2", "ID3", "ID4", "year") # columns kept as identifiers when melting

for (y in seq_along(yr.list)) {
  yr <- yr.list[y]
  if (yr %in% leap.yr) { # if a year is a leap year
    # the file name has to be assembled with paste0() before fread() sees it
    dat <- fread(paste0("Precp_", yr, ".csv"))
    dat.up <- dat %>% dplyr::select(-d_60) # this removes day 60 from the leap year
    dat.up.m <- melt(dat.up, id.vars = id.cols,
                     value.name = "rain", variable.name = "day") # converts the data into long format
    dat.up.m <- dat.up.m %>%
      mutate(day = gsub("d_", "", day)) %>% # strips the "d_" prefix from the day label
      mutate(day = as.numeric(day)) %>%
      mutate(day = ifelse(day >= 61, day - 1, day)) # shifts days after 60 back by one so every year has 365 days
    year.list[[y]] <- dat.up.m
  } else { # if a year is not a leap year
    dat <- fread(paste0("Precp_", yr, ".csv"))
    # melt `dat` itself: nothing is dropped here, so there is no `dat.up`
    dat.up.m <- melt(dat, id.vars = id.cols,
                     value.name = "rain", variable.name = "day") # converts the data into long format
    dat.up.m <- dat.up.m %>%
      mutate(day = gsub("d_", "", day)) %>% # strips the "d_" prefix from the day label
      mutate(day = as.numeric(day))
    year.list[[y]] <- dat.up.m
  }
}

stack.rain <- rbindlist(year.list)
我正在寻找更短(也许更快?)的解决方案。脚本太长
下面的代码应该可以工作。我遍历这些文件并逐个读取,在读取过程中顺带处理闰年文件。
可能还有更简洁的方法,但这个比你的更简单。
# Year vectors used by the steps below
year.list <- vector("list", 0L) # empty list to save the outputs
yr.list <- seq(1980L, 2017L) # every year that has a file
leap.yr <- seq(from = 1980, to = 2016, by = 4) # vector of leap years
# Read one yearly file and normalise its day columns to "1" ... "365".
#
# x: a single year; the file "Precp_<x>.csv" is read from the working directory.
# Relies on the vector `leap.yr` from the calling environment to decide whether
# the year is a leap year.
# Returns the wide data.table with the "d_<n>" columns renamed to their day
# number; for leap years d_60 is dropped and later days are shifted back by one.
data.prep <- function(x) {
  # fread() takes a single path, so build the file name first
  yy <- fread(paste0("Precp_", x, ".csv"))
  is.leap <- x %in% leap.yr
  if (is.leap) {
    yy[, d_60 := NULL]
  }
  # parse day numbers from the day columns only; this avoids coercion warnings
  # on ID1..ID4/year and does not depend on the column order
  day.cols <- grep("^d_", names(yy), value = TRUE)
  days <- as.numeric(sub("^d_", "", day.cols))
  if (is.leap) {
    days[days > 60] <- days[days > 60] - 1
  }
  setnames(yy, day.cols, as.character(days))
  yy
}
# Read and clean every year, then stack the wide tables.
# Each file already carries its own `year` column, so no idcol is added
# (idcol = "year" would create a second, duplicate `year` column).
xx <- rbindlist(lapply(yr.list, data.prep), use.names = TRUE)
stack_rain <- melt(xx, id.vars = c("ID1", "ID2", "ID3", "ID4", "year"),
                   value.name = "rain", variable.name = "day") # converts the data into long format
# melt() returns the former column names as labels; turn them into numeric day of year
stack_rain$day <- as.numeric(as.character(stack_rain$day))
不要在循环内执行任何复杂的数据操作。
在循环中:只加载并 melt 所有数据(这不会多占用多少内存,因为之后要删除的数据只占约 1/365)。
然后在循环外:对得到的 data.table 对象进行筛选(删除闰年的第 60 天)并修改数据("day" 列)。
# Arguments
yearAll <- seq(1980L, 2017L) # every year that has a file
yearLp <- seq(from = 1980, to = 2016, by = 4) # leap years within that range
# Libraries
library(data.table)
library(foreach)
# Load data: read each yearly file, reshape it to long format, and row-bind
# the per-year results.
# The loop can be parallelized by swapping %do% for %dopar%.
result <- foreach(i = yearAll, .combine = rbind) %do% {
  yearly_wide <- fread(paste0("Precp_", i, ".csv"))
  melt(yearly_wide, id.vars = c("ID1", "ID2", "ID3", "ID4", "year"))
}
# Modify data
# 1. drop day 60 of every leap year
is_leap_day60 <- result$year %in% yearLp & result$variable == "d_60"
result <- result[!is_leap_day60]
# 2. numeric day of year parsed from the "d_<n>" label
result[, day := as.numeric(sub("d_", "", variable))]
# 3. in leap years, shift the days after the removed one back by one
result[year %in% yearLp & day >= 61, day := day - 1]
我在一个包含多年降雨数据的文件夹中有 38 个 csv 文件,如下所示:
Precp_1980.csv
Precp_1981.csv
Precp_1982.csv
Precp_1983.csv . . .
Precp_2017.csv
闰年文件如下所示:
# Example leap-year table: 11299 locations (rows) x 371 columns.
# Columns are laid out as described in the text: four location identifiers
# (ID1-ID4), then `year`, then 366 daily rainfall columns d_1 ... d_366.
Precp_1980 <- data.frame(
  ID1 = seq_len(11299),
  ID2 = seq(0.0, 1.1, length.out = 11299),
  ID3 = seq(1, 10, length.out = 11299),
  ID4 = seq(100, 200, length.out = 11299),
  year = 1980,
  # random daily rainfall in [0, 4]; column names come from the dimnames
  matrix(runif(366 * 11299, min = 0, max = 4), nrow = 11299, ncol = 366,
         dimnames = list(NULL, paste0("d_", 1:366)))
)
每个 csv 文件包含 11299 个位置(11299 行)。前四列是各位置特有的信息(ID1、ID2、ID3、ID4),第五列是 year,第六列到第 371 列是每日降雨数据(即第 1 天到第 366 天)。
非闰年文件完全相同,只是因为只有 365 天而少了一列:
# Example non-leap-year table: 11299 locations (rows) x 370 columns.
# Columns are laid out as described in the text: four location identifiers
# (ID1-ID4), then `year`, then 365 daily rainfall columns d_1 ... d_365.
Precp_1981 <- data.frame(
  ID1 = seq_len(11299),
  ID2 = seq(0.0, 1.1, length.out = 11299),
  ID3 = seq(1, 10, length.out = 11299),
  ID4 = seq(100, 200, length.out = 11299),
  year = 1981,
  # random daily rainfall in [0, 4]; column names come from the dimnames
  matrix(runif(365 * 11299, min = 0, max = 4), nrow = 11299, ncol = 365,
         dimnames = list(NULL, paste0("d_", 1:365)))
)
我的目标:
1) 从所有闰年文件中删除一年中的第 60 天(doy 60),使所有文件都只有 365 天。
2) 将单个文件从宽格式转换为长格式
3) 将所有文件合并为一个文件
我所做的是:
library(data.table)
library(dplyr)
library(reshape2)
# Read every yearly file, drop day 60 from leap years, reshape each file to
# long format, and stack all years into one table (`stack.rain`).
yr.list <- 1980:2017
leap.yr <- c(1980, 1984, 1988, 1992, 1996, 2000, 2004, 2008, 2012, 2016) # vector of leap years
year.list <- vector("list", length(yr.list)) # preallocated list to save the outputs
id.cols <- c("ID1", "ID2", "ID3", "ID4", "year") # columns kept as identifiers when melting

for (y in seq_along(yr.list)) {
  yr <- yr.list[y]
  if (yr %in% leap.yr) { # if a year is a leap year
    # the file name has to be assembled with paste0() before fread() sees it
    dat <- fread(paste0("Precp_", yr, ".csv"))
    dat.up <- dat %>% dplyr::select(-d_60) # this removes day 60 from the leap year
    dat.up.m <- melt(dat.up, id.vars = id.cols,
                     value.name = "rain", variable.name = "day") # converts the data into long format
    dat.up.m <- dat.up.m %>%
      mutate(day = gsub("d_", "", day)) %>% # strips the "d_" prefix from the day label
      mutate(day = as.numeric(day)) %>%
      mutate(day = ifelse(day >= 61, day - 1, day)) # shifts days after 60 back by one so every year has 365 days
    year.list[[y]] <- dat.up.m
  } else { # if a year is not a leap year
    dat <- fread(paste0("Precp_", yr, ".csv"))
    # melt `dat` itself: nothing is dropped here, so there is no `dat.up`
    dat.up.m <- melt(dat, id.vars = id.cols,
                     value.name = "rain", variable.name = "day") # converts the data into long format
    dat.up.m <- dat.up.m %>%
      mutate(day = gsub("d_", "", day)) %>% # strips the "d_" prefix from the day label
      mutate(day = as.numeric(day))
    year.list[[y]] <- dat.up.m
  }
}

stack.rain <- rbindlist(year.list)
我正在寻找更短(也许更快?)的解决方案。脚本太长
下面的代码应该可以工作。我遍历这些文件并逐个读取,在读取过程中顺带处理闰年文件。
可能还有更简洁的方法,但这个比你的更简单。
# Year vectors used by the steps below
year.list <- vector("list", 0L) # empty list to save the outputs
yr.list <- seq(1980L, 2017L) # every year that has a file
leap.yr <- seq(from = 1980, to = 2016, by = 4) # vector of leap years
# Read one yearly file and normalise its day columns to "1" ... "365".
#
# x: a single year; the file "Precp_<x>.csv" is read from the working directory.
# Relies on the vector `leap.yr` from the calling environment to decide whether
# the year is a leap year.
# Returns the wide data.table with the "d_<n>" columns renamed to their day
# number; for leap years d_60 is dropped and later days are shifted back by one.
data.prep <- function(x) {
  # fread() takes a single path, so build the file name first
  yy <- fread(paste0("Precp_", x, ".csv"))
  is.leap <- x %in% leap.yr
  if (is.leap) {
    yy[, d_60 := NULL]
  }
  # parse day numbers from the day columns only; this avoids coercion warnings
  # on ID1..ID4/year and does not depend on the column order
  day.cols <- grep("^d_", names(yy), value = TRUE)
  days <- as.numeric(sub("^d_", "", day.cols))
  if (is.leap) {
    days[days > 60] <- days[days > 60] - 1
  }
  setnames(yy, day.cols, as.character(days))
  yy
}
# Read and clean every year, then stack the wide tables.
# Each file already carries its own `year` column, so no idcol is added
# (idcol = "year" would create a second, duplicate `year` column).
xx <- rbindlist(lapply(yr.list, data.prep), use.names = TRUE)
stack_rain <- melt(xx, id.vars = c("ID1", "ID2", "ID3", "ID4", "year"),
                   value.name = "rain", variable.name = "day") # converts the data into long format
# melt() returns the former column names as labels; turn them into numeric day of year
stack_rain$day <- as.numeric(as.character(stack_rain$day))
不要在循环内执行任何复杂的数据操作。
在循环中:只加载并 melt 所有数据(这不会多占用多少内存,因为之后要删除的数据只占约 1/365)。
然后在循环外:对得到的 data.table 对象进行筛选(删除闰年的第 60 天)并修改数据("day" 列)。
# Arguments
yearAll <- seq(1980L, 2017L) # every year that has a file
yearLp <- seq(from = 1980, to = 2016, by = 4) # leap years within that range
# Libraries
library(data.table)
library(foreach)
# Load data: read each yearly file, reshape it to long format, and row-bind
# the per-year results.
# The loop can be parallelized by swapping %do% for %dopar%.
result <- foreach(i = yearAll, .combine = rbind) %do% {
  yearly_wide <- fread(paste0("Precp_", i, ".csv"))
  melt(yearly_wide, id.vars = c("ID1", "ID2", "ID3", "ID4", "year"))
}
# Modify data
# 1. drop day 60 of every leap year
is_leap_day60 <- result$year %in% yearLp & result$variable == "d_60"
result <- result[!is_leap_day60]
# 2. numeric day of year parsed from the "d_<n>" label
result[, day := as.numeric(sub("d_", "", variable))]
# 3. in leap years, shift the days after the removed one back by one
result[year %in% yearLp & day >= 61, day := day - 1]