R - 转置和计算未知数量的日期列之间的差异。一种更具可读性的方式可能吗?
R - transpose and calculate differences between an unknown amount of date columns. A more readable way possible?
我在一个主要的周期序列中拟合了未知数量的多个日期周期。有没有比我的方式更具可读性或更紧凑的方式?我在这里混合了 sqldf 和 data.table,因为我在项目中使用了这两个库,但如果需要,可以在纯 data.table 中进行。或相反亦然。其他开发好的库也可以。
没有硬编码,我 运行 这个有几个选项,但不知道句点内的最大句点数。周期内的数量没有理论上的上限(可能是 365),但基本的健全性检查应该将最大值设置在 6-15 个周期左右,具体取决于规范。
这是我使用 atm 的工作代码的表示(在原始脚本中写为函数)。
我相信应该有一个 shorter/more 可读的函数。我们可以在计算差异并应用求和函数的 dcast 部分中做些什么吗?我试过了,但 fun.aggregate 似乎仅限于更简单的操作。
library("data.table")
library("sqldf")
Data <- data.table(
Fnr = c(22516, 22516, 22516, 45459, 45459),
Vernr = c(1,1,1,1,2),
Startdat = c("2010-01-01", "2010-01-01", "2012-01-01",
"2013-04-01", "2013-04-01"),
Endat = c("2010-12-31", "2010-12-31", "2012-05-19",
"2014-03-31", "2014-03-31"),
Fromdat = c("2010-02-21", "2010-08-16", NA, "2013-08-31", "2014-01-02"),
Tomdat = c("2010-05-16", "2010-09-11", NA, "2013-10-27", "2014-02-13")
)
tmp.eval <- "list(Fnr, Vernr, Startdat, Endat)"
dt_tmp <- Data[, nobs_id := order(Fromdat),
by = eval(parse(text = tmp.eval))]
dt_tmp <- dt_tmp[, c("Fromdat", "Tomdat") := list(as.Date(Fromdat),
as.Date(Tomdat))]
dt_tmp <- dcast(dt_tmp, Fnr + Vernr + ... ~ nobs_id,
value.var = c("Fromdat", "Tomdat"))
dt.colnames <- data.table(colnames(dt_tmp))
dt.col1 <- dt.colnames[substr(V1,1,3) == "Fro"][, c("nobs_id", "Fromdat") :=
list(order(V1),V1)]
dt.col2 <- dt.colnames[substr(V1,1,3) == "Tom"][, c("nobs_id", "Tomdat") :=
list(order(V1),V1)]
dt.set <- merge(dt.col1[,V1 := NULL], dt.col2[,V1 := NULL],
by = "nobs_id")
dt.set <- dt.set[, diff_col := paste(Tomdat,Fromdat, sep = "-")]
dt.set <- dt.set[, diff_col := paste(diff_col, " diff_",nobs_id, sep = "")]
dt.set <- dt.set[, diffvar_col := paste("as.numeric(diff_",nobs_id,")",
sep = "")]
str.diff <- paste(dt.set$diff_col, collapse = ",")
str.diffvar <- paste(dt.set$diffvar_col, collapse = ",")
str.diffvar <- paste("sum(", str.diffvar, ", na.rm = TRUE)")
dt_tmp <- sqldf(sprintf("SELECT *, %s FROM dt_tmp",
str.diff)
)
dt_tmp <- setDT(dt_tmp)[Startdat <= Endat,
corr_days := eval(parse (text = str.diffvar)),
by = list(Fnr, Vernr, Startdat, Endat)]
我不是 100% 确定您要做什么,但这里有一种方法可以获得与 dt_tmp
相同的最终输出。假设我正在解释您要正确执行的操作,那么诀窍就是先进行计算,然后将其融化,然后再进行投射。
Data[, nobs_id := order(Fromdat),
by = list(Fnr, Vernr, Startdat, Endat)]
Data[,Tomdat:=as.IDate(Tomdat)]
Data[,Fromdat:=as.IDate(Fromdat)]
Data[,diff:=as.integer(Tomdat-Fromdat)]
Data[,corr_days:=sum(diff),by= list(Fnr, Vernr, Startdat, Endat)]
mytemp<-dcast(melt(Data,id.vars=c('Fnr','Vernr','Startdat','Endat','nobs_id','corr_days')),Fnr+Vernr+Startdat+Endat+corr_days~variable+nobs_id,value.var='value')
#if you want them in the same order as your dt_tmp
setcolorder(mytemp, c(names(mytemp)[!names(mytemp)%in%'corr_days'],'corr_days'))
#if you want the date columns to be type character again
datecols<-names(mytemp)[grepl('dat',names(mytemp))]
mytemp[,(datecols):=lapply(.SD,as.character),.SDcols=datecols]
我在一个主要的周期序列中拟合了未知数量的多个日期周期。有没有比我的方式更具可读性或更紧凑的方式?我在这里混合了 sqldf 和 data.table,因为我在项目中使用了这两个库,但如果需要,可以在纯 data.table 中进行。或相反亦然。其他开发好的库也可以。
没有硬编码,我 运行 这个有几个选项,但不知道句点内的最大句点数。周期内的数量没有理论上的上限(可能是 365),但基本的健全性检查应该将最大值设置在 6-15 个周期左右,具体取决于规范。
这是我使用 atm 的工作代码的表示(在原始脚本中写为函数)。
我相信应该有一个 shorter/more 可读的函数。我们可以在计算差异并应用求和函数的 dcast 部分中做些什么吗?我试过了,但 fun.aggregate 似乎仅限于更简单的操作。
library("data.table")
library("sqldf")
Data <- data.table(
Fnr = c(22516, 22516, 22516, 45459, 45459),
Vernr = c(1,1,1,1,2),
Startdat = c("2010-01-01", "2010-01-01", "2012-01-01",
"2013-04-01", "2013-04-01"),
Endat = c("2010-12-31", "2010-12-31", "2012-05-19",
"2014-03-31", "2014-03-31"),
Fromdat = c("2010-02-21", "2010-08-16", NA, "2013-08-31", "2014-01-02"),
Tomdat = c("2010-05-16", "2010-09-11", NA, "2013-10-27", "2014-02-13")
)
tmp.eval <- "list(Fnr, Vernr, Startdat, Endat)"
dt_tmp <- Data[, nobs_id := order(Fromdat),
by = eval(parse(text = tmp.eval))]
dt_tmp <- dt_tmp[, c("Fromdat", "Tomdat") := list(as.Date(Fromdat),
as.Date(Tomdat))]
dt_tmp <- dcast(dt_tmp, Fnr + Vernr + ... ~ nobs_id,
value.var = c("Fromdat", "Tomdat"))
dt.colnames <- data.table(colnames(dt_tmp))
dt.col1 <- dt.colnames[substr(V1,1,3) == "Fro"][, c("nobs_id", "Fromdat") :=
list(order(V1),V1)]
dt.col2 <- dt.colnames[substr(V1,1,3) == "Tom"][, c("nobs_id", "Tomdat") :=
list(order(V1),V1)]
dt.set <- merge(dt.col1[,V1 := NULL], dt.col2[,V1 := NULL],
by = "nobs_id")
dt.set <- dt.set[, diff_col := paste(Tomdat,Fromdat, sep = "-")]
dt.set <- dt.set[, diff_col := paste(diff_col, " diff_",nobs_id, sep = "")]
dt.set <- dt.set[, diffvar_col := paste("as.numeric(diff_",nobs_id,")",
sep = "")]
str.diff <- paste(dt.set$diff_col, collapse = ",")
str.diffvar <- paste(dt.set$diffvar_col, collapse = ",")
str.diffvar <- paste("sum(", str.diffvar, ", na.rm = TRUE)")
dt_tmp <- sqldf(sprintf("SELECT *, %s FROM dt_tmp",
str.diff)
)
dt_tmp <- setDT(dt_tmp)[Startdat <= Endat,
corr_days := eval(parse (text = str.diffvar)),
by = list(Fnr, Vernr, Startdat, Endat)]
我不是 100% 确定您要做什么,但这里有一种方法可以获得与 dt_tmp
相同的最终输出。假设我正在解释您要正确执行的操作,那么诀窍就是先进行计算,然后将其融化,然后再进行投射。
Data[, nobs_id := order(Fromdat),
by = list(Fnr, Vernr, Startdat, Endat)]
Data[,Tomdat:=as.IDate(Tomdat)]
Data[,Fromdat:=as.IDate(Fromdat)]
Data[,diff:=as.integer(Tomdat-Fromdat)]
Data[,corr_days:=sum(diff),by= list(Fnr, Vernr, Startdat, Endat)]
mytemp<-dcast(melt(Data,id.vars=c('Fnr','Vernr','Startdat','Endat','nobs_id','corr_days')),Fnr+Vernr+Startdat+Endat+corr_days~variable+nobs_id,value.var='value')
#if you want them in the same order as your dt_tmp
setcolorder(mytemp, c(names(mytemp)[!names(mytemp)%in%'corr_days'],'corr_days'))
#if you want the date columns to be type character again
datecols<-names(mytemp)[grepl('dat',names(mytemp))]
mytemp[,(datecols):=lapply(.SD,as.character),.SDcols=datecols]