使用 R 从另一个 data.table 条件查找向 data.table 添加新列
Add new column to data.table using conditional lookup from another data.table using R
我有 data.table dt
如下 -
# Example input table (dput output): three rows that share the same `date`
# (IDate), each with its own `exp_date` (IDate) and the time left until that
# expiry expressed in days (`days_remaining`) and in years (`year_remaining`).
dt = structure(list(date = structure(c(18904L, 18904L, 18904L), class = c("IDate",
"Date")), exp_date = structure(c(18915L, 19013L, 19377L), class = c("IDate",
"Date")), days_remaining = c(11, 109, 473), year_remaining = c(0.0301369863013699,
0.298630136986301, 1.2958904109589)), row.names = c(NA, -3L), class = c("data.table",
"data.frame"))
date exp_date days_remaining year_remaining
1: 2021-10-04 2021-10-15 11 0.03013699
2: 2021-10-04 2022-01-21 109 0.29863014
3: 2021-10-04 2023-01-20 473 1.29589041
我想向 dt 表中添加一列利率,数据取自名为 rates_dt 的利率表。
下面是利率 data.table rates_dt:
# Rates lookup table (dput output): one row per `index` date (class Date) and
# one numeric rate column per period label, `1_MO` through `3_YR`.
rates_dt = structure(list(index = structure(c(18900, 18901, 18904, 18905,
18906), tzone = "UTC", tclass = "Date", class = "Date"), `1_MO` = c(0.07,
0.08, 0.09, 0.1, 0.06), `2_MO` = c(0.05, 0.04, 0.04, 0.04, 0.04
), `3_MO` = c(0.04, 0.04, 0.04, 0.04, 0.04), `6_MO` = c(0.05,
0.05, 0.06, 0.06, 0.06), `1_YR` = c(0.09, 0.09, 0.09, 0.09, 0.1
), `2_YR` = c(0.28, 0.27, 0.27, 0.28, 0.3), `3_YR` = c(0.53,
0.49, 0.52, 0.54, 0.55)), class = c("data.table", "data.frame"
), row.names = c(NA, -5L))
index 1_MO 2_MO 3_MO 6_MO 1_YR 2_YR 3_YR
1: 2021-09-30 0.07 0.05 0.04 0.05 0.09 0.28 0.53
2: 2021-10-01 0.08 0.04 0.04 0.05 0.09 0.27 0.49
3: 2021-10-04 0.09 0.04 0.04 0.06 0.09 0.27 0.52
4: 2021-10-05 0.10 0.04 0.04 0.06 0.09 0.28 0.54
5: 2021-10-06 0.06 0.04 0.04 0.06 0.10 0.30 0.55
我想做的是,如果 days_remaining
在 dt
table 中少于 30 天,则添加一个 rates
列查找列 1_MO
在 rates_dt
table 中。同样,如果 days_remaining 为 473(超过 365 天),则从 rates_dt
table 中的列 1_YR
中获取比率。
输出应该看起来像这样 -
date exp_date days_remaining year_remaining rates
1: 2021-10-04 2021-10-15 11 0.03013699 0.09
2: 2021-10-04 2022-01-21 109 0.29863014 0.04
3: 2021-10-04 2023-01-20 473 1.29589041 0.09
在实际数据中,date
列将包含过去一年的所有日期。所以我正在寻找的解决方案应该使用 group by date
.
我试过融化rates_dt
然后做如下的事情,但无法达到想要的解决方案-
# Attempted update join (the non-working version from the question). It matches
# on the date only, so nothing ties a dt row to one specific rate column.
# NOTE(review): melted_rates_dt is presumably melt(rates_dt, id.vars = "index");
# its definition is not shown at this point.
dt[melted_rates_dt, on = c("date" = "index"), rates := value]
如果有人可以提供有关如何解决此问题的任何指示,我将不胜感激。
谢谢!
这是我的建议:
REPREX
library(data.table)
# Join: keep every row of dt and attach the rate columns of the matching
# rates_dt row (rates_dt$index == dt$date). The date column is named `index`
# in the result.
z <- rates_dt[dt, on = .(index = date)]

# Names of the rate columns brought in by the join; they are dropped by name
# below instead of being shuffled by position, so extra columns in dt or
# rates_dt cannot be silently lost or misplaced.
rate_cols <- setdiff(names(rates_dt), "index")

# Pick the rate by remaining days. fcase() takes the first condition that is
# TRUE, so each bound only needs its upper limit:
#   <= 30 -> 1_MO, <= 60 -> 2_MO, <= 365 -> 3_MO, <= 730 -> 1_YR, else 3_YR.
# A missing days_remaining matches no condition and yields NA.
z[, rates := fcase(
  days_remaining <= 30, `1_MO`,
  days_remaining <= 60, `2_MO`,
  days_remaining <= 365, `3_MO`,
  days_remaining <= 730, `1_YR`,
  days_remaining > 730, `3_YR`
)]

# Remove the now unnecessary rate columns, leaving dt's columns plus `rates`.
z[, (rate_cols) := NULL]
由 reprex package (v2.0.1)
创建于 2021-10-07
输出:
z
#> index exp_date days_remaining year_remaining rates
#> 1: 2021-10-04 2021-10-15 11 0.03013699 0.09
#> 2: 2021-10-04 2022-01-21 109 0.29863014 0.04
#> 3: 2021-10-04 2023-01-20 473 1.29589041 0.09
由 reprex package (v2.0.1)
创建于 2021-10-07
我是这样解决的-
# Reshape rates_dt to long format: one row per (index date, rate column).
# With no names given, melt() calls the label column `variable` and the
# rate column `value`.
melted_rates_dt <- melt(rates_dt, id.vars = "index")

# Label each dt row with the rate column to use. fcase() returns the value of
# the first TRUE condition; 1095 days or more matches nothing and gives NA.
dt[, duration := fcase(
  days_remaining < 30, "1_MO",
  days_remaining < 61, "2_MO",
  days_remaining < 91, "3_MO",
  days_remaining < 181, "6_MO",
  days_remaining < 365, "1_YR",
  days_remaining < 730, "2_YR",
  days_remaining < 1095, "3_YR"
)]

# Update join on (label, date): copy the matching rate into dt$rates.
dt[melted_rates_dt, on = .(duration = variable, date = index), rates := value]
我认为这个问题很适合使用 findInterval,
再与 melt 之后(长格式)的利率表做连接。
library(lubridate)
# Period labels and their lengths in months, in matching order.
periodtxt <- c("1_MO", "2_MO", "3_MO", "6_MO", "1_YR", "2_YR", "3_YR")
periodn <- c(1, 2, 3, 6, 12, 24, 36)

# Return the label of the bucket that `expiry` falls into. The bucket edges
# are `start` shifted by 0, 1, 2, 3, 6, 12, 24 and 36 calendar months, and
# findInterval() gives the index of the last edge that is <= expiry.
label_period <- function(start, expiry) {
  edges <- start %m+% months(c(0, periodn))
  periodtxt[findInterval(expiry, edges)]
}

# Apply row by row, because the edges depend on each row's own date.
dt[, period := mapply(label_period, date, exp_date)]
# date exp_date days_remaining year_remaining period
# <IDat> <IDat> <num> <num> <char>
# 1: 2021-10-04 2021-10-15 11 0.03013699 1_MO
# 2: 2021-10-04 2022-01-21 109 0.29863014 6_MO
# 3: 2021-10-04 2023-01-20 473 1.29589041 2_YR
从这里开始,
# Reshape rates_dt to long format: one row per (index date, period label),
# with the rate in column `rate`.
melted_rates_dt <- melt(
  rates_dt,
  id.vars = "index",
  variable.name = "period",
  value.name = "rate"
)

# Update join on (date, period). `i.rate` explicitly refers to the column of
# melted_rates_dt; a bare `rate` would resolve to dt's own column as soon as
# dt has one (e.g. on a second run), turning the update into a silent no-op.
dt[melted_rates_dt, on = .(date == index, period), rate := i.rate]
# date exp_date days_remaining year_remaining period rate
# <IDat> <IDat> <num> <num> <char> <num>
# 1: 2021-10-04 2021-10-15 11 0.03013699 1_MO 0.09
# 2: 2021-10-04 2022-01-21 109 0.29863014 6_MO 0.06
# 3: 2021-10-04 2023-01-20 473 1.29589041 2_YR 0.27
(我仍然不确定你是如何从 2_YR
数据中得到 0.09
的,所以我按原样提供它,直到我更好地理解你的第三行预期输出。)
我有 data.table dt
如下 -
# Example input table (dput output): three rows that share the same `date`
# (IDate), each with its own `exp_date` (IDate) and the time left until that
# expiry expressed in days (`days_remaining`) and in years (`year_remaining`).
dt = structure(list(date = structure(c(18904L, 18904L, 18904L), class = c("IDate",
"Date")), exp_date = structure(c(18915L, 19013L, 19377L), class = c("IDate",
"Date")), days_remaining = c(11, 109, 473), year_remaining = c(0.0301369863013699,
0.298630136986301, 1.2958904109589)), row.names = c(NA, -3L), class = c("data.table",
"data.frame"))
date exp_date days_remaining year_remaining
1: 2021-10-04 2021-10-15 11 0.03013699
2: 2021-10-04 2022-01-21 109 0.29863014
3: 2021-10-04 2023-01-20 473 1.29589041
我想向 dt 表中添加一列利率,数据取自名为 rates_dt 的利率表。
下面是利率 data.table rates_dt:
# Rates lookup table (dput output): one row per `index` date (class Date) and
# one numeric rate column per period label, `1_MO` through `3_YR`.
rates_dt = structure(list(index = structure(c(18900, 18901, 18904, 18905,
18906), tzone = "UTC", tclass = "Date", class = "Date"), `1_MO` = c(0.07,
0.08, 0.09, 0.1, 0.06), `2_MO` = c(0.05, 0.04, 0.04, 0.04, 0.04
), `3_MO` = c(0.04, 0.04, 0.04, 0.04, 0.04), `6_MO` = c(0.05,
0.05, 0.06, 0.06, 0.06), `1_YR` = c(0.09, 0.09, 0.09, 0.09, 0.1
), `2_YR` = c(0.28, 0.27, 0.27, 0.28, 0.3), `3_YR` = c(0.53,
0.49, 0.52, 0.54, 0.55)), class = c("data.table", "data.frame"
), row.names = c(NA, -5L))
index 1_MO 2_MO 3_MO 6_MO 1_YR 2_YR 3_YR
1: 2021-09-30 0.07 0.05 0.04 0.05 0.09 0.28 0.53
2: 2021-10-01 0.08 0.04 0.04 0.05 0.09 0.27 0.49
3: 2021-10-04 0.09 0.04 0.04 0.06 0.09 0.27 0.52
4: 2021-10-05 0.10 0.04 0.04 0.06 0.09 0.28 0.54
5: 2021-10-06 0.06 0.04 0.04 0.06 0.10 0.30 0.55
我想做的是,如果 days_remaining
在 dt
table 中少于 30 天,则添加一个 rates
列查找列 1_MO
在 rates_dt
table 中。同样,如果 days_remaining 为 473(超过 365 天),则从 rates_dt
table 中的列 1_YR
中获取比率。
输出应该看起来像这样 -
date exp_date days_remaining year_remaining rates
1: 2021-10-04 2021-10-15 11 0.03013699 0.09
2: 2021-10-04 2022-01-21 109 0.29863014 0.04
3: 2021-10-04 2023-01-20 473 1.29589041 0.09
在实际数据中,date
列将包含过去一年的所有日期。所以我正在寻找的解决方案应该使用 group by date
.
我试过融化rates_dt
然后做如下的事情,但无法达到想要的解决方案-
# Attempted update join (the non-working version from the question). It matches
# on the date only, so nothing ties a dt row to one specific rate column.
# NOTE(review): melted_rates_dt is presumably melt(rates_dt, id.vars = "index");
# its definition is not shown at this point.
dt[melted_rates_dt, on = c("date" = "index"), rates := value]
如果有人可以提供有关如何解决此问题的任何指示,我将不胜感激。
谢谢!
这是我的建议:
REPREX
library(data.table)
# Join: keep every row of dt and attach the rate columns of the matching
# rates_dt row (rates_dt$index == dt$date). The date column is named `index`
# in the result.
z <- rates_dt[dt, on = .(index = date)]

# Names of the rate columns brought in by the join; they are dropped by name
# below instead of being shuffled by position, so extra columns in dt or
# rates_dt cannot be silently lost or misplaced.
rate_cols <- setdiff(names(rates_dt), "index")

# Pick the rate by remaining days. fcase() takes the first condition that is
# TRUE, so each bound only needs its upper limit:
#   <= 30 -> 1_MO, <= 60 -> 2_MO, <= 365 -> 3_MO, <= 730 -> 1_YR, else 3_YR.
# A missing days_remaining matches no condition and yields NA.
z[, rates := fcase(
  days_remaining <= 30, `1_MO`,
  days_remaining <= 60, `2_MO`,
  days_remaining <= 365, `3_MO`,
  days_remaining <= 730, `1_YR`,
  days_remaining > 730, `3_YR`
)]

# Remove the now unnecessary rate columns, leaving dt's columns plus `rates`.
z[, (rate_cols) := NULL]
由 reprex package (v2.0.1)
创建于 2021-10-07
输出:
z
#> index exp_date days_remaining year_remaining rates
#> 1: 2021-10-04 2021-10-15 11 0.03013699 0.09
#> 2: 2021-10-04 2022-01-21 109 0.29863014 0.04
#> 3: 2021-10-04 2023-01-20 473 1.29589041 0.09
由 reprex package (v2.0.1)
创建于 2021-10-07
我是这样解决的-
# Reshape rates_dt to long format: one row per (index date, rate column).
# With no names given, melt() calls the label column `variable` and the
# rate column `value`.
melted_rates_dt <- melt(rates_dt, id.vars = "index")

# Label each dt row with the rate column to use. fcase() returns the value of
# the first TRUE condition; 1095 days or more matches nothing and gives NA.
dt[, duration := fcase(
  days_remaining < 30, "1_MO",
  days_remaining < 61, "2_MO",
  days_remaining < 91, "3_MO",
  days_remaining < 181, "6_MO",
  days_remaining < 365, "1_YR",
  days_remaining < 730, "2_YR",
  days_remaining < 1095, "3_YR"
)]

# Update join on (label, date): copy the matching rate into dt$rates.
dt[melted_rates_dt, on = .(duration = variable, date = index), rates := value]
我认为这个问题很适合使用 findInterval,
再与 melt 之后(长格式)的利率表做连接。
library(lubridate)
# Period labels and their lengths in months, in matching order.
periodtxt <- c("1_MO", "2_MO", "3_MO", "6_MO", "1_YR", "2_YR", "3_YR")
periodn <- c(1, 2, 3, 6, 12, 24, 36)

# Return the label of the bucket that `expiry` falls into. The bucket edges
# are `start` shifted by 0, 1, 2, 3, 6, 12, 24 and 36 calendar months, and
# findInterval() gives the index of the last edge that is <= expiry.
label_period <- function(start, expiry) {
  edges <- start %m+% months(c(0, periodn))
  periodtxt[findInterval(expiry, edges)]
}

# Apply row by row, because the edges depend on each row's own date.
dt[, period := mapply(label_period, date, exp_date)]
# date exp_date days_remaining year_remaining period
# <IDat> <IDat> <num> <num> <char>
# 1: 2021-10-04 2021-10-15 11 0.03013699 1_MO
# 2: 2021-10-04 2022-01-21 109 0.29863014 6_MO
# 3: 2021-10-04 2023-01-20 473 1.29589041 2_YR
从这里开始,
# Reshape rates_dt to long format: one row per (index date, period label),
# with the rate in column `rate`.
melted_rates_dt <- melt(
  rates_dt,
  id.vars = "index",
  variable.name = "period",
  value.name = "rate"
)

# Update join on (date, period). `i.rate` explicitly refers to the column of
# melted_rates_dt; a bare `rate` would resolve to dt's own column as soon as
# dt has one (e.g. on a second run), turning the update into a silent no-op.
dt[melted_rates_dt, on = .(date == index, period), rate := i.rate]
# date exp_date days_remaining year_remaining period rate
# <IDat> <IDat> <num> <num> <char> <num>
# 1: 2021-10-04 2021-10-15 11 0.03013699 1_MO 0.09
# 2: 2021-10-04 2022-01-21 109 0.29863014 6_MO 0.06
# 3: 2021-10-04 2023-01-20 473 1.29589041 2_YR 0.27
(我仍然不确定你是如何从 2_YR
数据中得到 0.09
的,所以我按原样提供它,直到我更好地理解你的第三行预期输出。)