对 data.table 进行分组,并按引用对列应用 cut 函数
Group data table and apply cut function to columns by reference
使用下面提供的数据,我想按日期对 data.table 进行分组,并对代码中由 colstoCut 引用的列应用 cut 函数。我可以使用 dplyr 实现这一点,但我的实际数据非常大,dplyr 的速度太慢。我已经根据在 SO 和其他地方找到的方法进行了几次尝试,但到目前为止这些方法似乎都不起作用。
library(tidyverse)
library(data.table)
cutme <- structure(list(Date = structure(c(18993, 18993, 18993, 18993,
18993, 18994, 18994, 18994, 18994, 18994, 18995, 18995, 18995,
18995, 18995, 18996, 18996, 18996, 18996, 18996, 18997, 18997,
18997, 18997, 18997), class = "Date"), val1 = c(2, 1, 1, 1, 2,
0, 0, 1, 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 0, 0, 1, 1, 0, 0, 1),
val2 = c(306, 291, 306, 300, 306, 295, 299, 291, 302, 298,
301, 300, 291, 301, 297, 290, 294, 298, 293, 294, 310, 305,
293, 322, 299), val3 = c(278.115915402059, 275.206632766366,
277.843871977486, 274.375934310537, 271.976342200702, 314.694861131995,
322.55015422103, 312.56565930567, 321.31779178896, 310.742656596237,
294.839125866978, 305.946938215211, 317.090018318496, 319.386088532157,
312.323793703966, 309.29514039576, 313.96520162878, 317.360306029457,
310.212544203034, 320.263145398593, 310.432980834677, 296.638028917156,
294.622602772748, 305.922855022984, 308.30568677617)), row.names = c(NA,
-25L), groups = structure(list(`Date,` = structure(c(18993, 18994,
18995, 18996, 18997), class = "Date"), .rows = structure(list(
1:5, 6:10, 11:15, 16:20, 21:25), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("data.table",
"data.frame"))
# Columns to cut (transformed into bins, i.e. factors): every column except
# the grouping column `Date`, selected by name rather than by position.
colstoCut <- setdiff(colnames(cutme), "Date")
# Approach using dplyr (works but too slow on real data).
# Within each Date group, every column in `colstoCut` is replaced by a factor
# whose break points are that group's own quantiles in 2.5% steps; `unique()`
# drops repeated quantile values so `cut()` receives distinct breaks.
# `mutate(across(...))` replaces the superseded `mutate_at()`.
cutme <- cutme %>%
  dplyr::group_by(Date) %>%
  dplyr::mutate(dplyr::across(
    dplyr::all_of(colstoCut),
    function(v) {
      cut(
        x = v,
        breaks = unique(quantile(x = v, probs = seq(0, 1, 0.025))),
        include.lowest = TRUE
      )
    }
  )) %>%
  dplyr::ungroup()
## several attempts using data.table ##
# NOTE: each comment below describes the statement that FOLLOWS it.
# All attempts use fixed breaks seq(0,1,0.0025) (range 0..1) instead of the
# per-group quantile breaks used in the dplyr version above.
#
# Attempt 1: no error thrown but columns are not actually cut.
# Map() is given only a function and no vector/list to iterate over, and the
# function body refers to `.` although its parameter is named `x`.
cutme[, (colstoCut) := Map(function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
# Attempt 2:
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
# The second argument of lapply() is a direct call to cut(), not a function,
# so cut() is evaluated with `.` as its input instead of a column of .SD.
cutme[, (colstoCut) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = colstoCut]
# Attempt 3 (same call, writing to new "<name>fact" columns instead):
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
in_cols <- colstoCut
out_cols <- paste0(in_cols,"fact")
cutme[, (out_cols) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = in_cols]
# Attempt 4:
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
# lapply() iterates over the column NAMES in colstoCut (character strings),
# and the function body still uses `.` rather than its parameter `x`.
cutme[, (colstoCut) := lapply(colstoCut, function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
# Attempt 5:
# Error in .subset2(x, i, exact = exact) : recursive indexing failed at level 2
# `cutme[[colstoCut]]` indexes with the whole vector of column names at once,
# and cut() is again called directly instead of being passed as a function.
cutme[, (colstoCut) := lapply(.SD, cut(x = cutme[[colstoCut]], breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
根据 akrun 的评论,以下对我有用:
# Bin every column named in `colstoCut` within each Date group, using that
# group's own quantiles (2.5% steps) as break points. The binned factors are
# first written by reference to new "<name>_fact" columns, because the numeric
# source columns are still needed while the groups are being processed.
in_cols <- colstoCut
out_cols <- paste0(in_cols, "_fact")
cutme[, (out_cols) := lapply(.SD, function(v) {
  cut(
    x = v,
    breaks = unique(quantile(x = v, probs = seq(0, 1, 0.025))),
    include.lowest = TRUE
  )
}), by = Date, .SDcols = in_cols]
# Keep only the grouping column and the new factor columns
# (the select() call must be closed with its own parenthesis).
cutme <- cutme %>%
  dplyr::select(Date, all_of(out_cols))
# Restore the original column names; the pattern is anchored so that only the
# trailing "_fact" suffix is removed.
colnames(cutme) <- sub("_fact$", "", colnames(cutme))
因为我无法像使用 dplyr::mutate_at(vars(colstoCut), ~cut(...)) 那样直接将数字列就地转换为因子列,所以我从 data.table 中删除了原始(未切割)列,只保留新的切割列,并使用 gsub 对其重命名。
我确信有一种更简洁的方法可以做到这一点,而无需选择/重命名列,但它对我有用。如果有人发布了更好的答案,我很乐意接受。
对于给定的示例数据集(1 个分组列和 3 个要转换的值列),与 OP 的 dplyr 代码等价的 data.table 写法只需:
library(data.table)
# Bin a numeric vector into intervals whose edges are the vector's own
# quantiles taken in 2.5% steps. Repeated quantile values are collapsed so
# that cut() receives distinct break points; the lowest value is included
# in the first interval.
mycut <- function(x) {
  probs <- seq(0, 1, 0.025)
  edges <- unique(quantile(x, probs = probs))
  cut(x, edges, include.lowest = TRUE)
}
# Convert cutme to a data.table by reference, then rebuild it as the Date
# column plus the binned (factor) version of every column named in
# colstoCut, with mycut applied separately within each Date group.
setDT(cutme)
cutme <- cutme[, lapply(.SD, mycut), by = Date, .SDcols = colstoCut]
cutme
Date val1 val2 val3
<Date> <fctr> <fctr> <fctr>
1: 2022-01-01 (1.9,2] (305.4,306] (278.09,278.12]
2: 2022-01-01 [1,1.1] [291,291.9] (275.12,275.21]
3: 2022-01-01 [1,1.1] (305.4,306] (277.58,277.84]
4: 2022-01-01 [1,1.1] (299.1,300] (274.14,274.38]
5: 2022-01-01 (1.9,2] (305.4,306] [271.98,272.22]
6: 2022-01-02 [0,0.1] (294.6,295] (314.5,314.7]
7: 2022-01-02 [0,0.1] (298.9,299] (322.4,322.6]
8: 2022-01-02 (0.9,1] [291,291.4] (312.4,312.6]
9: 2022-01-02 [0,0.1] (301.7,302] (320.7,321.3]
10: 2022-01-02 [0,0.1] (297.7,298] [310.7,310.9]
11: 2022-01-03 [0,0.1] (300.9,301] [294.8,295.9]
12: 2022-01-03 (0.9,1] (299.7,300] (304.8,305.9]
13: 2022-01-03 (0.9,1] [291,291.6] (316.6,317.1]
14: 2022-01-03 (1.9,2] (300.9,301] (319.2,319.4]
15: 2022-01-03 (0.9,1] (296.4,297] (311.7,312.3]
16: 2022-01-04 (0.9,1] [290,290.3] [309.3,309.39]
17: 2022-01-04 (1.9,2] (293.9,294] (313.59,313.97]
18: 2022-01-04 (0.9,1] (297.6,298] (317.02,317.36]
19: 2022-01-04 [0,0.1] (292.7,293] (310.12,310.21]
20: 2022-01-04 [0,0.1] (293.9,294] (319.97,320.26]
21: 2022-01-05 (0.9,1] (309.5,310] (310.2,310.4]
22: 2022-01-05 (0.9,1] (304.4,305] (296.4,296.6]
23: 2022-01-05 [0,0.1] [293,293.6] [294.6,294.8]
24: 2022-01-05 [0,0.1] (320.8,322] (305,305.9]
25: 2022-01-05 (0.9,1] (298.4,299] (308.1,308.3]
Date val1 val2 val3
使用下面提供的数据,我想按日期对 data.table 进行分组,并对代码中由 colstoCut 引用的列应用 cut 函数。我可以使用 dplyr 实现这一点,但我的实际数据非常大,dplyr 的速度太慢。我已经根据在 SO 和其他地方找到的方法进行了几次尝试,但到目前为止这些方法似乎都不起作用。
library(tidyverse)
library(data.table)
cutme <- structure(list(Date = structure(c(18993, 18993, 18993, 18993,
18993, 18994, 18994, 18994, 18994, 18994, 18995, 18995, 18995,
18995, 18995, 18996, 18996, 18996, 18996, 18996, 18997, 18997,
18997, 18997, 18997), class = "Date"), val1 = c(2, 1, 1, 1, 2,
0, 0, 1, 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 0, 0, 1, 1, 0, 0, 1),
val2 = c(306, 291, 306, 300, 306, 295, 299, 291, 302, 298,
301, 300, 291, 301, 297, 290, 294, 298, 293, 294, 310, 305,
293, 322, 299), val3 = c(278.115915402059, 275.206632766366,
277.843871977486, 274.375934310537, 271.976342200702, 314.694861131995,
322.55015422103, 312.56565930567, 321.31779178896, 310.742656596237,
294.839125866978, 305.946938215211, 317.090018318496, 319.386088532157,
312.323793703966, 309.29514039576, 313.96520162878, 317.360306029457,
310.212544203034, 320.263145398593, 310.432980834677, 296.638028917156,
294.622602772748, 305.922855022984, 308.30568677617)), row.names = c(NA,
-25L), groups = structure(list(`Date,` = structure(c(18993, 18994,
18995, 18996, 18997), class = "Date"), .rows = structure(list(
1:5, 6:10, 11:15, 16:20, 21:25), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("data.table",
"data.frame"))
# Columns to cut (transformed into bins, i.e. factors): every column except
# the grouping column `Date`, selected by name rather than by position.
colstoCut <- setdiff(colnames(cutme), "Date")
# Approach using dplyr (works but too slow on real data).
# Within each Date group, every column in `colstoCut` is replaced by a factor
# whose break points are that group's own quantiles in 2.5% steps; `unique()`
# drops repeated quantile values so `cut()` receives distinct breaks.
# `mutate(across(...))` replaces the superseded `mutate_at()`.
cutme <- cutme %>%
  dplyr::group_by(Date) %>%
  dplyr::mutate(dplyr::across(
    dplyr::all_of(colstoCut),
    function(v) {
      cut(
        x = v,
        breaks = unique(quantile(x = v, probs = seq(0, 1, 0.025))),
        include.lowest = TRUE
      )
    }
  )) %>%
  dplyr::ungroup()
## several attempts using data.table ##
# NOTE: each comment below describes the statement that FOLLOWS it.
# All attempts use fixed breaks seq(0,1,0.0025) (range 0..1) instead of the
# per-group quantile breaks used in the dplyr version above.
#
# Attempt 1: no error thrown but columns are not actually cut.
# Map() is given only a function and no vector/list to iterate over, and the
# function body refers to `.` although its parameter is named `x`.
cutme[, (colstoCut) := Map(function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
# Attempt 2:
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
# The second argument of lapply() is a direct call to cut(), not a function,
# so cut() is evaluated with `.` as its input instead of a column of .SD.
cutme[, (colstoCut) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = colstoCut]
# Attempt 3 (same call, writing to new "<name>fact" columns instead):
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
in_cols <- colstoCut
out_cols <- paste0(in_cols,"fact")
cutme[, (out_cols) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = in_cols]
# Attempt 4:
# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
# lapply() iterates over the column NAMES in colstoCut (character strings),
# and the function body still uses `.` rather than its parameter `x`.
cutme[, (colstoCut) := lapply(colstoCut, function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
# Attempt 5:
# Error in .subset2(x, i, exact = exact) : recursive indexing failed at level 2
# `cutme[[colstoCut]]` indexes with the whole vector of column names at once,
# and cut() is again called directly instead of being passed as a function.
cutme[, (colstoCut) := lapply(.SD, cut(x = cutme[[colstoCut]], breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]
根据 akrun 的评论,以下对我有用:
# Bin every column named in `colstoCut` within each Date group, using that
# group's own quantiles (2.5% steps) as break points. The binned factors are
# first written by reference to new "<name>_fact" columns, because the numeric
# source columns are still needed while the groups are being processed.
in_cols <- colstoCut
out_cols <- paste0(in_cols, "_fact")
cutme[, (out_cols) := lapply(.SD, function(v) {
  cut(
    x = v,
    breaks = unique(quantile(x = v, probs = seq(0, 1, 0.025))),
    include.lowest = TRUE
  )
}), by = Date, .SDcols = in_cols]
# Keep only the grouping column and the new factor columns
# (the select() call must be closed with its own parenthesis).
cutme <- cutme %>%
  dplyr::select(Date, all_of(out_cols))
# Restore the original column names; the pattern is anchored so that only the
# trailing "_fact" suffix is removed.
colnames(cutme) <- sub("_fact$", "", colnames(cutme))
因为我无法像使用 dplyr::mutate_at(vars(colstoCut), ~cut(...)) 那样直接将数字列就地转换为因子列,所以我从 data.table 中删除了原始(未切割)列,只保留新的切割列,并使用 gsub 对其重命名。
我确信有一种更简洁的方法可以做到这一点,而无需选择/重命名列,但它对我有用。如果有人发布了更好的答案,我很乐意接受。
对于给定的示例数据集(1 个分组列和 3 个要转换的值列),与 OP 的 dplyr 代码等价的 data.table 写法只需:
library(data.table)
# Bin a numeric vector into intervals whose edges are the vector's own
# quantiles taken in 2.5% steps. Repeated quantile values are collapsed so
# that cut() receives distinct break points; the lowest value is included
# in the first interval.
mycut <- function(x) {
  probs <- seq(0, 1, 0.025)
  edges <- unique(quantile(x, probs = probs))
  cut(x, edges, include.lowest = TRUE)
}
# Convert cutme to a data.table by reference, then rebuild it as the Date
# column plus the binned (factor) version of every column named in
# colstoCut, with mycut applied separately within each Date group.
setDT(cutme)
cutme <- cutme[, lapply(.SD, mycut), by = Date, .SDcols = colstoCut]
cutme
          Date    val1        val2            val3
        <Date>  <fctr>      <fctr>          <fctr>
 1: 2022-01-01 (1.9,2] (305.4,306] (278.09,278.12]
 2: 2022-01-01 [1,1.1] [291,291.9] (275.12,275.21]
 3: 2022-01-01 [1,1.1] (305.4,306] (277.58,277.84]
 4: 2022-01-01 [1,1.1] (299.1,300] (274.14,274.38]
 5: 2022-01-01 (1.9,2] (305.4,306] [271.98,272.22]
 6: 2022-01-02 [0,0.1] (294.6,295]   (314.5,314.7]
 7: 2022-01-02 [0,0.1] (298.9,299]   (322.4,322.6]
 8: 2022-01-02 (0.9,1] [291,291.4]   (312.4,312.6]
 9: 2022-01-02 [0,0.1] (301.7,302]   (320.7,321.3]
10: 2022-01-02 [0,0.1] (297.7,298]   [310.7,310.9]
11: 2022-01-03 [0,0.1] (300.9,301]   [294.8,295.9]
12: 2022-01-03 (0.9,1] (299.7,300]   (304.8,305.9]
13: 2022-01-03 (0.9,1] [291,291.6]   (316.6,317.1]
14: 2022-01-03 (1.9,2] (300.9,301]   (319.2,319.4]
15: 2022-01-03 (0.9,1] (296.4,297]   (311.7,312.3]
16: 2022-01-04 (0.9,1] [290,290.3]  [309.3,309.39]
17: 2022-01-04 (1.9,2] (293.9,294] (313.59,313.97]
18: 2022-01-04 (0.9,1] (297.6,298] (317.02,317.36]
19: 2022-01-04 [0,0.1] (292.7,293] (310.12,310.21]
20: 2022-01-04 [0,0.1] (293.9,294] (319.97,320.26]
21: 2022-01-05 (0.9,1] (309.5,310]   (310.2,310.4]
22: 2022-01-05 (0.9,1] (304.4,305]   (296.4,296.6]
23: 2022-01-05 [0,0.1] [293,293.6]   [294.6,294.8]
24: 2022-01-05 [0,0.1] (320.8,322]     (305,305.9]
25: 2022-01-05 (0.9,1] (298.4,299]   (308.1,308.3]
          Date    val1        val2            val3