Group data table and apply cut function to columns by reference

Using the data provided below, I want to group the data table by Date and apply a cut function to the columns referenced in my code (colstoCut). I can do this with dplyr, but my real data is very large and it is far too slow. I have made several attempts based on approaches I found on SO and elsewhere, but so far none of them seem to work.

library(tidyverse)
library(data.table)

cutme <- structure(list(Date = structure(c(18993, 18993, 18993, 18993, 
18993, 18994, 18994, 18994, 18994, 18994, 18995, 18995, 18995, 
18995, 18995, 18996, 18996, 18996, 18996, 18996, 18997, 18997, 
18997, 18997, 18997), class = "Date"), val1 = c(2, 1, 1, 1, 2, 
0, 0, 1, 0, 0, 0, 1, 1, 2, 1, 1, 2, 1, 0, 0, 1, 1, 0, 0, 1), 
    val2 = c(306, 291, 306, 300, 306, 295, 299, 291, 302, 298, 
    301, 300, 291, 301, 297, 290, 294, 298, 293, 294, 310, 305, 
    293, 322, 299), val3 = c(278.115915402059, 275.206632766366, 
    277.843871977486, 274.375934310537, 271.976342200702, 314.694861131995, 
    322.55015422103, 312.56565930567, 321.31779178896, 310.742656596237, 
    294.839125866978, 305.946938215211, 317.090018318496, 319.386088532157, 
    312.323793703966, 309.29514039576, 313.96520162878, 317.360306029457, 
    310.212544203034, 320.263145398593, 310.432980834677, 296.638028917156, 
    294.622602772748, 305.922855022984, 308.30568677617)), row.names = c(NA, 
-25L), groups = structure(list(`Date,` = structure(c(18993, 18994, 
18995, 18996, 18997), class = "Date"), .rows = structure(list(
    1:5, 6:10, 11:15, 16:20, 21:25), ptype = integer(0), class = c("vctrs_list_of", 
"vctrs_vctr", "list"))), row.names = c(NA, -5L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("data.table", 
"data.frame"))

# cut these columns [transforms them to bins (factors)]
colstoCut <- colnames(cutme)[-1]

# approach using dplyr (works but too slow on real data)
cutme <- cutme %>%
  dplyr::group_by(Date) %>%
  dplyr::mutate_at(all_of(colstoCut), ~cut(x = ., breaks = unique(quantile(x = ., probs = seq(0,1,0.025))), include.lowest = TRUE)) %>%
  dplyr::ungroup(.)

## several attempts using data.table ##

# no error thrown but columns are not actually cut
cutme[, (colstoCut) := Map(function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]

# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
cutme[, (colstoCut) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = colstoCut]

# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
in_cols <- colstoCut
out_cols <- paste0(in_cols,"fact")
cutme[, (out_cols) := lapply(.SD, cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date, .SDcols = in_cols]

# Error in cut.default(x = ., breaks = seq(0, 1, 0.0025), include.lowest = TRUE) : 'x' must be numeric
cutme[, (colstoCut) := lapply(colstoCut, function(x) cut(x = ., breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]

# Error in .subset2(x, i, exact = exact) : recursive indexing failed at level 2
cutme[, (colstoCut) := lapply(.SD, cut(x = cutme[[colstoCut]], breaks = seq(0,1,0.0025), include.lowest = TRUE)), by = Date]

Based on akrun's comment, the following worked for me:

in_cols <- colstoCut
out_cols <- paste0(in_cols,"_fact")
cutme[, (out_cols) := lapply(.SD, function(.) cut(x = ., breaks = unique(quantile(x = ., probs = seq(0,1,0.025))), include.lowest = TRUE)), by = Date, .SDcols = (in_cols) ]
cutme <- cutme %>%
  dplyr::select(Date, all_of(out_cols))
colnames(cutme) <- gsub("_fact","",colnames(cutme))

Because I couldn't convert the numeric columns to factor columns in place the way dplyr::mutate_at(vars(colstoCut), ~cut(...)) does, I dropped the original (uncut) columns from the data table, selected the newly cut columns, and renamed them with gsub.

I'm sure there is a cleaner way to do this that doesn't require selecting/renaming columns, but it works for me. I'm happy to accept a better answer if one gets posted.
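
A minimal sketch of one way to skip the select/rename round trip, assuming you stop right after the := line above (so in_cols, out_cols and the *_fact columns already exist in cutme): delete the original columns and rename the new ones back by reference.

# delete the original (uncut) columns by reference
cutme[, (in_cols) := NULL]
# rename the cut columns back to the original names, also by reference
setnames(cutme, old = out_cols, new = in_cols)
# optional: restore the original column order
setcolorder(cutme, c("Date", in_cols))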

For the given example dataset with 1 grouping column and 3 value columns to transform, the data.table equivalent of the OP's code is simply

library(data.table)
mycut <- \(x) cut(x, unique(quantile(x, probs = seq(0, 1, 0.025))), include.lowest = TRUE)
cutme <- setDT(cutme)[, lapply(.SD, mycut), .SDcols = colstoCut, by = Date]
cutme
          Date    val1        val2            val3
        <Date>  <fctr>      <fctr>          <fctr>
 1: 2022-01-01 (1.9,2] (305.4,306] (278.09,278.12]
 2: 2022-01-01 [1,1.1] [291,291.9] (275.12,275.21]
 3: 2022-01-01 [1,1.1] (305.4,306] (277.58,277.84]
 4: 2022-01-01 [1,1.1] (299.1,300] (274.14,274.38]
 5: 2022-01-01 (1.9,2] (305.4,306] [271.98,272.22]
 6: 2022-01-02 [0,0.1] (294.6,295]   (314.5,314.7]
 7: 2022-01-02 [0,0.1] (298.9,299]   (322.4,322.6]
 8: 2022-01-02 (0.9,1] [291,291.4]   (312.4,312.6]
 9: 2022-01-02 [0,0.1] (301.7,302]   (320.7,321.3]
10: 2022-01-02 [0,0.1] (297.7,298]   [310.7,310.9]
11: 2022-01-03 [0,0.1] (300.9,301]   [294.8,295.9]
12: 2022-01-03 (0.9,1] (299.7,300]   (304.8,305.9]
13: 2022-01-03 (0.9,1] [291,291.6]   (316.6,317.1]
14: 2022-01-03 (1.9,2] (300.9,301]   (319.2,319.4]
15: 2022-01-03 (0.9,1] (296.4,297]   (311.7,312.3]
16: 2022-01-04 (0.9,1] [290,290.3]  [309.3,309.39]
17: 2022-01-04 (1.9,2] (293.9,294] (313.59,313.97]
18: 2022-01-04 (0.9,1] (297.6,298] (317.02,317.36]
19: 2022-01-04 [0,0.1] (292.7,293] (310.12,310.21]
20: 2022-01-04 [0,0.1] (293.9,294] (319.97,320.26]
21: 2022-01-05 (0.9,1] (309.5,310]   (310.2,310.4]
22: 2022-01-05 (0.9,1] (304.4,305]   (296.4,296.6]
23: 2022-01-05 [0,0.1] [293,293.6]   [294.6,294.8]
24: 2022-01-05 [0,0.1] (320.8,322]     (305,305.9]
25: 2022-01-05 (0.9,1] (298.4,299]   (308.1,308.3]
          Date    val1        val2            val3
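
Since the motivation is speed on large data, it can be worth timing both approaches on a realistic slice of the real data before settling on one. A rough sketch, assuming the microbenchmark package is installed and that cutme still holds the original (uncut) numeric values:

library(microbenchmark)
library(dplyr)
library(data.table)

mycut <- \(x) cut(x, unique(quantile(x, probs = seq(0, 1, 0.025))), include.lowest = TRUE)

microbenchmark(
  dplyr = cutme %>%
    dplyr::group_by(Date) %>%
    dplyr::mutate(dplyr::across(dplyr::all_of(colstoCut), mycut)) %>%
    dplyr::ungroup(),
  data.table = as.data.table(cutme)[, lapply(.SD, mycut), .SDcols = colstoCut, by = Date],
  times = 10
)

Note that the data.table timing above includes the as.data.table() copy; if the data is already a data.table, drop that call so the comparison is fair.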