此数据表重组的解决方案?

Solution for this datatable reorganization?

我有以下 data.table:

Date Segment Variable value
31-12-2021 XXX aa_a_1 10
31-12-2021 XXX bbb_1 11
31-12-2021 XXX ccc_1 12
31-01-2022 XXX aa_a_2 13
31-01-2022 XXX bbb_2 14
31-01-2022 XXX ccc_2 15
28-02-2022 XXX aa_a_3 16
28-02-2022 XXX bbb_3 17
28-02-2022 XXX ccc_3 18
31-03-2022 XXX aa_a_4 19
31-03-2022 XXX bbb_4 20
31-03-2022 XXX ccc_4 21
30-04-2022 XXX aa_a_5 22
30-04-2022 XXX bbb_5 23
30-04-2022 XXX ccc_5 24
31-05-2022 XXX aa_a_6 25
31-05-2022 XXX bbb_6 26
31-05-2022 XXX ccc_6 27
30-06-2022 XXX aa_a_7 28
30-06-2022 XXX bbb_7 29
30-06-2022 XXX ccc_7 30
31-07-2022 XXX aa_a_8 31
31-07-2022 XXX bbb_8 32
31-07-2022 XXX ccc_8 33
31-08-2022 XXX aa_a_9 34
31-08-2022 XXX bbb_9 35
31-08-2022 XXX ccc_9 36
30-09-2022 XXX aa_a_10 37
30-09-2022 XXX bbb_10 38
30-09-2022 XXX ccc_10 39
31-10-2022 XXX aa_a_11 40
31-10-2022 XXX bbb_11 41
31-10-2022 XXX ccc_11 42
30-11-2022 XXX aa_a_12 43
30-11-2022 XXX bbb_12 44
30-11-2022 XXX ccc_12 45
31-12-2021 YYY aa_a_1 46
31-12-2021 YYY bbb_1 47
31-12-2021 YYY ccc_1 48
31-01-2022 YYY aa_a_2 49
31-01-2022 YYY bbb_2 50
31-01-2022 YYY ccc_2 51
28-02-2022 YYY aa_a_3 52
28-02-2022 YYY bbb_3 53
28-02-2022 YYY ccc_3 54
31-03-2022 YYY aa_a_4 55
31-03-2022 YYY bbb_4 56
31-03-2022 YYY ccc_4 57
30-04-2022 YYY aa_a_5 58
30-04-2022 YYY bbb_5 59
30-04-2022 YYY ccc_5 60
31-05-2022 YYY aa_a_6 61
31-05-2022 YYY bbb_6 62
31-05-2022 YYY ccc_6 63
30-06-2022 YYY aa_a_7 64
30-06-2022 YYY bbb_7 65
30-06-2022 YYY ccc_7 66
31-07-2022 YYY aa_a_8 67
31-07-2022 YYY bbb_8 68
31-07-2022 YYY ccc_8 69
31-08-2022 YYY aa_a_9 70
31-08-2022 YYY bbb_9 71
31-08-2022 YYY ccc_9 72
30-09-2022 YYY aa_a_10 73
30-09-2022 YYY bbb_10 74
30-09-2022 YYY ccc_10 75
31-10-2022 YYY aa_a_11 76
31-10-2022 YYY bbb_11 77
31-10-2022 YYY ccc_11 78
30-11-2022 YYY aa_a_12 79
30-11-2022 YYY bbb_12 80
30-11-2022 YYY ccc_12 81

以此类推,总共有 24 个不同的日期,'aa_a'、'bbb' 和 'ccc' 的后缀编号从 1 到 24。

所需的输出将是:

Date Segment aa_a bbb ccc
31-12-2021 XXX 10 11 12
31-01-2022 XXX 13 14 15
31-12-2021 YYY 46 47 48
31-01-2022 YYY 49 50 51

等等...

有什么想法吗?我现在卡住了。

我们可以用 separate 将 'Variable' 列拆分为两列,然后使用 pivot_wider 重塑为“宽”(wide)格式

library(dplyr)
library(tidyr)
# Reshape long -> wide, one column per variable prefix.
# Split `Variable` only at the underscore that precedes the trailing numeric
# index, so prefixes that contain "_" themselves (e.g. "aa_a_1") stay intact;
# the default `sep` would split at every non-alphanumeric character.
df1 %>%
  separate(Variable, into = c("Var1", "Var2"), sep = "_(?=[0-9]+$)") %>%
  pivot_wider(names_from = Var1, values_from = value) %>%
  select(-Var2)

-输出

# A tibble: 4 × 5
  Date       Segment   aaa   bbb   ccc
  <chr>      <chr>   <int> <int> <int>
1 31-12-2021 XXX        10    11    12
2 31-01-2022 XXX        13    14    15
3 31-12-2021 YYY        16    17    18
4 31-01-2022 YYY        19    20    21

或使用dcast

library(data.table)
# Strip only the trailing "_<digits>" index so that prefixes containing "_"
# (e.g. "aa_a_1" -> "aa_a") survive; `value.var` is named explicitly so dcast
# does not have to guess the value column.
dcast(setDT(df1), Date + Segment ~ sub("_[0-9]+$", "", Variable),
      value.var = "value")
         Date Segment aaa bbb ccc
1: 31-01-2022     XXX  13  14  15
2: 31-01-2022     YYY  19  20  21
3: 31-12-2021     XXX  10  11  12
4: 31-12-2021     YYY  16  17  18

数据

# Small sample: 2 segments x 2 dates x 3 variables = 12 rows, values 10..21.
df1 <- data.frame(
  Date = rep(rep(c("31-12-2021", "31-01-2022"), each = 3L), times = 2L),
  Segment = rep(c("XXX", "YYY"), each = 6L),
  Variable = rep(
    paste0(c("aaa", "bbb", "ccc"), "_", rep(1:2, each = 3L)),
    times = 2L
  ),
  value = 10:21,
  stringsAsFactors = FALSE
)

这只是数据重塑(reshape)。既然你提到了 data.table:

library(data.table)
# Reshape long -> wide, one column per variable prefix.
# The trailing "_<digits>" index is stripped inside the formula rather than
# with `:=` in the `data` argument, so the call does not modify DT by reference.
dcast(DT, Date + Segment ~ gsub("_[0-9]+$", "", Variable),
      value.var = "value")
#           Date Segment aa_a bbb ccc
#  1: 28-02-2022     XXX   16  17  18
#  2: 28-02-2022     YYY   52  53  54
#  3: 30-04-2022     XXX   22  23  24
#  4: 30-04-2022     YYY   58  59  60
#  5: 30-06-2022     XXX   28  29  30
#  6: 30-06-2022     YYY   64  65  66
#  7: 30-09-2022     XXX   37  38  39
#  8: 30-09-2022     YYY   73  74  75
#  9: 30-11-2022     XXX   43  44  45
# 10: 30-11-2022     YYY   79  80  81
# 11: 31-01-2022     XXX   13  14  15
# 12: 31-01-2022     YYY   49  50  51
# 13: 31-03-2022     XXX   19  20  21
# 14: 31-03-2022     YYY   55  56  57
# 15: 31-05-2022     XXX   25  26  27
# 16: 31-05-2022     YYY   61  62  63
# 17: 31-07-2022     XXX   31  32  33
# 18: 31-07-2022     YYY   67  68  69
# 19: 31-08-2022     XXX   34  35  36
# 20: 31-08-2022     YYY   70  71  72
# 21: 31-10-2022     XXX   40  41  42
# 22: 31-10-2022     YYY   76  77  78
# 23: 31-12-2021     XXX   10  11  12
# 24: 31-12-2021     YYY   46  47  48
#           Date Segment aa_a bbb ccc

仅供参考,下面解释一下这个正则表达式(另可参阅一篇关于正则表达式的详尽回答):

  • _ 是文字下划线字符。
  • [0-9] 是一个“字符类”(character class),匹配单个介于“0”和“9”之间的字符,而不是介于 0 和 9 之间的数字。例如,19 是两个字符,每个字符都介于 0 和 9 之间。
  • +表示“一个或多个”.
  • $ 表示 “字符串结尾”.

所以 "aaa_24" 和 "aa_a_9999999" 会被更新;"aaa_"、"aaa_z" 和 "aaa_24b" 则不会。


数据

# Sample data matching the question's input: 12 month-end dates x 3 variables
# for each of two segments (72 rows, values 10..81). `Variable` keeps its
# "_<index>" suffix (aa_a_1 ... ccc_12) so the suffix-stripping step in the
# reshaping code actually has something to remove.
DT <- setDT(data.frame(
  Date = rep(
    rep(
      c(
        "31-12-2021", "31-01-2022", "28-02-2022", "31-03-2022",
        "30-04-2022", "31-05-2022", "30-06-2022", "31-07-2022",
        "31-08-2022", "30-09-2022", "31-10-2022", "30-11-2022"
      ),
      each = 3L
    ),
    times = 2L
  ),
  Segment = rep(c("XXX", "YYY"), each = 36L),
  Variable = rep(
    paste0(c("aa_a", "bbb", "ccc"), "_", rep(1:12, each = 3L)),
    times = 2L
  ),
  value = 10:81,
  stringsAsFactors = FALSE
))