此数据表重组的解决方案?
Solution for this datatable reorganization?
我有以下 data.table:
Date
Segment
Variable
value
31-12-2021
XXX
aa_a_1
10
31-12-2021
XXX
bbb_1
11
31-12-2021
XXX
ccc_1
12
31-01-2022
XXX
aa_a_2
13
31-01-2022
XXX
bbb_2
14
31-01-2022
XXX
ccc_2
15
28-02-2022
XXX
aa_a_3
16
28-02-2022
XXX
bbb_3
17
28-02-2022
XXX
ccc_3
18
31-03-2022
XXX
aa_a_4
19
31-03-2022
XXX
bbb_4
20
31-03-2022
XXX
ccc_4
21
30-04-2022
XXX
aa_a_5
22
30-04-2022
XXX
bbb_5
23
30-04-2022
XXX
ccc_5
24
31-05-2022
XXX
aa_a_6
25
31-05-2022
XXX
bbb_6
26
31-05-2022
XXX
ccc_6
27
30-06-2022
XXX
aa_a_7
28
30-06-2022
XXX
bbb_7
29
30-06-2022
XXX
ccc_7
30
31-07-2022
XXX
aa_a_8
31
31-07-2022
XXX
bbb_8
32
31-07-2022
XXX
ccc_8
33
31-08-2022
XXX
aa_a_9
34
31-08-2022
XXX
bbb_9
35
31-08-2022
XXX
ccc_9
36
30-09-2022
XXX
aa_a_10
37
30-09-2022
XXX
bbb_10
38
30-09-2022
XXX
ccc_10
39
31-10-2022
XXX
aa_a_11
40
31-10-2022
XXX
bbb_11
41
31-10-2022
XXX
ccc_11
42
30-11-2022
XXX
aa_a_12
43
30-11-2022
XXX
bbb_12
44
30-11-2022
XXX
ccc_12
45
31-12-2021
YYY
aa_a_1
46
31-12-2021
YYY
bbb_1
47
31-12-2021
YYY
ccc_1
48
31-01-2022
YYY
aa_a_2
49
31-01-2022
YYY
bbb_2
50
31-01-2022
YYY
ccc_2
51
28-02-2022
YYY
aa_a_3
52
28-02-2022
YYY
bbb_3
53
28-02-2022
YYY
ccc_3
54
31-03-2022
YYY
aa_a_4
55
31-03-2022
YYY
bbb_4
56
31-03-2022
YYY
ccc_4
57
30-04-2022
YYY
aa_a_5
58
30-04-2022
YYY
bbb_5
59
30-04-2022
YYY
ccc_5
60
31-05-2022
YYY
aa_a_6
61
31-05-2022
YYY
bbb_6
62
31-05-2022
YYY
ccc_6
63
30-06-2022
YYY
aa_a_7
64
30-06-2022
YYY
bbb_7
65
30-06-2022
YYY
ccc_7
66
31-07-2022
YYY
aa_a_8
67
31-07-2022
YYY
bbb_8
68
31-07-2022
YYY
ccc_8
69
31-08-2022
YYY
aa_a_9
70
31-08-2022
YYY
bbb_9
71
31-08-2022
YYY
ccc_9
72
30-09-2022
YYY
aa_a_10
73
30-09-2022
YYY
bbb_10
74
30-09-2022
YYY
ccc_10
75
31-10-2022
YYY
aa_a_11
76
31-10-2022
YYY
bbb_11
77
31-10-2022
YYY
ccc_11
78
30-11-2022
YYY
aa_a_12
79
30-11-2022
YYY
bbb_12
80
30-11-2022
YYY
ccc_12
81
以此类推,总共有 24 个不同的日期和 'aaa'、'bbb' 和 'ccc' 从 1 到 24。
所需的输出将是:
Date
Segment
aa_a
bbb
ccc
31-12-2021
XXX
10
11
12
31-01-2022
XXX
13
14
15
31-12-2021
YYY
16
17
18
31-01-2022
YYY
19
20
21
等等...
有什么想法吗?我现在卡住了。
我们可以使用 separate
将 'Variable' 列拆分为两列,然后使用 pivot_wider
重塑为“宽”格式
library(dplyr)
library(tidyr)
# Reshape `df1` from long to wide: one row per Date/Segment, one column per
# variable base name.
#
# `Variable` is split into its base name (`Var1`) and its trailing numeric
# index (`Var2`). The explicit `sep` matches only the underscore that is
# followed by digits up to the end of the string, so base names that contain
# underscores themselves (e.g. "aa_a_1" -> "aa_a" / "1") stay intact. The
# default `sep` splits at every non-alphanumeric character, which would cut
# "aa_a_1" into three pieces and produce a column named "aa".
df1 %>%
  separate(Variable, into = c("Var1", "Var2"), sep = "_(?=[0-9]+$)") %>%
  pivot_wider(names_from = Var1, values_from = value) %>%
  # The index is constant within a Date/Segment pair, so it is only needed as
  # an id column during the pivot and is dropped afterwards.
  select(-Var2)
-输出
# A tibble: 4 × 5
Date Segment aaa bbb ccc
<chr> <chr> <int> <int> <int>
1 31-12-2021 XXX 10 11 12
2 31-01-2022 XXX 13 14 15
3 31-12-2021 YYY 16 17 18
4 31-01-2022 YYY 19 20 21
或使用dcast
library(data.table)
# Cast `df1` to wide format with one column per variable base name.
# Only the trailing "_<digits>" index is removed from `Variable`, so base
# names that contain underscores (e.g. "aa_a_1" -> "aa_a") are preserved;
# trimming from the first underscore would collapse "aa_a_1" to "aa".
# `value.var` is given explicitly instead of letting dcast guess it.
# Note: setDT() converts `df1` to a data.table by reference.
dcast(
  setDT(df1),
  Date + Segment ~ sub("_[0-9]+$", "", Variable),
  value.var = "value"
)
Date Segment aaa bbb ccc
1: 31-01-2022 XXX 13 14 15
2: 31-01-2022 YYY 19 20 21
3: 31-12-2021 XXX 10 11 12
4: 31-12-2021 YYY 16 17 18
数据
# Sample input in long format: two segments ("XXX", "YYY"), two month-end
# dates per segment, and three variables per date whose names carry the
# date's running index as a "_<n>" suffix. `value` runs from 10 to 21.
df1 <- data.frame(
  Date = rep(rep(c("31-12-2021", "31-01-2022"), each = 3L), times = 2L),
  Segment = rep(c("XXX", "YYY"), each = 6L),
  Variable = paste0(
    rep(c("aaa", "bbb", "ccc"), times = 4L),
    "_",
    rep(rep(1:2, each = 3L), times = 2L)
  ),
  value = 10:21,
  stringsAsFactors = FALSE
)
这只是数据重塑(reshaping)。既然你提到了 data.table:
library(data.table)
# Cast `DT` to wide format: one row per Date/Segment, one column per variable
# base name, filled from `value`.
# The trailing "_<digits>" index is stripped inside the formula, so `DT`
# itself is left unchanged; updating `Variable` with `:=` inside the call
# would overwrite the column in `DT` by reference. The pattern is anchored
# with `$`, so it can match at most once and sub() is sufficient.
dcast(
  DT,
  Date + Segment ~ sub("_[0-9]+$", "", Variable),
  value.var = "value"
)
# Date Segment aa_a bbb ccc
# 1: 28-02-2022 XXX 16 17 18
# 2: 28-02-2022 YYY 52 53 54
# 3: 30-04-2022 XXX 22 23 24
# 4: 30-04-2022 YYY 58 59 60
# 5: 30-06-2022 XXX 28 29 30
# 6: 30-06-2022 YYY 64 65 66
# 7: 30-09-2022 XXX 37 38 39
# 8: 30-09-2022 YYY 73 74 75
# 9: 30-11-2022 XXX 43 44 45
# 10: 30-11-2022 YYY 79 80 81
# 11: 31-01-2022 XXX 13 14 15
# 12: 31-01-2022 YYY 49 50 51
# 13: 31-03-2022 XXX 19 20 21
# 14: 31-03-2022 YYY 55 56 57
# 15: 31-05-2022 XXX 25 26 27
# 16: 31-05-2022 YYY 61 62 63
# 17: 31-07-2022 XXX 31 32 33
# 18: 31-07-2022 YYY 67 68 69
# 19: 31-08-2022 XXX 34 35 36
# 20: 31-08-2022 YYY 70 71 72
# 21: 31-10-2022 XXX 40 41 42
# 22: 31-10-2022 YYY 76 77 78
# 23: 31-12-2021 XXX 10 11 12
# 24: 31-12-2021 YYY 46 47 48
# Date Segment aa_a bbb ccc
仅供参考,下面解释这个正则表达式(另可参阅关于正则表达式的详尽回答):
_
是文字下划线字符。
[0-9]
表示(单个)“字符类”,匹配 “0” 到 “9” 之间的任意一个字符,而不是 0 到 9 之间的数值。例如,19
是两个字符,每个字符都介于 “0” 和 “9” 之间。
+
表示“一个或多个”.
$
表示 “字符串结尾”.
所以"aaa_24"
和"aa_a_9999999"
会被更新; "aaa_"
和 "aaa_z"
和 "aaa_24b"
不会。
数据
# Sample input matching the table in the question: 12 month-end dates for
# each of the segments "XXX" and "YYY", three variables per date, and values
# 10..81 in row order.
# Each variable name carries the running index of its date as a "_<n>"
# suffix ("aa_a_1", "bbb_1", ..., "ccc_12"). A dump with the suffixes already
# removed would not reproduce the question's input and would make the
# suffix-stripping step a no-op.
DT <- local({
  month_ends <- c(
    "31-12-2021", "31-01-2022", "28-02-2022", "31-03-2022",
    "30-04-2022", "31-05-2022", "30-06-2022", "31-07-2022",
    "31-08-2022", "30-09-2022", "31-10-2022", "30-11-2022"
  )
  base_names <- c("aa_a", "bbb", "ccc")
  segments <- c("XXX", "YYY")
  n_dates <- length(month_ends)
  n_vars <- length(base_names)
  n_segments <- length(segments)
  setDT(data.frame(
    Date = rep(rep(month_ends, each = n_vars), times = n_segments),
    Segment = rep(segments, each = n_dates * n_vars),
    Variable = paste0(
      rep(base_names, times = n_dates * n_segments),
      "_",
      rep(rep(seq_len(n_dates), each = n_vars), times = n_segments)
    ),
    value = 10:81,
    stringsAsFactors = FALSE
  ))
})
我有以下 data.table:
Date | Segment | Variable | value |
---|---|---|---|
31-12-2021 | XXX | aa_a_1 | 10 |
31-12-2021 | XXX | bbb_1 | 11 |
31-12-2021 | XXX | ccc_1 | 12 |
31-01-2022 | XXX | aa_a_2 | 13 |
31-01-2022 | XXX | bbb_2 | 14 |
31-01-2022 | XXX | ccc_2 | 15 |
28-02-2022 | XXX | aa_a_3 | 16 |
28-02-2022 | XXX | bbb_3 | 17 |
28-02-2022 | XXX | ccc_3 | 18 |
31-03-2022 | XXX | aa_a_4 | 19 |
31-03-2022 | XXX | bbb_4 | 20 |
31-03-2022 | XXX | ccc_4 | 21 |
30-04-2022 | XXX | aa_a_5 | 22 |
30-04-2022 | XXX | bbb_5 | 23 |
30-04-2022 | XXX | ccc_5 | 24 |
31-05-2022 | XXX | aa_a_6 | 25 |
31-05-2022 | XXX | bbb_6 | 26 |
31-05-2022 | XXX | ccc_6 | 27 |
30-06-2022 | XXX | aa_a_7 | 28 |
30-06-2022 | XXX | bbb_7 | 29 |
30-06-2022 | XXX | ccc_7 | 30 |
31-07-2022 | XXX | aa_a_8 | 31 |
31-07-2022 | XXX | bbb_8 | 32 |
31-07-2022 | XXX | ccc_8 | 33 |
31-08-2022 | XXX | aa_a_9 | 34 |
31-08-2022 | XXX | bbb_9 | 35 |
31-08-2022 | XXX | ccc_9 | 36 |
30-09-2022 | XXX | aa_a_10 | 37 |
30-09-2022 | XXX | bbb_10 | 38 |
30-09-2022 | XXX | ccc_10 | 39 |
31-10-2022 | XXX | aa_a_11 | 40 |
31-10-2022 | XXX | bbb_11 | 41 |
31-10-2022 | XXX | ccc_11 | 42 |
30-11-2022 | XXX | aa_a_12 | 43 |
30-11-2022 | XXX | bbb_12 | 44 |
30-11-2022 | XXX | ccc_12 | 45 |
31-12-2021 | YYY | aa_a_1 | 46 |
31-12-2021 | YYY | bbb_1 | 47 |
31-12-2021 | YYY | ccc_1 | 48 |
31-01-2022 | YYY | aa_a_2 | 49 |
31-01-2022 | YYY | bbb_2 | 50 |
31-01-2022 | YYY | ccc_2 | 51 |
28-02-2022 | YYY | aa_a_3 | 52 |
28-02-2022 | YYY | bbb_3 | 53 |
28-02-2022 | YYY | ccc_3 | 54 |
31-03-2022 | YYY | aa_a_4 | 55 |
31-03-2022 | YYY | bbb_4 | 56 |
31-03-2022 | YYY | ccc_4 | 57 |
30-04-2022 | YYY | aa_a_5 | 58 |
30-04-2022 | YYY | bbb_5 | 59 |
30-04-2022 | YYY | ccc_5 | 60 |
31-05-2022 | YYY | aa_a_6 | 61 |
31-05-2022 | YYY | bbb_6 | 62 |
31-05-2022 | YYY | ccc_6 | 63 |
30-06-2022 | YYY | aa_a_7 | 64 |
30-06-2022 | YYY | bbb_7 | 65 |
30-06-2022 | YYY | ccc_7 | 66 |
31-07-2022 | YYY | aa_a_8 | 67 |
31-07-2022 | YYY | bbb_8 | 68 |
31-07-2022 | YYY | ccc_8 | 69 |
31-08-2022 | YYY | aa_a_9 | 70 |
31-08-2022 | YYY | bbb_9 | 71 |
31-08-2022 | YYY | ccc_9 | 72 |
30-09-2022 | YYY | aa_a_10 | 73 |
30-09-2022 | YYY | bbb_10 | 74 |
30-09-2022 | YYY | ccc_10 | 75 |
31-10-2022 | YYY | aa_a_11 | 76 |
31-10-2022 | YYY | bbb_11 | 77 |
31-10-2022 | YYY | ccc_11 | 78 |
30-11-2022 | YYY | aa_a_12 | 79 |
30-11-2022 | YYY | bbb_12 | 80 |
30-11-2022 | YYY | ccc_12 | 81 |
以此类推,总共有 24 个不同的日期和 'aaa'、'bbb' 和 'ccc' 从 1 到 24。
所需的输出将是:
Date | Segment | aa_a | bbb | ccc |
---|---|---|---|---|
31-12-2021 | XXX | 10 | 11 | 12 |
31-01-2022 | XXX | 13 | 14 | 15 |
31-12-2021 | YYY | 16 | 17 | 18 |
31-01-2022 | YYY | 19 | 20 | 21 |
等等...
有什么想法吗?我现在卡住了。
我们可以使用 separate
将 'Variable' 列拆分为两列,然后使用 pivot_wider
重塑为“宽”格式
library(dplyr)
library(tidyr)
# Reshape `df1` from long to wide: one row per Date/Segment, one column per
# variable base name.
#
# `Variable` is split into its base name (`Var1`) and its trailing numeric
# index (`Var2`). The explicit `sep` matches only the underscore that is
# followed by digits up to the end of the string, so base names that contain
# underscores themselves (e.g. "aa_a_1" -> "aa_a" / "1") stay intact. The
# default `sep` splits at every non-alphanumeric character, which would cut
# "aa_a_1" into three pieces and produce a column named "aa".
df1 %>%
  separate(Variable, into = c("Var1", "Var2"), sep = "_(?=[0-9]+$)") %>%
  pivot_wider(names_from = Var1, values_from = value) %>%
  # The index is constant within a Date/Segment pair, so it is only needed as
  # an id column during the pivot and is dropped afterwards.
  select(-Var2)
-输出
# A tibble: 4 × 5
Date Segment aaa bbb ccc
<chr> <chr> <int> <int> <int>
1 31-12-2021 XXX 10 11 12
2 31-01-2022 XXX 13 14 15
3 31-12-2021 YYY 16 17 18
4 31-01-2022 YYY 19 20 21
或使用dcast
library(data.table)
# Cast `df1` to wide format with one column per variable base name.
# Only the trailing "_<digits>" index is removed from `Variable`, so base
# names that contain underscores (e.g. "aa_a_1" -> "aa_a") are preserved;
# trimming from the first underscore would collapse "aa_a_1" to "aa".
# `value.var` is given explicitly instead of letting dcast guess it.
# Note: setDT() converts `df1` to a data.table by reference.
dcast(
  setDT(df1),
  Date + Segment ~ sub("_[0-9]+$", "", Variable),
  value.var = "value"
)
Date Segment aaa bbb ccc
1: 31-01-2022 XXX 13 14 15
2: 31-01-2022 YYY 19 20 21
3: 31-12-2021 XXX 10 11 12
4: 31-12-2021 YYY 16 17 18
数据
# Sample input in long format: two segments ("XXX", "YYY"), two month-end
# dates per segment, and three variables per date whose names carry the
# date's running index as a "_<n>" suffix. `value` runs from 10 to 21.
df1 <- data.frame(
  Date = rep(rep(c("31-12-2021", "31-01-2022"), each = 3L), times = 2L),
  Segment = rep(c("XXX", "YYY"), each = 6L),
  Variable = paste0(
    rep(c("aaa", "bbb", "ccc"), times = 4L),
    "_",
    rep(rep(1:2, each = 3L), times = 2L)
  ),
  value = 10:21,
  stringsAsFactors = FALSE
)
这只是数据重塑(reshaping)。既然你提到了 data.table:
library(data.table)
# Cast `DT` to wide format: one row per Date/Segment, one column per variable
# base name, filled from `value`.
# The trailing "_<digits>" index is stripped inside the formula, so `DT`
# itself is left unchanged; updating `Variable` with `:=` inside the call
# would overwrite the column in `DT` by reference. The pattern is anchored
# with `$`, so it can match at most once and sub() is sufficient.
dcast(
  DT,
  Date + Segment ~ sub("_[0-9]+$", "", Variable),
  value.var = "value"
)
# Date Segment aa_a bbb ccc
# 1: 28-02-2022 XXX 16 17 18
# 2: 28-02-2022 YYY 52 53 54
# 3: 30-04-2022 XXX 22 23 24
# 4: 30-04-2022 YYY 58 59 60
# 5: 30-06-2022 XXX 28 29 30
# 6: 30-06-2022 YYY 64 65 66
# 7: 30-09-2022 XXX 37 38 39
# 8: 30-09-2022 YYY 73 74 75
# 9: 30-11-2022 XXX 43 44 45
# 10: 30-11-2022 YYY 79 80 81
# 11: 31-01-2022 XXX 13 14 15
# 12: 31-01-2022 YYY 49 50 51
# 13: 31-03-2022 XXX 19 20 21
# 14: 31-03-2022 YYY 55 56 57
# 15: 31-05-2022 XXX 25 26 27
# 16: 31-05-2022 YYY 61 62 63
# 17: 31-07-2022 XXX 31 32 33
# 18: 31-07-2022 YYY 67 68 69
# 19: 31-08-2022 XXX 34 35 36
# 20: 31-08-2022 YYY 70 71 72
# 21: 31-10-2022 XXX 40 41 42
# 22: 31-10-2022 YYY 76 77 78
# 23: 31-12-2021 XXX 10 11 12
# 24: 31-12-2021 YYY 46 47 48
# Date Segment aa_a bbb ccc
仅供参考,下面解释这个正则表达式(另可参阅关于正则表达式的详尽回答):
_
是文字下划线字符。[0-9]
表示(单个)“字符类”,匹配 “0” 到 “9” 之间的任意一个字符,而不是 0 到 9 之间的数值。例如,19
是两个字符,每个字符都介于 “0” 和 “9” 之间。+
表示“一个或多个”.$
表示 “字符串结尾”.
所以"aaa_24"
和"aa_a_9999999"
会被更新; "aaa_"
和 "aaa_z"
和 "aaa_24b"
不会。
数据
# Sample input matching the table in the question: 12 month-end dates for
# each of the segments "XXX" and "YYY", three variables per date, and values
# 10..81 in row order.
# Each variable name carries the running index of its date as a "_<n>"
# suffix ("aa_a_1", "bbb_1", ..., "ccc_12"). A dump with the suffixes already
# removed would not reproduce the question's input and would make the
# suffix-stripping step a no-op.
DT <- local({
  month_ends <- c(
    "31-12-2021", "31-01-2022", "28-02-2022", "31-03-2022",
    "30-04-2022", "31-05-2022", "30-06-2022", "31-07-2022",
    "31-08-2022", "30-09-2022", "31-10-2022", "30-11-2022"
  )
  base_names <- c("aa_a", "bbb", "ccc")
  segments <- c("XXX", "YYY")
  n_dates <- length(month_ends)
  n_vars <- length(base_names)
  n_segments <- length(segments)
  setDT(data.frame(
    Date = rep(rep(month_ends, each = n_vars), times = n_segments),
    Segment = rep(segments, each = n_dates * n_vars),
    Variable = paste0(
      rep(base_names, times = n_dates * n_segments),
      "_",
      rep(rep(seq_len(n_dates), each = n_vars), times = n_segments)
    ),
    value = 10:81,
    stringsAsFactors = FALSE
  ))
})