如何在所有组合中使用 SD 取平均值
how to make an average with SD across all combination
我有这样的数据
# Example input: a label column (`data`) followed by replicate pairs whose
# names follow the <group>_<replicate> scheme (G1, G2, Vok3, Vok4).
df <- data.frame(
  data = c("A", "AA", "AAA", "AB", "ABN", "AHG", "ANBV"),
  G1_1 = c(1677.1, 956, 57.7, 44.5, 1578.2, 1184.8, 1958.5),
  G1_2 = c(1296.9, 1230.2, 68.4, 57.5, 925.1, 1275.7, 2143.6),
  G2_1 = c(7012.5, 2087.9, 146.7, 80, 6278.5, 3076.9, 808),
  G2_2 = c(13470.8, 3926.6, 226.5, 212.6, 12569.4, 6569, 2479.8),
  Vok3_1 = c(911, 832.8, 50.8, 34.8, 916.6, 1358.5, 1540.4),
  Vok3_2 = c(5121.9, 1029.6, 108.9, 135, 3624.5, 1971.3, 1851.4),
  Vok4_1 = c(5136.7, 892.1, 78, 60.4, 3106.6, 1578.6, 1132.1),
  Vok4_2 = c(6313.4, 1180.8, 112, 163.4, 4639.8, 2506.4, 879.1),
  stringsAsFactors = FALSE  # keep `data` as character on R < 4.0 as well
)
我想做的是求sd和median,没成功
这是我尝试的和我想要得到的
# Asker's attempt, reported above as not working; kept verbatim.
# NOTE(review): base substr() takes (x, start, stop), so the extra positional
# arguments 3 and 4 look like the cause of the failure -- confirm.
df_med <- as.data.frame(df %>% group_by(data = substr(data, 1, 2,3,4)) %>% summarise_all(funs(median)))
我想要的是:对每一行,计算每一对列的中位数或 sd,例如 G1_1 和 G1_2 之间、G2_1 和 G2_2 之间,依此类推
我找的输出是这样的
data G1 G2 Vok3 Vok4
A 1487 10241.65 3016.45 5725.05
AA 1093.1 3007.25 931.2 1036.45
AAA 63.05 186.6 79.85 95
AB 51 146.3 84.9 111.9
ABN 1251.65 9423.95 2270.55 3873.2
AHG 1230.25 4822.95 1664.9 2042.5
ANBV 2051.05 1643.9 1695.9 1005.6
在按组 summarise 之前,我们可能需要先重塑为 'long' 格式(pivot_longer):
- 将第一列 'data' 以外的列重塑为 'long',并在 names_pattern 中捕获列名里 _ 之前的子字符串
- 按 'data' 分组,再用 summarise 配合 across 对其余所有列求 median
library(dplyr)
library(tidyr)
# Per-row median of each replicate group (G1, G2, Vok3, Vok4).
# names_to = ".value" turns the captured prefix of every column name (the part
# before "_<digits>") into an output column, so each `data` value gets one row
# per replicate; the grouped summarise then collapses those rows to a median.
# The backslash is doubled because "\d" is not a valid escape in an R string.
df_median <- df %>%
  pivot_longer(
    cols = -data,
    names_to = ".value",
    names_pattern = "(.*)_\\d+"
  ) %>%
  group_by(data) %>%
  # Wrap median() in a function: passing na.rm through across(...) is
  # deprecated in dplyr >= 1.1.
  dplyr::summarise(across(everything(), function(x) median(x, na.rm = TRUE)))
-输出
df_median
# A tibble: 7 x 5
data G1 G2 Vok3 Vok4
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 1487 10242. 3016. 5725.
2 AA 1093. 3007. 931. 1036.
3 AAA 63.0 187. 79.8 95
4 AB 51 146. 84.9 112.
5 ABN 1252. 9424. 2271. 3873.
6 AHG 1230. 4823. 1665. 2042.
7 ANBV 2051. 1644. 1696. 1006.
得到宽格式的结果后,我们可以再把它转换回 'long' 格式:
# Reshape the wide per-group medians back to long form: one row per
# (data, key) pair, ordered by `data`.
df_median %>%
  pivot_longer(
    cols = -data,
    names_to = "key",
    values_to = "value"  # the default, spelled out for readability
  ) %>%
  arrange(data)
# A tibble: 28 x 3
data key value
<chr> <chr> <dbl>
1 A G1 1487
2 A G2 10242.
3 A Vok3 3016.
4 A Vok4 5725.
5 AA G1 1093.
6 AA G2 3007.
7 AA Vok3 931.
8 AA Vok4 1036.
9 AAA G1 63.0
10 AAA G2 187.
# … with 18 more rows
或者不分两步进行:我们先把所有列重塑为 'long',再取 'key' 列的子字符串用于分组,从而得到 median
library(stringr)
# One-step variant: go long first, strip the "_<digits>" replicate suffix from
# the column name, and take the median per (data, key) pair.
# The backslash is doubled because "\d" is not a valid escape in an R string;
# the pattern is anchored with `$` so only a trailing suffix is removed.
df %>%
  pivot_longer(cols = -data, names_to = "key") %>%
  group_by(data, key = str_remove(key, "_\\d+$")) %>%
  summarise(value = median(value), .groups = "drop")
# A tibble: 28 x 3
data key value
<chr> <chr> <dbl>
1 A G1 1487
2 A G2 10242.
3 A Vok3 3016.
4 A Vok4 5725.
5 AA G1 1093.
6 AA G2 3007.
7 AA Vok3 931.
8 AA Vok4 1036.
9 AAA G1 63.0
10 AAA G2 187.
# … with 18 more rows
如果 dplyr 版本较旧,则使用 summarise_all:
# Same computation for older dplyr releases without across():
# summarise_all() applies median(na.rm = TRUE) to every non-grouping column.
# The backslash is doubled because "\d" is not a valid escape in an R string.
df %>%
  pivot_longer(
    cols = -data,
    names_to = ".value",
    names_pattern = "(.*)_\\d+"
  ) %>%
  group_by(data) %>%
  dplyr::summarise_all(median, na.rm = TRUE)
# A tibble: 7 x 5
data G1 G2 Vok3 Vok4
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 1487 10242. 3016. 5725.
2 AA 1093. 3007. 931. 1036.
3 AAA 63.0 187. 79.8 95
4 AB 51 146. 84.9 112.
5 ABN 1252. 9424. 2271. 3873.
6 AHG 1230. 4823. 1665. 2042.
7 ANBV 2051. 1644. 1696. 1006.
或者用 base R 的 split.default 把列拆分成由 data.frame 组成的 list,转换成 matrix 之后再用 matrixStats 的 rowMedians
library(matrixStats)
# Base R variant: split the value columns into one data.frame per replicate
# group (column name minus its trailing "_<digits>"), then take row medians.
# The backslash is doubled because "\d" is not a valid escape in an R string.
# vapply() pins the result to one numeric vector of length nrow(df) per group,
# unlike sapply(), whose return type depends on the input.
cbind(
  df[1],
  vapply(
    split.default(df[-1], sub("_\\d+$", "", names(df)[-1])),
    function(x) rowMedians(as.matrix(x)),
    numeric(nrow(df))
  )
)
data G1 G2 Vok3 Vok4
1 A 1487.00 10241.65 3016.45 5725.05
2 AA 1093.10 3007.25 931.20 1036.45
3 AAA 63.05 186.60 79.85 95.00
4 AB 51.00 146.30 84.90 111.90
5 ABN 1251.65 9423.95 2270.55 3873.20
6 AHG 1230.25 4822.95 1664.90 2042.50
7 ANBV 2051.05 1643.90 1695.90 1005.60
我有这样的数据
# Example input: a label column (`data`) followed by replicate pairs whose
# names follow the <group>_<replicate> scheme (G1, G2, Vok3, Vok4).
df <- data.frame(
  data = c("A", "AA", "AAA", "AB", "ABN", "AHG", "ANBV"),
  G1_1 = c(1677.1, 956, 57.7, 44.5, 1578.2, 1184.8, 1958.5),
  G1_2 = c(1296.9, 1230.2, 68.4, 57.5, 925.1, 1275.7, 2143.6),
  G2_1 = c(7012.5, 2087.9, 146.7, 80, 6278.5, 3076.9, 808),
  G2_2 = c(13470.8, 3926.6, 226.5, 212.6, 12569.4, 6569, 2479.8),
  Vok3_1 = c(911, 832.8, 50.8, 34.8, 916.6, 1358.5, 1540.4),
  Vok3_2 = c(5121.9, 1029.6, 108.9, 135, 3624.5, 1971.3, 1851.4),
  Vok4_1 = c(5136.7, 892.1, 78, 60.4, 3106.6, 1578.6, 1132.1),
  Vok4_2 = c(6313.4, 1180.8, 112, 163.4, 4639.8, 2506.4, 879.1),
  stringsAsFactors = FALSE  # keep `data` as character on R < 4.0 as well
)
我想做的是求sd和median,没成功
这是我尝试的和我想要得到的
# Asker's attempt, reported above as not working; kept verbatim.
# NOTE(review): base substr() takes (x, start, stop), so the extra positional
# arguments 3 and 4 look like the cause of the failure -- confirm.
df_med <- as.data.frame(df %>% group_by(data = substr(data, 1, 2,3,4)) %>% summarise_all(funs(median)))
我想要的是:对每一行,计算每一对列的中位数或 sd,例如 G1_1 和 G1_2 之间、G2_1 和 G2_2 之间,依此类推
我找的输出是这样的
data G1 G2 Vok3 Vok4
A 1487 10241.65 3016.45 5725.05
AA 1093.1 3007.25 931.2 1036.45
AAA 63.05 186.6 79.85 95
AB 51 146.3 84.9 111.9
ABN 1251.65 9423.95 2270.55 3873.2
AHG 1230.25 4822.95 1664.9 2042.5
ANBV 2051.05 1643.9 1695.9 1005.6
在按组 summarise 之前,我们可能需要先重塑为 'long' 格式(pivot_longer):
- 将第一列 'data' 以外的列重塑为 'long',并在 names_pattern 中捕获列名里 _ 之前的子字符串
- 按 'data' 分组,再用 summarise 配合 across 对其余所有列求 median
library(dplyr)
library(tidyr)
# Per-row median of each replicate group (G1, G2, Vok3, Vok4).
# names_to = ".value" turns the captured prefix of every column name (the part
# before "_<digits>") into an output column, so each `data` value gets one row
# per replicate; the grouped summarise then collapses those rows to a median.
# The backslash is doubled because "\d" is not a valid escape in an R string.
df_median <- df %>%
  pivot_longer(
    cols = -data,
    names_to = ".value",
    names_pattern = "(.*)_\\d+"
  ) %>%
  group_by(data) %>%
  # Wrap median() in a function: passing na.rm through across(...) is
  # deprecated in dplyr >= 1.1.
  dplyr::summarise(across(everything(), function(x) median(x, na.rm = TRUE)))
-输出
df_median
# A tibble: 7 x 5
data G1 G2 Vok3 Vok4
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 1487 10242. 3016. 5725.
2 AA 1093. 3007. 931. 1036.
3 AAA 63.0 187. 79.8 95
4 AB 51 146. 84.9 112.
5 ABN 1252. 9424. 2271. 3873.
6 AHG 1230. 4823. 1665. 2042.
7 ANBV 2051. 1644. 1696. 1006.
得到宽格式的结果后,我们可以再把它转换回 'long' 格式:
# Reshape the wide per-group medians back to long form: one row per
# (data, key) pair, ordered by `data`.
df_median %>%
  pivot_longer(
    cols = -data,
    names_to = "key",
    values_to = "value"  # the default, spelled out for readability
  ) %>%
  arrange(data)
# A tibble: 28 x 3
data key value
<chr> <chr> <dbl>
1 A G1 1487
2 A G2 10242.
3 A Vok3 3016.
4 A Vok4 5725.
5 AA G1 1093.
6 AA G2 3007.
7 AA Vok3 931.
8 AA Vok4 1036.
9 AAA G1 63.0
10 AAA G2 187.
# … with 18 more rows
或者不分两步进行:我们先把所有列重塑为 'long',再取 'key' 列的子字符串用于分组,从而得到 median
library(stringr)
# One-step variant: go long first, strip the "_<digits>" replicate suffix from
# the column name, and take the median per (data, key) pair.
# The backslash is doubled because "\d" is not a valid escape in an R string;
# the pattern is anchored with `$` so only a trailing suffix is removed.
df %>%
  pivot_longer(cols = -data, names_to = "key") %>%
  group_by(data, key = str_remove(key, "_\\d+$")) %>%
  summarise(value = median(value), .groups = "drop")
# A tibble: 28 x 3
data key value
<chr> <chr> <dbl>
1 A G1 1487
2 A G2 10242.
3 A Vok3 3016.
4 A Vok4 5725.
5 AA G1 1093.
6 AA G2 3007.
7 AA Vok3 931.
8 AA Vok4 1036.
9 AAA G1 63.0
10 AAA G2 187.
# … with 18 more rows
如果 dplyr 版本较旧,则使用 summarise_all:
# Same computation for older dplyr releases without across():
# summarise_all() applies median(na.rm = TRUE) to every non-grouping column.
# The backslash is doubled because "\d" is not a valid escape in an R string.
df %>%
  pivot_longer(
    cols = -data,
    names_to = ".value",
    names_pattern = "(.*)_\\d+"
  ) %>%
  group_by(data) %>%
  dplyr::summarise_all(median, na.rm = TRUE)
# A tibble: 7 x 5
data G1 G2 Vok3 Vok4
<chr> <dbl> <dbl> <dbl> <dbl>
1 A 1487 10242. 3016. 5725.
2 AA 1093. 3007. 931. 1036.
3 AAA 63.0 187. 79.8 95
4 AB 51 146. 84.9 112.
5 ABN 1252. 9424. 2271. 3873.
6 AHG 1230. 4823. 1665. 2042.
7 ANBV 2051. 1644. 1696. 1006.
或者用 base R 的 split.default 把列拆分成由 data.frame 组成的 list,转换成 matrix 之后再用 matrixStats 的 rowMedians
library(matrixStats)
# Base R variant: split the value columns into one data.frame per replicate
# group (column name minus its trailing "_<digits>"), then take row medians.
# The backslash is doubled because "\d" is not a valid escape in an R string.
# vapply() pins the result to one numeric vector of length nrow(df) per group,
# unlike sapply(), whose return type depends on the input.
cbind(
  df[1],
  vapply(
    split.default(df[-1], sub("_\\d+$", "", names(df)[-1])),
    function(x) rowMedians(as.matrix(x)),
    numeric(nrow(df))
  )
)
data G1 G2 Vok3 Vok4
1 A 1487.00 10241.65 3016.45 5725.05
2 AA 1093.10 3007.25 931.20 1036.45
3 AAA 63.05 186.60 79.85 95.00
4 AB 51.00 146.30 84.90 111.90
5 ABN 1251.65 9423.95 2270.55 3873.20
6 AHG 1230.25 4822.95 1664.90 2042.50
7 ANBV 2051.05 1643.90 1695.90 1005.60