在 R 中使用 dplyr 按列分组计算 rowMeans(不使用 pivot_longer)
calculate rowMeans by column groupings in R in dplyr without pivot_longer
我有一个如下所示的数据框:
> df[1:5,1:10]
X F1_01 F1_03 F1_04 F1_06 F1_09 F1_14 F1_15 F1_16 F1_17
1 gene0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 gene1 3.420577 2.919879 2.287364 5.554634 2.233958 3.155860 2.946792 2.628113 2.702805
3 gene10 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
4 gene100 7.623784 7.035468 6.917434 6.276214 7.615697 5.822012 5.437085 4.691465 4.876582
5 gene1000 5.277115 6.184268 5.122632 5.827487 4.848992 3.419213 4.594827 4.123349 4.810539
每列分组如下:
# Lookup table mapping each sample column (ID) to its group label.
# The labels cycle A, B, C across the nine samples.
groups <- data.frame(
  ID = c(
    "F1_01", "F1_03", "F1_04", "F1_06", "F1_09",
    "F1_14", "F1_15", "F1_16", "F1_17"
  ),
  group = rep(c("A", "B", "C"), times = 3L)
)
我希望分别得到每个组(A、B 和 C)的 rowMeans。
我将如何在 dplyr 中指定它?我可以使用 pivot_longer:
# Reshape-based approach: stack every sample column into long form, attach
# each sample's group, average per (x, group), then put one column per group.
tmp %>%
  # Exclude the id column by name rather than by position.
  pivot_longer(-x, names_to = "ID") %>%
  # Name the join key explicitly instead of relying on the common-column guess.
  left_join(groups, by = "ID") %>%
  group_by(x, group) %>%
  # Drop the grouping so the result is a plain, ungrouped tibble.
  summarise(mean = mean(value), .groups = "drop") %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = group, values_from = mean)
但是,我不想使用 pivot_longer,因为原始数据框大约有 15k 行和 48 列,我尝试这样做时计算机崩溃了。是否可以改用 rowMeans?我有点卡住了,如能提供任何帮助,我将不胜感激。
数据
> dput(tmp)
# dput() output for `tmp`: a 5-row data.frame with nine numeric sample columns
# (F1_01 ... F1_17) followed by a character id column `x`.
# Assign the expression to `tmp` to recreate the sample data.
# NOTE(review): the third id is " id03" with a leading space -- confirm whether
# that is intended or a typo in the posted data.
structure(list(F1_01 = c(0, 3.420577, 0, 7.623784, 5.277115),
F1_03 = c(0, 2.919879, 0, 7.035468, 6.184268), F1_04 = c(0,
2.287364, 0, 6.917434, 5.122632), F1_06 = c(0, 5.554634,
0, 6.276214, 5.827487), F1_09 = c(0, 2.233958, 0, 7.615697,
4.848992), F1_14 = c(0, 3.15586, 0, 5.822012, 3.419213),
F1_15 = c(0, 2.946792, 0, 5.437085, 4.594827), F1_16 = c(0,
2.628113, 0, 4.691465, 4.123349), F1_17 = c(0, 2.702805,
0, 4.876582, 4.810539), x = c("id01", "id02", " id03", "id04",
"id05")), row.names = c(NA, 5L), class = "data.frame")
这是一个基本的 R 选项:
# Base R: average the sample columns of each group, row by row.
# Drop the trailing id column (`x`) so only the numeric samples remain.
tmp1 <- tmp[-ncol(tmp)]
# Look up each column's group by name, so the result does not depend on
# `groups` listing the IDs in the same order as the columns of `tmp`.
col_group <- groups$group[match(names(tmp1), groups$ID)]
# split.default() splits the data frame by column; vapply() guarantees a
# numeric matrix with one column per group, even when `tmp` has zero rows.
cbind(
  tmp[ncol(tmp)],
  vapply(split.default(tmp1, col_group), rowMeans, numeric(nrow(tmp1)))
)
# x A B C
#1 id01 0.000000 0.000000 0.000000
#2 id02 3.974001 2.593983 2.715343
#3 id03 0.000000 0.000000 0.000000
#4 id04 6.445694 6.447543 5.872009
#5 id05 5.233143 5.052203 4.450795
如果 groups 中 ID 的顺序与 tmp 的列名顺序不一致,请在应用上述答案之前先执行以下操作:
# Reorder the sample columns to follow groups$ID. as.character() guards against
# ID being a factor, in which case `[` would index by integer code, not by name.
tmp1 <- tmp1[as.character(groups$ID)]
使用 dplyr 和 purrr 的一种做法是:
# dplyr + purrr: keep the id column, then bind one row-mean column per group.
# Operates on `tmp` (the data shown above, with id column `x`).
# Columns are assigned to groups by position, so groups$group must list the
# groups in the same order as the sample columns of `tmp`.
tmp %>%
  select(x) %>%
  bind_cols(
    imap_dfc(
      # split.default() splits the data frame by column rather than by row.
      split.default(select(tmp, -x), groups$group),
      # .x holds one group's columns; .y is the group name (new column name).
      ~ transmute(.x, !!.y := rowMeans(.x))
    )
  )
x A B C
1 id01 0.000000 0.000000 0.000000
2 id02 3.974001 2.593983 2.715343
3 id03 0.000000 0.000000 0.000000
4 id04 6.445694 6.447543 5.872009
5 id05 5.233143 5.052203 4.450795
我有一个如下所示的数据框:
> df[1:5,1:10]
X F1_01 F1_03 F1_04 F1_06 F1_09 F1_14 F1_15 F1_16 F1_17
1 gene0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 gene1 3.420577 2.919879 2.287364 5.554634 2.233958 3.155860 2.946792 2.628113 2.702805
3 gene10 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
4 gene100 7.623784 7.035468 6.917434 6.276214 7.615697 5.822012 5.437085 4.691465 4.876582
5 gene1000 5.277115 6.184268 5.122632 5.827487 4.848992 3.419213 4.594827 4.123349 4.810539
每列分组如下:
# Lookup table mapping each sample column (ID) to its group label.
# The labels cycle A, B, C across the nine samples.
groups <- data.frame(
  ID = c(
    "F1_01", "F1_03", "F1_04", "F1_06", "F1_09",
    "F1_14", "F1_15", "F1_16", "F1_17"
  ),
  group = rep(c("A", "B", "C"), times = 3L)
)
我希望分别得到每个组(A、B 和 C)的 rowMeans。
我将如何在 dplyr 中指定它?我可以使用 pivot_longer:
# Reshape-based approach: stack every sample column into long form, attach
# each sample's group, average per (x, group), then put one column per group.
tmp %>%
  # Exclude the id column by name rather than by position.
  pivot_longer(-x, names_to = "ID") %>%
  # Name the join key explicitly instead of relying on the common-column guess.
  left_join(groups, by = "ID") %>%
  group_by(x, group) %>%
  # Drop the grouping so the result is a plain, ungrouped tibble.
  summarise(mean = mean(value), .groups = "drop") %>%
  # pivot_wider() replaces the superseded spread().
  pivot_wider(names_from = group, values_from = mean)
但是,我不想使用 pivot_longer,因为原始数据框大约有 15k 行和 48 列,我尝试这样做时计算机崩溃了。是否可以改用 rowMeans?我有点卡住了,如能提供任何帮助,我将不胜感激。
数据
> dput(tmp)
# dput() output for `tmp`: a 5-row data.frame with nine numeric sample columns
# (F1_01 ... F1_17) followed by a character id column `x`.
# Assign the expression to `tmp` to recreate the sample data.
# NOTE(review): the third id is " id03" with a leading space -- confirm whether
# that is intended or a typo in the posted data.
structure(list(F1_01 = c(0, 3.420577, 0, 7.623784, 5.277115),
F1_03 = c(0, 2.919879, 0, 7.035468, 6.184268), F1_04 = c(0,
2.287364, 0, 6.917434, 5.122632), F1_06 = c(0, 5.554634,
0, 6.276214, 5.827487), F1_09 = c(0, 2.233958, 0, 7.615697,
4.848992), F1_14 = c(0, 3.15586, 0, 5.822012, 3.419213),
F1_15 = c(0, 2.946792, 0, 5.437085, 4.594827), F1_16 = c(0,
2.628113, 0, 4.691465, 4.123349), F1_17 = c(0, 2.702805,
0, 4.876582, 4.810539), x = c("id01", "id02", " id03", "id04",
"id05")), row.names = c(NA, 5L), class = "data.frame")
这是一个基本的 R 选项:
# Base R: average the sample columns of each group, row by row.
# Drop the trailing id column (`x`) so only the numeric samples remain.
tmp1 <- tmp[-ncol(tmp)]
# Look up each column's group by name, so the result does not depend on
# `groups` listing the IDs in the same order as the columns of `tmp`.
col_group <- groups$group[match(names(tmp1), groups$ID)]
# split.default() splits the data frame by column; vapply() guarantees a
# numeric matrix with one column per group, even when `tmp` has zero rows.
cbind(
  tmp[ncol(tmp)],
  vapply(split.default(tmp1, col_group), rowMeans, numeric(nrow(tmp1)))
)
# x A B C
#1 id01 0.000000 0.000000 0.000000
#2 id02 3.974001 2.593983 2.715343
#3 id03 0.000000 0.000000 0.000000
#4 id04 6.445694 6.447543 5.872009
#5 id05 5.233143 5.052203 4.450795
如果 groups 中 ID 的顺序与 tmp 的列名顺序不一致,请在应用上述答案之前先执行以下操作:
# Reorder the sample columns to follow groups$ID. as.character() guards against
# ID being a factor, in which case `[` would index by integer code, not by name.
tmp1 <- tmp1[as.character(groups$ID)]
使用 dplyr 和 purrr 的一种做法是:
# dplyr + purrr: keep the id column, then bind one row-mean column per group.
# Operates on `tmp` (the data shown above, with id column `x`).
# Columns are assigned to groups by position, so groups$group must list the
# groups in the same order as the sample columns of `tmp`.
tmp %>%
  select(x) %>%
  bind_cols(
    imap_dfc(
      # split.default() splits the data frame by column rather than by row.
      split.default(select(tmp, -x), groups$group),
      # .x holds one group's columns; .y is the group name (new column name).
      ~ transmute(.x, !!.y := rowMeans(.x))
    )
  )
x A B C
1 id01 0.000000 0.000000 0.000000
2 id02 3.974001 2.593983 2.715343
3 id03 0.000000 0.000000 0.000000
4 id04 6.445694 6.447543 5.872009
5 id05 5.233143 5.052203 4.450795