如何对多个数据子集的多个列使用 apply 函数?
How do I use the apply function over multiple columns for multiple subsets of data?
我有一个包含 943 列和 500 行的数据框(下面的示例)。
# Reproducible 13-row sample: Rep and Depth identify the subset a row belongs
# to, and each T* column holds numeric readings.
df <- data.frame(
  Rep = rep(c(1, 2, 3), times = c(5, 4, 4)),
  Depth = c("D", "D", "D", "M", "M", "M", "D", "D", "D", "M", "M", "D", "D"),
  T0 = c(
    -165, -163, -160, -161, -270, 165, -163,
    -160, -161, -270, -181, -231, -230
  ),
  T0.01 = c(
    458, 459, 457, 342, 158, 458, 459,
    457, 342, 158, 324, 333, 320
  ),
  T0.02 = c(
    -151, -153, -131, -125, -130, -151, -153,
    -131, -125, -130, -120, -130, -120
  )
)
我需要计算数据集中第 7:943 列的列中位数(这些都是数值列,列名都以 "T" 开头,如 T0、T0.01 等)。但是,我只需要特定行子集的列中位数。子集根据 "Rep" 和 "Depth" 来划分。例如,我需要 "Rep 1 at Depth D" 的列中位数向量,然后是 "Rep 1 at Depth M" 的列中位数向量。我总共有 24 个 Rep 和 3 个 Depth,每种组合都需要一个中位数向量,总共 3x24=72 个向量。结果应当是一个结构如下的表格(转置后的版本也可以):
# Desired layout of the result: one row of column medians per Rep x Depth
# combination. NOTE: this reuses the name `df`, so re-create the 13-row sample
# before running any pipeline on it.
df <- data.frame(
  Rep = rep(c(1, 2), each = 3),
  Depth = rep(c("D", "M", "S"), times = 2),
  T0 = c(-163, -160, -161, -270, 165, 165),
  T0.01 = c(458, 459, 457, 342, 158, 458),
  T0.02 = c(-151, -153, -131, -125, -130, -151)
)
Rep Depth T0 T0.01 T0.02
1 D -163 458 -151
1 M -160 459 -153
1 S -161 457 -131
2 D -270 342 -125
2 M 165 158 -130
2 S 165 458 -151
此外,我需要为这些相同的数据子集计算 7:943 列("T" 列)中所有单元格的方差。这将为每个子集生成一个数字(而不是向量)。
我已经尝试用 subset、tapply 和 grepl 函数来实现这些,但始终无法得到想要的结果。谢谢。
根据您提供的数据:
library(dplyr)

# Median and variance of every non-grouping column within each Rep x Depth
# subset. summarise(across()) replaces the deprecated summarise_each(funs()).
# across() emits its results variable by variable, so the final select() lists
# all medians first and all variances after them, giving the
# <column>_median ... <column>_var column layout.
df %>%
  group_by(Rep, Depth) %>%
  summarise(
    across(everything(), list(median = median, var = var)),
    .groups = "drop"
  ) %>%
  select(Rep, Depth, ends_with("_median"), ends_with("_var"))
Rep Depth T0_median T0.01_median T0.02_median T0_var T0.01_var T0.02_var
(dbl) (fctr) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
1 1 D -163.0 458.0 -151.0 6.333333 1.000 148.0000
2 1 M -215.5 250.0 -127.5 5940.500000 16928.000 12.5000
3 2 D -161.0 457.0 -131.0 2.333333 4486.333 217.3333
4 2 M 165.0 458.0 -151.0 NA NA NA
5 3 D -230.5 326.5 -125.0 0.500000 84.500 50.0000
6 3 M -225.5 241.0 -125.0 3960.500000 13778.000 50.0000
或者,如果您想让分组更具描述性:
# Same summary keyed by a single descriptive group label. Only columns whose
# names match "^T" are summarised, so Rep and Depth stay out of the result.
# summarise(across()) replaces the deprecated summarise_each(funs()); the
# select() puts all medians before all variances.
df %>%
  mutate(group = paste("Rep", Rep, "at Depth", Depth)) %>%
  group_by(group) %>%
  summarise(across(matches("^T"), list(median = median, var = var))) %>%
  select(group, ends_with("_median"), ends_with("_var"))
group T0_median T0.01_median T0.02_median T0_var T0.01_var T0.02_var
(chr) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
1 Rep 1 at Depth D -163.0 458.0 -151.0 6.333333 1.000 148.0000
2 Rep 1 at Depth M -215.5 250.0 -127.5 5940.500000 16928.000 12.5000
3 Rep 2 at Depth D -161.0 457.0 -131.0 2.333333 4486.333 217.3333
4 Rep 2 at Depth M 165.0 458.0 -151.0 NA NA NA
5 Rep 3 at Depth D -230.5 326.5 -125.0 0.500000 84.500 50.0000
6 Rep 3 at Depth M -225.5 241.0 -125.0 3960.500000 13778.000 50.0000
更新:如果您指的是按分组把所有数据列的全部单元格合在一起计算一个方差,是下面这个意思吗(这里的写法可能比实际需要的更复杂):
# One variance per group, pooled over every cell of the T* columns:
# pick() returns the group's T* columns as a data frame, unlist() flattens all
# of its cells into one numeric vector, and var() reduces that to one number.
# summarise() with pick() (dplyr >= 1.1.0) replaces the superseded
# do(data.frame(...)) form.
df %>%
  mutate(group = paste("Rep", Rep, "at Depth", Depth)) %>%
  group_by(group) %>%
  summarise(variance = var(unlist(pick(matches("^T")))))
group variance
(chr) (dbl)
1 Rep 1 at Depth D 93682.36
2 Rep 1 at Depth M 53501.60
3 Rep 2 at Depth D 81997.03
4 Rep 2 at Depth M 92764.33
5 Rep 3 at Depth D 70057.87
6 Rep 3 at Depth M 51781.50
我有一个包含 943 列和 500 行的数据框(下面的示例)。
# Reproducible 13-row sample: Rep and Depth identify the subset a row belongs
# to, and each T* column holds numeric readings.
df <- data.frame(
  Rep = rep(c(1, 2, 3), times = c(5, 4, 4)),
  Depth = c("D", "D", "D", "M", "M", "M", "D", "D", "D", "M", "M", "D", "D"),
  T0 = c(
    -165, -163, -160, -161, -270, 165, -163,
    -160, -161, -270, -181, -231, -230
  ),
  T0.01 = c(
    458, 459, 457, 342, 158, 458, 459,
    457, 342, 158, 324, 333, 320
  ),
  T0.02 = c(
    -151, -153, -131, -125, -130, -151, -153,
    -131, -125, -130, -120, -130, -120
  )
)
我需要计算数据集中第 7:943 列的列中位数(这些都是数值列,列名都以 "T" 开头,如 T0、T0.01 等)。但是,我只需要特定行子集的列中位数。子集根据 "Rep" 和 "Depth" 来划分。例如,我需要 "Rep 1 at Depth D" 的列中位数向量,然后是 "Rep 1 at Depth M" 的列中位数向量。我总共有 24 个 Rep 和 3 个 Depth,每种组合都需要一个中位数向量,总共 3x24=72 个向量。结果应当是一个结构如下的表格(转置后的版本也可以):
# Desired layout of the result: one row of column medians per Rep x Depth
# combination. NOTE: this reuses the name `df`, so re-create the 13-row sample
# before running any pipeline on it.
df <- data.frame(
  Rep = rep(c(1, 2), each = 3),
  Depth = rep(c("D", "M", "S"), times = 2),
  T0 = c(-163, -160, -161, -270, 165, 165),
  T0.01 = c(458, 459, 457, 342, 158, 458),
  T0.02 = c(-151, -153, -131, -125, -130, -151)
)
Rep Depth T0 T0.01 T0.02
1 D -163 458 -151
1 M -160 459 -153
1 S -161 457 -131
2 D -270 342 -125
2 M 165 158 -130
2 S 165 458 -151
此外,我需要为这些相同的数据子集计算 7:943 列("T" 列)中所有单元格的方差。这将为每个子集生成一个数字(而不是向量)。
我已经尝试用 subset、tapply 和 grepl 函数来实现这些,但始终无法得到想要的结果。谢谢。
根据您提供的数据:
library(dplyr)

# Median and variance of every non-grouping column within each Rep x Depth
# subset. summarise(across()) replaces the deprecated summarise_each(funs()).
# across() emits its results variable by variable, so the final select() lists
# all medians first and all variances after them, giving the
# <column>_median ... <column>_var column layout.
df %>%
  group_by(Rep, Depth) %>%
  summarise(
    across(everything(), list(median = median, var = var)),
    .groups = "drop"
  ) %>%
  select(Rep, Depth, ends_with("_median"), ends_with("_var"))
Rep Depth T0_median T0.01_median T0.02_median T0_var T0.01_var T0.02_var
(dbl) (fctr) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
1 1 D -163.0 458.0 -151.0 6.333333 1.000 148.0000
2 1 M -215.5 250.0 -127.5 5940.500000 16928.000 12.5000
3 2 D -161.0 457.0 -131.0 2.333333 4486.333 217.3333
4 2 M 165.0 458.0 -151.0 NA NA NA
5 3 D -230.5 326.5 -125.0 0.500000 84.500 50.0000
6 3 M -225.5 241.0 -125.0 3960.500000 13778.000 50.0000
或者,如果您想让分组更具描述性:
# Same summary keyed by a single descriptive group label. Only columns whose
# names match "^T" are summarised, so Rep and Depth stay out of the result.
# summarise(across()) replaces the deprecated summarise_each(funs()); the
# select() puts all medians before all variances.
df %>%
  mutate(group = paste("Rep", Rep, "at Depth", Depth)) %>%
  group_by(group) %>%
  summarise(across(matches("^T"), list(median = median, var = var))) %>%
  select(group, ends_with("_median"), ends_with("_var"))
group T0_median T0.01_median T0.02_median T0_var T0.01_var T0.02_var
(chr) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
1 Rep 1 at Depth D -163.0 458.0 -151.0 6.333333 1.000 148.0000
2 Rep 1 at Depth M -215.5 250.0 -127.5 5940.500000 16928.000 12.5000
3 Rep 2 at Depth D -161.0 457.0 -131.0 2.333333 4486.333 217.3333
4 Rep 2 at Depth M 165.0 458.0 -151.0 NA NA NA
5 Rep 3 at Depth D -230.5 326.5 -125.0 0.500000 84.500 50.0000
6 Rep 3 at Depth M -225.5 241.0 -125.0 3960.500000 13778.000 50.0000
更新:如果您指的是按分组把所有数据列的全部单元格合在一起计算一个方差,是下面这个意思吗(这里的写法可能比实际需要的更复杂):
# One variance per group, pooled over every cell of the T* columns:
# pick() returns the group's T* columns as a data frame, unlist() flattens all
# of its cells into one numeric vector, and var() reduces that to one number.
# summarise() with pick() (dplyr >= 1.1.0) replaces the superseded
# do(data.frame(...)) form.
df %>%
  mutate(group = paste("Rep", Rep, "at Depth", Depth)) %>%
  group_by(group) %>%
  summarise(variance = var(unlist(pick(matches("^T")))))
group variance
(chr) (dbl)
1 Rep 1 at Depth D 93682.36
2 Rep 1 at Depth M 53501.60
3 Rep 2 at Depth D 81997.03
4 Rep 2 at Depth M 92764.33
5 Rep 3 at Depth D 70057.87
6 Rep 3 at Depth M 51781.50