dplyr:使用行而不是列进行变异
dplyr: mutate using rows rather than columns
我的数据集是这样的
year Spp CO2 plantN_mean N15_mean plantN_sd N15_sd plantN_se N15_se
1 2004 A amb 17.136667 10.723333 1.2615202 0.7507552 0.7283391 0.4334487
2 2004 A elev 23.310000 13.043333 2.7160081 2.6595175 1.5680880 1.5354731
3 2004 AB amb 14.410000 10.156667 1.1363538 1.7773670 0.6560742 1.0261633
4 2004 AB elev 19.470000 14.786667 2.9173790 3.7358979 1.6843495 2.1569217
5 2004 AM amb 9.603333 13.510000 0.5515735 1.7176437 0.3184511 0.9916821
6 2004 AM elev 16.333333 9.743333 2.3622306 1.8825869 1.3638345 1.0869120
对于plantN_mean
和N15_mean
,我需要计算疗效比,即elev/amb
。我可以使用 plyr 为其中一个变量做这样的事情:
effect <- ddply(data, .(year,Spp), function (x){
plantN_ratio <- x$plantN_mean[x$CO2 == "elev"]/x$plantN_mean[x$CO2 == "amb"]
data.frame(plantN_ratio)
})
dplyr 版本是什么,plantN_mean
和 N15_mean
是什么?
我想答案可以这样开头:
effect <- summary %>% group_by(year,Spp) %>% mutate(
plantN_ratio=plantN_mean[CO2 == "elev"]/plantN_mean[CO2 == "amb"],
N15_ratio= N15_mean[CO2 == "elev"]/N15_mean[CO2 == "amb"])
tidyr 在这里很有用。
install_packages("tidyr")
library(tidyr)
library(dplyr) # for %>% from [magrittr][http://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html]
可重现的数据:
df <- structure(list(year = c(2004L, 2004L, 2004L, 2004L, 2004L, 2004L
), Spp = c("A", "A", "AB", "AB", "AM", "AM"), CO2 = c("amb",
"elev", "amb", "elev", "amb", "elev"), plantN_mean = c(17.136667,
23.31, 14.41, 19.47, 9.603333, 16.333333), N15_mean = c(10.723333,
13.043333, 10.156667, 14.786667, 13.51, 9.743333)), .Names = c("year",
"Spp", "CO2", "plantN_mean", "N15_mean"), row.names = c(NA, -6L
), class = c("tbl_df", "tbl", "data.frame"), .internal.selfref = <pointer: 0x0>)
df
看起来像:
year Spp CO2 plantN_mean N15_mean
1 2004 A amb 17.136667 10.723333
2 2004 A elev 23.310000 13.043333
3 2004 AB amb 14.410000 10.156667
4 2004 AB elev 19.470000 14.786667
5 2004 AM amb 9.603333 13.510000
6 2004 AM elev 16.333333 9.743333
1。让我们收集所有的平均值变量:
gdf <- df %>% group_by(year,Spp) %>% gather(mean_id,mean_val,plantN_mean:N15_mean)
gdf
看起来像:
year Spp CO2 mean_id mean_val
1 2004 A amb plantN_mean 17.136667
2 2004 A elev plantN_mean 23.310000
3 2004 AB amb plantN_mean 14.410000
4 2004 AB elev plantN_mean 19.470000
5 2004 AM amb plantN_mean 9.603333
6 2004 AM elev plantN_mean 16.333333
7 2004 A amb N15_mean 10.723333
8 2004 A elev N15_mean 13.043333
9 2004 AB amb N15_mean 10.156667
10 2004 AB elev N15_mean 14.786667
11 2004 AM amb N15_mean 13.510000
12 2004 AM elev N15_mean 9.743333
2。让我们根据 CO2
变量展开平均值:
sdf <- gdf %>% spread(CO2,mean_val)
sdf
看起来像:
year Spp mean_id amb elev
1 2004 A plantN_mean 17.136667 23.310000
2 2004 A N15_mean 10.723333 13.043333
3 2004 AB plantN_mean 14.410000 19.470000
4 2004 AB N15_mean 10.156667 14.786667
5 2004 AM plantN_mean 9.603333 16.333333
6 2004 AM N15_mean 13.510000 9.743333
3。现在计算 elev
/ amb
:
的比率
sdf %>% mutate(elev_o_amb = elev / amb)
获得:
year Spp mean_id amb elev elev_o_amb
1 2004 A plantN_mean 17.136667 23.310000 1.3602412
2 2004 A N15_mean 10.723333 13.043333 1.2163506
3 2004 AB plantN_mean 14.410000 19.470000 1.3511450
4 2004 AB N15_mean 10.156667 14.786667 1.4558582
5 2004 AM plantN_mean 9.603333 16.333333 1.7007984
6 2004 AM N15_mean 13.510000 9.743333 0.7211942
下面是我将如何使用 data.table
来解决这个问题
library(data.table)
cols <- c("plantN_mean", "N15_mean") # select which columns to modify
Myfunc <- function(x) x[2L]/x[1L] # define the function
setDT(df)[order(CO2), lapply(.SD, Myfunc), .SDcols = cols, by = .(year, Spp)]
# year Spp plantN_mean N15_mean
# 1: 2004 A 1.360241 1.2163506
# 2: 2004 AB 1.351145 1.4558582
# 3: 2004 AM 1.700798 0.7211942
或者,您可以使用 .SD[2L]/.SD[1L]
(请参阅@Aruns 评论)提高效率,无论您要修改多少列,它都只会执行一次 /
,因此会跳过 _each
部分(或 lapply
部分)
setDT(df)[order(CO2), .SD[2L]/.SD[1L], by=.(year, Spp), .SDcols = cols]
为了解释它是如何工作的,我们基本上是按 CO2
对数据进行排序,所以 amb
将 总是 在 elev
之前,然后我们只需将第二个实例除以第一个实例。 .SD
表示 Sub Data
这意味着我们的数据集具有 .SDcols
中指定的列
我的数据集是这样的
year Spp CO2 plantN_mean N15_mean plantN_sd N15_sd plantN_se N15_se
1 2004 A amb 17.136667 10.723333 1.2615202 0.7507552 0.7283391 0.4334487
2 2004 A elev 23.310000 13.043333 2.7160081 2.6595175 1.5680880 1.5354731
3 2004 AB amb 14.410000 10.156667 1.1363538 1.7773670 0.6560742 1.0261633
4 2004 AB elev 19.470000 14.786667 2.9173790 3.7358979 1.6843495 2.1569217
5 2004 AM amb 9.603333 13.510000 0.5515735 1.7176437 0.3184511 0.9916821
6 2004 AM elev 16.333333 9.743333 2.3622306 1.8825869 1.3638345 1.0869120
对于plantN_mean
和N15_mean
,我需要计算疗效比,即elev/amb
。我可以使用 plyr 为其中一个变量做这样的事情:
effect <- ddply(data, .(year,Spp), function (x){
plantN_ratio <- x$plantN_mean[x$CO2 == "elev"]/x$plantN_mean[x$CO2 == "amb"]
data.frame(plantN_ratio)
})
dplyr 版本是什么,plantN_mean
和 N15_mean
是什么?
我想答案可以这样开头:
effect <- summary %>% group_by(year,Spp) %>% mutate(
plantN_ratio=plantN_mean[CO2 == "elev"]/plantN_mean[CO2 == "amb"],
N15_ratio= N15_mean[CO2 == "elev"]/N15_mean[CO2 == "amb"])
tidyr 在这里很有用。
install_packages("tidyr")
library(tidyr)
library(dplyr) # for %>% from [magrittr][http://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html]
可重现的数据:
df <- structure(list(year = c(2004L, 2004L, 2004L, 2004L, 2004L, 2004L
), Spp = c("A", "A", "AB", "AB", "AM", "AM"), CO2 = c("amb",
"elev", "amb", "elev", "amb", "elev"), plantN_mean = c(17.136667,
23.31, 14.41, 19.47, 9.603333, 16.333333), N15_mean = c(10.723333,
13.043333, 10.156667, 14.786667, 13.51, 9.743333)), .Names = c("year",
"Spp", "CO2", "plantN_mean", "N15_mean"), row.names = c(NA, -6L
), class = c("tbl_df", "tbl", "data.frame"), .internal.selfref = <pointer: 0x0>)
df
看起来像:
year Spp CO2 plantN_mean N15_mean
1 2004 A amb 17.136667 10.723333
2 2004 A elev 23.310000 13.043333
3 2004 AB amb 14.410000 10.156667
4 2004 AB elev 19.470000 14.786667
5 2004 AM amb 9.603333 13.510000
6 2004 AM elev 16.333333 9.743333
1。让我们收集所有的平均值变量:
gdf <- df %>% group_by(year,Spp) %>% gather(mean_id,mean_val,plantN_mean:N15_mean)
gdf
看起来像:
year Spp CO2 mean_id mean_val
1 2004 A amb plantN_mean 17.136667
2 2004 A elev plantN_mean 23.310000
3 2004 AB amb plantN_mean 14.410000
4 2004 AB elev plantN_mean 19.470000
5 2004 AM amb plantN_mean 9.603333
6 2004 AM elev plantN_mean 16.333333
7 2004 A amb N15_mean 10.723333
8 2004 A elev N15_mean 13.043333
9 2004 AB amb N15_mean 10.156667
10 2004 AB elev N15_mean 14.786667
11 2004 AM amb N15_mean 13.510000
12 2004 AM elev N15_mean 9.743333
2。让我们根据 CO2
变量展开平均值:
sdf <- gdf %>% spread(CO2,mean_val)
sdf
看起来像:
year Spp mean_id amb elev
1 2004 A plantN_mean 17.136667 23.310000
2 2004 A N15_mean 10.723333 13.043333
3 2004 AB plantN_mean 14.410000 19.470000
4 2004 AB N15_mean 10.156667 14.786667
5 2004 AM plantN_mean 9.603333 16.333333
6 2004 AM N15_mean 13.510000 9.743333
3。现在计算 elev
/ amb
:
的比率
sdf %>% mutate(elev_o_amb = elev / amb)
获得:
year Spp mean_id amb elev elev_o_amb
1 2004 A plantN_mean 17.136667 23.310000 1.3602412
2 2004 A N15_mean 10.723333 13.043333 1.2163506
3 2004 AB plantN_mean 14.410000 19.470000 1.3511450
4 2004 AB N15_mean 10.156667 14.786667 1.4558582
5 2004 AM plantN_mean 9.603333 16.333333 1.7007984
6 2004 AM N15_mean 13.510000 9.743333 0.7211942
下面是我将如何使用 data.table
library(data.table)
cols <- c("plantN_mean", "N15_mean") # select which columns to modify
Myfunc <- function(x) x[2L]/x[1L] # define the function
setDT(df)[order(CO2), lapply(.SD, Myfunc), .SDcols = cols, by = .(year, Spp)]
# year Spp plantN_mean N15_mean
# 1: 2004 A 1.360241 1.2163506
# 2: 2004 AB 1.351145 1.4558582
# 3: 2004 AM 1.700798 0.7211942
或者,您可以使用 .SD[2L]/.SD[1L]
(请参阅@Aruns 评论)提高效率,无论您要修改多少列,它都只会执行一次 /
,因此会跳过 _each
部分(或 lapply
部分)
setDT(df)[order(CO2), .SD[2L]/.SD[1L], by=.(year, Spp), .SDcols = cols]
为了解释它是如何工作的,我们基本上是按 CO2
对数据进行排序,所以 amb
将 总是 在 elev
之前,然后我们只需将第二个实例除以第一个实例。 .SD
表示 Sub Data
这意味着我们的数据集具有 .SDcols