如何在回收值时将数据帧的每一行除以数据帧中相应列的行
How to divide each row of a dataframe by rows of corresponding columns in a dataframe while recycling values
我的数据框如下所示:
# Example input: one identifier column, three numerator columns (*.x) and
# three denominator columns (*.y). Each *.y column holds a value only on the
# first row of a gene group; the remaining rows of the group are NA.
Gene.names <- c(
  "ESR", "ESR.1", "ESR.2", "ESR.3",
  "PKB", "PKB.1", "PKB.2", "PKB.3"
)
mean_0.x <- c(3, 2, 5, 9, 2, 4, 6, 7)
mean_1.x <- c(6, 2, 5, 1, 9, 1, 1, 9)
mean_2.x <- c(3, 2, 9, 9, 6, 7, 3, 3)
mean_0.y <- c(1, NA, NA, NA, 6, NA, NA, NA)
mean_1.y <- c(1, NA, NA, NA, 3, NA, NA, NA)
mean_2.y <- c(6, NA, NA, NA, 4, NA, NA, NA)
# check.names = FALSE keeps the column names exactly as the variable names.
df <- data.frame(
  Gene.names,
  mean_0.x, mean_1.x, mean_2.x,
  mean_0.y, mean_1.y, mean_2.y,
  check.names = FALSE
)
我想要的输出:
# Desired output: every *.x column divided by the matching *.y column, with
# values rounded by hand to two decimals.
Gene.names <- c("ESR", "ESR.1", "ESR.2", "ESR.3", "PKB", "PKB.1", "PKB.2", "PKB.3")
mean_0_diff <- c(3, 2, 5, 9, 0.33, 0.66, 1, 1.16)
# The seventh value was written as `.0.33`, which does not parse; it is 0.33.
mean_1_diff <- c(6, 2, 5, 1, 3, 0.33, 0.33, 3)
mean_2_diff <- c(0.5, 0.33, 1.5, 1.5, 1.5, 1.75, 0.75, 0.75)
df_out <- cbind.data.frame(Gene.names, mean_0_diff, mean_1_diff, mean_2_diff)
- 我的数据框包含数千行和 >50 列
- 我想将对应的列相除,例如
mean_0.x/mean_0.y; mean_1.x/mean_1.y; mean_2.x/mean_2.y;
等等
- 我想循环使用(recycle)*.y 中的行值:在此示例中,mean_0.y 中的每个值都要对 mean_0.x 中的 4 行重复使用。但是,在我的真实数据集中,这种 "recycling" 的重复次数是未知的,可以是任意次。
使用 tidyverse:
library(tidyverse)
# Keep the identifier column and append the element-wise quotient of the
# *.x columns over the *.y columns. fill() first carries each non-missing
# *.y value down into the NA rows beneath it.
res <- cbind(
  df[1],
  select(df, ends_with('x')) / fill(select(df, ends_with('y')), everything())
)
# Gene.names mean_0.x mean_1.x mean_2.x
# 1 ESR 3.0000000 6.0000000 0.5000000
# 2 ESR.1 2.0000000 2.0000000 0.3333333
# 3 ESR.2 5.0000000 5.0000000 1.5000000
# 4 ESR.3 9.0000000 1.0000000 1.5000000
# 5 PKB 0.3333333 3.0000000 1.5000000
# 6 PKB.1 0.6666667 0.3333333 1.7500000
# 7 PKB.2 1.0000000 0.3333333 0.7500000
# 8 PKB.3 1.1666667 3.0000000 0.7500000
这将是惯用的方式:
# Reshape to long form, split each column name into its base ("mean_0") and
# its suffix ("x" or "y"), divide x by y, then reshape back to wide form.
df %>%
  # Carry the last non-missing *.y value down into the NA rows below it.
  fill(ends_with("y")) %>%
  # Stack every column except the first (Gene.names) into key/value pairs.
  gather(key = "key", value = "value", -1) %>%
  # `sep` is a regular expression, so the literal dot must be escaped, and the
  # backslash itself must be doubled inside an R string literal.
  separate(key, c("key", "xy"), sep = "\\.") %>%
  spread(xy, value) %>%
  transmute(Gene.names, key, value = x / y) %>%
  spread(key, value)
# Gene.names mean_0 mean_1 mean_2
# 1 ESR 3.0000000 6.0000000 0.5000000
# 2 ESR.1 2.0000000 2.0000000 0.3333333
# 3 ESR.2 5.0000000 5.0000000 1.5000000
# 4 ESR.3 9.0000000 1.0000000 1.5000000
# 5 PKB 0.3333333 3.0000000 1.5000000
# 6 PKB.1 0.6666667 0.3333333 1.7500000
# 7 PKB.2 1.0000000 0.3333333 0.7500000
# 8 PKB.3 1.1666667 3.0000000 0.7500000
以下方法需要 zoo 包中的函数 na.locf。
# Locate the numerator (*.x) and denominator (*.y) columns by name suffix.
# The dot is escaped so that only a literal ".x" / ".y" suffix matches.
inx.x <- grep("\\.x$", names(df))
inx.y <- grep("\\.y$", names(df))
# Replace each NA in the *.y columns with the last observed value above it.
# na.rm = FALSE keeps leading NAs, so every column keeps its full length.
df[inx.y] <- lapply(df[inx.y], zoo::na.locf, na.rm = FALSE)
# Divide the two sets of columns element-wise; columns are paired by position,
# so the *.x and *.y columns must appear in the same order.
df_out2 <- cbind(df[1], df[inx.x] / df[inx.y])
# Strip the ".x" suffix (backslash doubled inside the R string) and add "_diff".
nms <- sub("\\.x$", "", names(df)[inx.x])
names(df_out2)[-1] <- paste(nms, "diff", sep = "_")
df_out2
# Gene.names mean_0_diff mean_1_diff mean_2_diff
#1 ESR 3.0000000 6.0000000 0.5000000
#2 ESR.1 2.0000000 2.0000000 0.3333333
#3 ESR.2 5.0000000 5.0000000 1.5000000
#4 ESR.3 9.0000000 1.0000000 1.5000000
#5 PKB 0.3333333 3.0000000 1.5000000
#6 PKB.1 0.6666667 0.3333333 1.7500000
#7 PKB.2 1.0000000 0.3333333 0.7500000
#8 PKB.3 1.1666667 3.0000000 0.7500000
请注意,结果不相等,因为您的结果是四舍五入的值:
# Compare the hand-rounded expected output with the computed result;
# all.equal() reports a mean relative difference for each column that differs.
all.equal(df_out, df_out2)
#[1] "Component “mean_0_diff”: Mean relative difference: 0.007751938"
#[2] "Component “mean_1_diff”: Mean relative difference: 0.01010101"
#[3] "Component “mean_2_diff”: Mean relative difference: 0.01010101"
另一种选择是直接处理宽格式(wide-format)的数据。基于 dplyr、使用 mutate_at 的解决方案可以写成:
library(dplyr)
# Group rows by the base gene name, i.e. the part of Gene.names before the
# first dot. Backslashes are doubled because "\w" and "\." are not valid
# escapes inside an R string literal.
df %>%
  group_by(Gene = gsub("(^\\w+)\\..*", "\\1", Gene.names)) %>%
  # For each column ending in .x, divide by the first value (within the group)
  # of the column with the same base name ending in .y.
  # NOTE(review): funs() is deprecated in newer dplyr releases; confirm the
  # installed version still supports it before relying on this form.
  mutate_at(
    vars(ends_with(".x")),
    funs(diff = . / get(sub("\\.x", ".y", quo_name(quo(.))))[1])
  ) %>%
  ungroup() %>%
  select(Gene.names, ends_with("diff"))
# # A tibble: 8 x 4
# Gene.names mean_0.x_diff mean_1.x_diff mean_2.x_diff
# <fctr> <dbl> <dbl> <dbl>
# 1 ESR 3.00 6.00 0.500
# 2 ESR.1 2.00 2.00 0.333
# 3 ESR.2 5.00 5.00 1.50
# 4 ESR.3 9.00 1.00 1.50
# 5 PKB 0.333 3.00 1.50
# 6 PKB.1 0.667 0.333 1.75
# 7 PKB.2 1.00 0.333 0.750
# 8 PKB.3 1.17 3.00 0.750
我的数据框如下所示:
# Example input: one identifier column, three numerator columns (*.x) and
# three denominator columns (*.y). Each *.y column holds a value only on the
# first row of a gene group; the remaining rows of the group are NA.
Gene.names <- c(
  "ESR", "ESR.1", "ESR.2", "ESR.3",
  "PKB", "PKB.1", "PKB.2", "PKB.3"
)
mean_0.x <- c(3, 2, 5, 9, 2, 4, 6, 7)
mean_1.x <- c(6, 2, 5, 1, 9, 1, 1, 9)
mean_2.x <- c(3, 2, 9, 9, 6, 7, 3, 3)
mean_0.y <- c(1, NA, NA, NA, 6, NA, NA, NA)
mean_1.y <- c(1, NA, NA, NA, 3, NA, NA, NA)
mean_2.y <- c(6, NA, NA, NA, 4, NA, NA, NA)
# check.names = FALSE keeps the column names exactly as the variable names.
df <- data.frame(
  Gene.names,
  mean_0.x, mean_1.x, mean_2.x,
  mean_0.y, mean_1.y, mean_2.y,
  check.names = FALSE
)
我想要的输出:
# Desired output: every *.x column divided by the matching *.y column, with
# values rounded by hand to two decimals.
Gene.names <- c("ESR", "ESR.1", "ESR.2", "ESR.3", "PKB", "PKB.1", "PKB.2", "PKB.3")
mean_0_diff <- c(3, 2, 5, 9, 0.33, 0.66, 1, 1.16)
# The seventh value was written as `.0.33`, which does not parse; it is 0.33.
mean_1_diff <- c(6, 2, 5, 1, 3, 0.33, 0.33, 3)
mean_2_diff <- c(0.5, 0.33, 1.5, 1.5, 1.5, 1.75, 0.75, 0.75)
df_out <- cbind.data.frame(Gene.names, mean_0_diff, mean_1_diff, mean_2_diff)
- 我的数据框包含数千行和 >50 列
- 我想将对应的列相除,例如
mean_0.x/mean_0.y; mean_1.x/mean_1.y; mean_2.x/mean_2.y;
等等
- 我想循环使用(recycle)*.y 中的行值:在此示例中,mean_0.y 中的每个值都要对 mean_0.x 中的 4 行重复使用。但是,在我的真实数据集中,这种 "recycling" 的重复次数是未知的,可以是任意次。
使用 tidyverse:
library(tidyverse)
# Keep the identifier column and append the element-wise quotient of the
# *.x columns over the *.y columns. fill() first carries each non-missing
# *.y value down into the NA rows beneath it.
res <- cbind(
  df[1],
  select(df, ends_with('x')) / fill(select(df, ends_with('y')), everything())
)
# Gene.names mean_0.x mean_1.x mean_2.x
# 1 ESR 3.0000000 6.0000000 0.5000000
# 2 ESR.1 2.0000000 2.0000000 0.3333333
# 3 ESR.2 5.0000000 5.0000000 1.5000000
# 4 ESR.3 9.0000000 1.0000000 1.5000000
# 5 PKB 0.3333333 3.0000000 1.5000000
# 6 PKB.1 0.6666667 0.3333333 1.7500000
# 7 PKB.2 1.0000000 0.3333333 0.7500000
# 8 PKB.3 1.1666667 3.0000000 0.7500000
这将是惯用的方式:
# Reshape to long form, split each column name into its base ("mean_0") and
# its suffix ("x" or "y"), divide x by y, then reshape back to wide form.
df %>%
  # Carry the last non-missing *.y value down into the NA rows below it.
  fill(ends_with("y")) %>%
  # Stack every column except the first (Gene.names) into key/value pairs.
  gather(key = "key", value = "value", -1) %>%
  # `sep` is a regular expression, so the literal dot must be escaped, and the
  # backslash itself must be doubled inside an R string literal.
  separate(key, c("key", "xy"), sep = "\\.") %>%
  spread(xy, value) %>%
  transmute(Gene.names, key, value = x / y) %>%
  spread(key, value)
# Gene.names mean_0 mean_1 mean_2
# 1 ESR 3.0000000 6.0000000 0.5000000
# 2 ESR.1 2.0000000 2.0000000 0.3333333
# 3 ESR.2 5.0000000 5.0000000 1.5000000
# 4 ESR.3 9.0000000 1.0000000 1.5000000
# 5 PKB 0.3333333 3.0000000 1.5000000
# 6 PKB.1 0.6666667 0.3333333 1.7500000
# 7 PKB.2 1.0000000 0.3333333 0.7500000
# 8 PKB.3 1.1666667 3.0000000 0.7500000
以下方法需要 zoo 包中的函数 na.locf。
# Locate the numerator (*.x) and denominator (*.y) columns by name suffix.
# The dot is escaped so that only a literal ".x" / ".y" suffix matches.
inx.x <- grep("\\.x$", names(df))
inx.y <- grep("\\.y$", names(df))
# Replace each NA in the *.y columns with the last observed value above it.
# na.rm = FALSE keeps leading NAs, so every column keeps its full length.
df[inx.y] <- lapply(df[inx.y], zoo::na.locf, na.rm = FALSE)
# Divide the two sets of columns element-wise; columns are paired by position,
# so the *.x and *.y columns must appear in the same order.
df_out2 <- cbind(df[1], df[inx.x] / df[inx.y])
# Strip the ".x" suffix (backslash doubled inside the R string) and add "_diff".
nms <- sub("\\.x$", "", names(df)[inx.x])
names(df_out2)[-1] <- paste(nms, "diff", sep = "_")
df_out2
# Gene.names mean_0_diff mean_1_diff mean_2_diff
#1 ESR 3.0000000 6.0000000 0.5000000
#2 ESR.1 2.0000000 2.0000000 0.3333333
#3 ESR.2 5.0000000 5.0000000 1.5000000
#4 ESR.3 9.0000000 1.0000000 1.5000000
#5 PKB 0.3333333 3.0000000 1.5000000
#6 PKB.1 0.6666667 0.3333333 1.7500000
#7 PKB.2 1.0000000 0.3333333 0.7500000
#8 PKB.3 1.1666667 3.0000000 0.7500000
请注意,结果不相等,因为您的结果是四舍五入的值:
# Compare the hand-rounded expected output with the computed result;
# all.equal() reports a mean relative difference for each column that differs.
all.equal(df_out, df_out2)
#[1] "Component “mean_0_diff”: Mean relative difference: 0.007751938"
#[2] "Component “mean_1_diff”: Mean relative difference: 0.01010101"
#[3] "Component “mean_2_diff”: Mean relative difference: 0.01010101"
另一种选择是直接处理宽格式(wide-format)的数据。基于 dplyr、使用 mutate_at 的解决方案可以写成:
library(dplyr)
# Group rows by the base gene name, i.e. the part of Gene.names before the
# first dot. Backslashes are doubled because "\w" and "\." are not valid
# escapes inside an R string literal.
df %>%
  group_by(Gene = gsub("(^\\w+)\\..*", "\\1", Gene.names)) %>%
  # For each column ending in .x, divide by the first value (within the group)
  # of the column with the same base name ending in .y.
  # NOTE(review): funs() is deprecated in newer dplyr releases; confirm the
  # installed version still supports it before relying on this form.
  mutate_at(
    vars(ends_with(".x")),
    funs(diff = . / get(sub("\\.x", ".y", quo_name(quo(.))))[1])
  ) %>%
  ungroup() %>%
  select(Gene.names, ends_with("diff"))
# # A tibble: 8 x 4
# Gene.names mean_0.x_diff mean_1.x_diff mean_2.x_diff
# <fctr> <dbl> <dbl> <dbl>
# 1 ESR 3.00 6.00 0.500
# 2 ESR.1 2.00 2.00 0.333
# 3 ESR.2 5.00 5.00 1.50
# 4 ESR.3 9.00 1.00 1.50
# 5 PKB 0.333 3.00 1.50
# 6 PKB.1 0.667 0.333 1.75
# 7 PKB.2 1.00 0.333 0.750
# 8 PKB.3 1.17 3.00 0.750