R 中 purr::map 的引导功能
boot function with purr::map in R
我正在研究 bootstrap 使用启动包的两个样本 t 检验。在基因表达矩阵中,我想比较条件之间的基因,我的目标是找到表达的基因。
我有一个矩阵 5*12(5 个控制、7 个处理和 5 个基因),首先我将这个数据矩阵转换为 tibble 格式作为两个长向量,以便理解 tibble 结构并使我更容易。:
library(tidyverse)
#> Warning: package 'tidyr' was built under R version 4.0.4
#> Warning: package 'dplyr' was built under R version 4.0.4
library(magrittr)
library(boot)
##Gene Expression matrix
Exp.mat<- read.table( header = TRUE, text = " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )
##Vectorized format from matrix
Vec_Ex.Mat <- as_tibble(t(Exp.mat))
Vec_Ex.Mat$Cond <- as.factor(c(rep("1", 5), rep("2", 7)))
Vec_Ex.Mat <- Vec_Ex.Mat %>% gather(Var, Val, -Cond)
Vec_Ex.Mat<- Vec_Ex.Mat[, c(2, 3, 1)]
colnames(Vec_Ex.Mat) <- c("Gene", "Exp", "Cond")
head(Vec_Ex.Mat)
#> # A tibble: 6 x 3
#> Gene Exp Cond
#> <chr> <dbl> <fct>
#> 1 Gene1 1.68 1
#> 2 Gene1 0.322 1
#> 3 Gene1 1.72 1
#> 4 Gene1 1.91 1
#> 5 Gene1 1.27 1
#> 6 Gene1 1.68 2
##Created nested tibble
Nested_Ex.Mat <- Vec_Ex.Mat %>%
dplyr::group_by(Gene) %>%
tidyr::nest()
head(Nested_Ex.Mat)
#> # A tibble: 5 x 2
#> # Groups: Gene [5]
#> Gene data
#> <chr> <list>
#> 1 Gene1 <tibble[,2] [12 x 2]>
#> 2 Gene2 <tibble[,2] [12 x 2]>
#> 3 Gene3 <tibble[,2] [12 x 2]>
#> 4 Gene4 <tibble[,2] [12 x 2]>
#> 5 Gene5 <tibble[,2] [12 x 2]>
## Function for bootstrap
bootFun <- function(df, f) {
n <- nrow(df)
idx <- which(df[, 2] == 2)
idy <- which(df[, 2] == 1)
nx <- length(idx)
ny <- length(idy)
new.df <- df
new.df[idx, 1] <- df[idx, 1] - mean(df[idx, 1]) + mean(df[, 1])
new.df[idy, 1] <- df[idy, 1] - mean(df[idy, 1]) + mean(df[, 1])
df <- new.df
MX <- sum(df[idx, 1] * f[idx])/sum(f[idx])
SX <- sum(df[idx, 1]^2 * f[idx])/sum(f[idx]) - MX^2
SX <- nx * SX/(nx - 1)
MY <- sum(df[idy, 1] * f[idy])/sum(f[idy])
SY <- sum(df[idy, 1]^2 * f[idy])/sum(f[idy]) - MY^2
SY <- ny * SY/(ny - 1)
SXY <- sqrt((SX/nx) + (SY/ny))
(MX -MY)/SXY
}
##Bootstrap analysis with boot package using purrr::map
Nested_Ex.Mat %<>%
dplyr::mutate(booted = purrr::map(.x=data, ~ boot::boot(data= .x, sim = "ordinary", statistic = bootFun,R = 5,stype = "f", strata=.x[, 2])))
#> Error: Problem with `mutate()` input `booted`.
#> x 'list' object cannot be coerced to type 'double'
#> i Input `booted` is `purrr::map(...)`.
#> i The error occurred in group 1: Gene = "Gene1".
由 reprex package (v1.0.0)
于 2021-04-06 创建
我不明白的是我是否正确使用了索引,我不知道如何使用 tibble 矢量化格式的引导包引入这些数据。在示例 here 中,分析了单个列,它正在运行。但是我想通过引导包为每个基因使用 strata 选项,在 tibble 数据中有两列。是否有可能摆脱这种代码负载,或者我们是否可以通过更短的代码和正确的索引使函数更高效?你能分享你的知识和建议吗?谢谢。
我不确定你为什么要 bootstrap t 检验。 运行 t.test
函数似乎更容易。这是我的代码:
加载包
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)
获取数据
Exp.mat<- read.table(header = TRUE, text = " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )
查看数据
head(Exp.mat)
#> C1 C2 C3 C4 C5 T1 T2
#> Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389
#> Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023
#> Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405
#> Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747
#> Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895
#> T3 T4 T5 T6 T7
#> Gene1 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
#> Gene2 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
#> Gene3 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
#> Gene4 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
#> Gene5 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
格式化数据
Exp.mat.new <- Exp.mat %>%
# Convert row names to a column
rownames_to_column('gene') %>%
# "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
pivot_longer(cols = -gene,
names_to = 'cond',
values_to = 'exp') %>%
# Fix condition
mutate(cond = case_when(
str_detect(cond, pattern = 'C') ~ 'C',
TRUE ~ 'T')) %>%
# Nest
group_by(gene) %>%
nest()
执行分析
Exp.mat.new <- Exp.mat.new %>%
# Perform t-test
mutate(t_test = map(.x = data,
~ t.test(.x$exp ~ .x$cond))) %>%
# Extract vital statistics
mutate(C_minus_T_95CI = map(.x = t_test, # 95% CI of the mean difference between groups C and T
~ paste0(round(.x$conf.int[[1]], 3), ' to ',
round(.x$conf.int[[2]], 3))),
p_value = map(.x = t_test, # p-value
~ round(.x$p.value, 5))) %>%
# Unnest the data
unnest(cols = c(C_minus_T_95CI, p_value)) %>%
# Arrange by p-value
arrange(p_value)
打印数据帧
Exp.mat.new
#> # A tibble: 5 x 5
#> # Groups: gene [5]
#> gene data t_test C_minus_T_95CI p_value
#> <chr> <list> <list> <chr> <dbl>
#> 1 Gene2 <tibble [12 × 2]> <htest> -1.028 to -0.071 0.0288
#> 2 Gene3 <tibble [12 × 2]> <htest> -1.237 to 0.475 0.323
#> 3 Gene4 <tibble [12 × 2]> <htest> -0.482 to 1.251 0.345
#> 4 Gene5 <tibble [12 × 2]> <htest> -0.95 to 1.958 0.423
#> 5 Gene1 <tibble [12 × 2]> <htest> -0.914 to 0.66 0.712
如果需要,打印每个单独的 t 检验的完整结果
walk(.x = Exp.mat.new$t_test,
~ print(.x))
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = -2.6068, df = 8.8453, p-value = 0.02881
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -1.02805952 -0.07141179
#> sample estimates:
#> mean in group C mean in group T
#> 0.3678434 0.9175791
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = -1.0691, df = 6.4703, p-value = 0.3233
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -1.2367921 0.4754686
#> sample estimates:
#> mean in group C mean in group T
#> 2.195439 2.576101
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = 0.99278, df = 9.7754, p-value = 0.3448
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -0.4816528 1.2514667
#> sample estimates:
#> mean in group C mean in group T
#> 1.838916 1.454009
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = 0.86423, df = 5.5685, p-value = 0.4231
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -0.9503223 1.9584180
#> sample estimates:
#> mean in group C mean in group T
#> 1.506200 1.002152
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = -0.38483, df = 6.6963, p-value = 0.7123
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -0.9143974 0.6604509
#> sample estimates:
#> mean in group C mean in group T
#> 1.381890 1.508863
由 reprex package (v1.0.0)
于 2021-04-06 创建
首先我遇到了上述错误。之后我在引导功能中更改了层。 (“strata = .x$cond”而不是 strata=.x[ 2])。然后它在第一个 boot.fun 函数中返回 NA 和 mean 函数。我在 boot.fun 中将 mean 更改为 colMeans。它现在正在工作。我将 运行 与 matrxix 一起使用,以测试准确性。
library(tidyverse)
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)
library(boot)
Exp.mat<- read.table(header = TRUE, text = " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )
Exp.mat.new <- Exp.mat %>%
# Convert row names to a column
rownames_to_column('gene') %>%
# "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
pivot_longer(cols = -gene,
names_to = 'cond',
values_to = 'exp') %>%
# Fix condition
mutate(cond = case_when(
str_detect(cond, pattern = 'C') ~ '1',
TRUE ~ '2')) %>%
# Nest
group_by(gene) %>%
nest()
bootFun.tbl <- function(df,f) {
n <- nrow(df)
idx <- which(df[, 1] == 2) #select cond = 2 (treatment)
idy <- which(df[, 1] == 1) #select cond = 1 (control)
nx <- length(idx)
ny <- length(idy)
new.df <- df
new.df[idx, 2] <- df[idx, 2] - colMeans(df[idx, 2]) + colMeans(df[, 2])
new.df[idy, 2] <- df[idy, 2] - colMeans(df[idy, 2]) + colMeans(df[, 2])
df <- new.df
MX <- sum(df[idx, 2] * f[idx])/sum(f[idx])
SX <- sum(df[idx, 2]^2 * f[idx])/sum(f[idx]) - MX^2
SX <- nx * SX/(nx - 1)
MY <- sum(df[idy, 2] * f[idy])/sum(f[idy])
SY <- sum(df[idy, 2]^2 * f[idy])/sum(f[idy]) - MY^2
SY <- ny * SY/(ny - 1)
SXY <- sqrt((SX/nx) + (SY/ny))
(MX -MY)/SXY
}
Exp.mat.new <- Exp.mat.new %>%
# Perform bootstrap test
mutate(boot.fun = map(.x = data,
~boot::boot(data=.x, sim = "ordinary",
statistic = bootFun.tbl, R = 5,
stype = "f", strata = .x$cond)))
head(Exp.mat.new)
#> # A tibble: 5 x 3
#> # Groups: gene [5]
#> gene data boot.fun
#> <chr> <list> <list>
#> 1 Gene1 <tibble[,2] [12 × 2]> <boot>
#> 2 Gene2 <tibble[,2] [12 × 2]> <boot>
#> 3 Gene3 <tibble[,2] [12 × 2]> <boot>
#> 4 Gene4 <tibble[,2] [12 × 2]> <boot>
#> 5 Gene5 <tibble[,2] [12 × 2]> <boot>
head(Exp.mat.new$boot.fun)
#> [[1]]
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary",
#> stype = "f", strata = .x$cond)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -6.729777e-16 -1.237088 2.189771
#>
#> [[2]]
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary",
#> stype = "f", strata = .x$cond)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* 5.264653e-16 0.4360011 0.4969442
#>
由 reprex package (v2.0.0)
于 2021-04-08 创建
我正在研究 bootstrap 使用启动包的两个样本 t 检验。在基因表达矩阵中,我想比较条件之间的基因,我的目标是找到表达的基因。 我有一个矩阵 5*12(5 个控制、7 个处理和 5 个基因),首先我将这个数据矩阵转换为 tibble 格式作为两个长向量,以便理解 tibble 结构并使我更容易。:
library(tidyverse)
#> Warning: package 'tidyr' was built under R version 4.0.4
#> Warning: package 'dplyr' was built under R version 4.0.4
library(magrittr)
library(boot)
##Gene Expression matrix
Exp.mat<- read.table( header = TRUE, text = " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )
##Vectorized format from matrix
Vec_Ex.Mat <- as_tibble(t(Exp.mat))
Vec_Ex.Mat$Cond <- as.factor(c(rep("1", 5), rep("2", 7)))
Vec_Ex.Mat <- Vec_Ex.Mat %>% gather(Var, Val, -Cond)
Vec_Ex.Mat<- Vec_Ex.Mat[, c(2, 3, 1)]
colnames(Vec_Ex.Mat) <- c("Gene", "Exp", "Cond")
head(Vec_Ex.Mat)
#> # A tibble: 6 x 3
#> Gene Exp Cond
#> <chr> <dbl> <fct>
#> 1 Gene1 1.68 1
#> 2 Gene1 0.322 1
#> 3 Gene1 1.72 1
#> 4 Gene1 1.91 1
#> 5 Gene1 1.27 1
#> 6 Gene1 1.68 2
##Created nested tibble
Nested_Ex.Mat <- Vec_Ex.Mat %>%
dplyr::group_by(Gene) %>%
tidyr::nest()
head(Nested_Ex.Mat)
#> # A tibble: 5 x 2
#> # Groups: Gene [5]
#> Gene data
#> <chr> <list>
#> 1 Gene1 <tibble[,2] [12 x 2]>
#> 2 Gene2 <tibble[,2] [12 x 2]>
#> 3 Gene3 <tibble[,2] [12 x 2]>
#> 4 Gene4 <tibble[,2] [12 x 2]>
#> 5 Gene5 <tibble[,2] [12 x 2]>
## Function for bootstrap
bootFun <- function(df, f) {
n <- nrow(df)
idx <- which(df[, 2] == 2)
idy <- which(df[, 2] == 1)
nx <- length(idx)
ny <- length(idy)
new.df <- df
new.df[idx, 1] <- df[idx, 1] - mean(df[idx, 1]) + mean(df[, 1])
new.df[idy, 1] <- df[idy, 1] - mean(df[idy, 1]) + mean(df[, 1])
df <- new.df
MX <- sum(df[idx, 1] * f[idx])/sum(f[idx])
SX <- sum(df[idx, 1]^2 * f[idx])/sum(f[idx]) - MX^2
SX <- nx * SX/(nx - 1)
MY <- sum(df[idy, 1] * f[idy])/sum(f[idy])
SY <- sum(df[idy, 1]^2 * f[idy])/sum(f[idy]) - MY^2
SY <- ny * SY/(ny - 1)
SXY <- sqrt((SX/nx) + (SY/ny))
(MX -MY)/SXY
}
##Bootstrap analysis with boot package using purrr::map
Nested_Ex.Mat %<>%
dplyr::mutate(booted = purrr::map(.x=data, ~ boot::boot(data= .x, sim = "ordinary", statistic = bootFun,R = 5,stype = "f", strata=.x[, 2])))
#> Error: Problem with `mutate()` input `booted`.
#> x 'list' object cannot be coerced to type 'double'
#> i Input `booted` is `purrr::map(...)`.
#> i The error occurred in group 1: Gene = "Gene1".
由 reprex package (v1.0.0)
于 2021-04-06 创建我不明白的是我是否正确使用了索引,我不知道如何使用 tibble 矢量化格式的引导包引入这些数据。在示例 here 中,分析了单个列,它正在运行。但是我想通过引导包为每个基因使用 strata 选项,在 tibble 数据中有两列。是否有可能摆脱这种代码负载,或者我们是否可以通过更短的代码和正确的索引使函数更高效?你能分享你的知识和建议吗?谢谢。
我不确定你为什么要 bootstrap t 检验。 运行 t.test
函数似乎更容易。这是我的代码:
加载包
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)
获取数据
Exp.mat<- read.table(header = TRUE, text = " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )
查看数据
head(Exp.mat)
#> C1 C2 C3 C4 C5 T1 T2
#> Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389
#> Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023
#> Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405
#> Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747
#> Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895
#> T3 T4 T5 T6 T7
#> Gene1 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
#> Gene2 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
#> Gene3 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
#> Gene4 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
#> Gene5 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
格式化数据
Exp.mat.new <- Exp.mat %>%
# Convert row names to a column
rownames_to_column('gene') %>%
# "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
pivot_longer(cols = -gene,
names_to = 'cond',
values_to = 'exp') %>%
# Fix condition
mutate(cond = case_when(
str_detect(cond, pattern = 'C') ~ 'C',
TRUE ~ 'T')) %>%
# Nest
group_by(gene) %>%
nest()
执行分析
Exp.mat.new <- Exp.mat.new %>%
# Perform t-test
mutate(t_test = map(.x = data,
~ t.test(.x$exp ~ .x$cond))) %>%
# Extract vital statistics
mutate(C_minus_T_95CI = map(.x = t_test, # 95% CI of the mean difference between groups C and T
~ paste0(round(.x$conf.int[[1]], 3), ' to ',
round(.x$conf.int[[2]], 3))),
p_value = map(.x = t_test, # p-value
~ round(.x$p.value, 5))) %>%
# Unnest the data
unnest(cols = c(C_minus_T_95CI, p_value)) %>%
# Arrange by p-value
arrange(p_value)
打印数据帧
Exp.mat.new
#> # A tibble: 5 x 5
#> # Groups: gene [5]
#> gene data t_test C_minus_T_95CI p_value
#> <chr> <list> <list> <chr> <dbl>
#> 1 Gene2 <tibble [12 × 2]> <htest> -1.028 to -0.071 0.0288
#> 2 Gene3 <tibble [12 × 2]> <htest> -1.237 to 0.475 0.323
#> 3 Gene4 <tibble [12 × 2]> <htest> -0.482 to 1.251 0.345
#> 4 Gene5 <tibble [12 × 2]> <htest> -0.95 to 1.958 0.423
#> 5 Gene1 <tibble [12 × 2]> <htest> -0.914 to 0.66 0.712
如果需要,打印每个单独的 t 检验的完整结果
walk(.x = Exp.mat.new$t_test,
~ print(.x))
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = -2.6068, df = 8.8453, p-value = 0.02881
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -1.02805952 -0.07141179
#> sample estimates:
#> mean in group C mean in group T
#> 0.3678434 0.9175791
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = -1.0691, df = 6.4703, p-value = 0.3233
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -1.2367921 0.4754686
#> sample estimates:
#> mean in group C mean in group T
#> 2.195439 2.576101
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = 0.99278, df = 9.7754, p-value = 0.3448
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -0.4816528 1.2514667
#> sample estimates:
#> mean in group C mean in group T
#> 1.838916 1.454009
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = 0.86423, df = 5.5685, p-value = 0.4231
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -0.9503223 1.9584180
#> sample estimates:
#> mean in group C mean in group T
#> 1.506200 1.002152
#>
#>
#> Welch Two Sample t-test
#>
#> data: .x$exp by .x$cond
#> t = -0.38483, df = 6.6963, p-value = 0.7123
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#> -0.9143974 0.6604509
#> sample estimates:
#> mean in group C mean in group T
#> 1.381890 1.508863
由 reprex package (v1.0.0)
于 2021-04-06 创建首先我遇到了上述错误。之后我在引导功能中更改了层。 (“strata = .x$cond”而不是 strata=.x[ 2])。然后它在第一个 boot.fun 函数中返回 NA 和 mean 函数。我在 boot.fun 中将 mean 更改为 colMeans。它现在正在工作。我将 运行 与 matrxix 一起使用,以测试准确性。
library(tidyverse)
library(dplyr)
library(tidyr)
library(tibble)
library(stringr)
library(purrr)
library(boot)
Exp.mat<- read.table(header = TRUE, text = " C1 C2 C3 C4 C5 T1 T2 T3 T4 T5 T6 T7
Gene1 1.683340 0.3223701 1.72485303 1.9079350 1.2709514 1.682034 2.2272389 1.8203397 1.3749755 1.3140870 1.282419 0.8609480
Gene2 0.377944 0.2189322 0.08454824 0.5209215 0.6368712 1.045141 1.3999023 0.4671403 0.2733392 1.3171397 1.419082 0.5013091
Gene3 3.074032 1.9200940 2.11537958 2.6196671 1.2480232 2.677003 2.2899405 2.1760864 3.3651843 2.2385994 2.275105 3.0107882
Gene4 2.594239 1.3695119 1.89617796 2.2024559 1.1321975 2.178326 1.8842747 2.0992865 0.0000000 1.3404468 1.198157 1.4775754
Gene5 1.182900 3.4522132 1.58912876 1.0666626 0.2400953 1.159052 0.8113895 0.9986083 0.0000000 0.7091586 1.288114 2.0487426
" )
Exp.mat.new <- Exp.mat %>%
# Convert row names to a column
rownames_to_column('gene') %>%
# "Gather" the data using the pivot_longer function (preferred to the old "gather" function)
pivot_longer(cols = -gene,
names_to = 'cond',
values_to = 'exp') %>%
# Fix condition
mutate(cond = case_when(
str_detect(cond, pattern = 'C') ~ '1',
TRUE ~ '2')) %>%
# Nest
group_by(gene) %>%
nest()
bootFun.tbl <- function(df,f) {
n <- nrow(df)
idx <- which(df[, 1] == 2) #select cond = 2 (treatment)
idy <- which(df[, 1] == 1) #select cond = 1 (control)
nx <- length(idx)
ny <- length(idy)
new.df <- df
new.df[idx, 2] <- df[idx, 2] - colMeans(df[idx, 2]) + colMeans(df[, 2])
new.df[idy, 2] <- df[idy, 2] - colMeans(df[idy, 2]) + colMeans(df[, 2])
df <- new.df
MX <- sum(df[idx, 2] * f[idx])/sum(f[idx])
SX <- sum(df[idx, 2]^2 * f[idx])/sum(f[idx]) - MX^2
SX <- nx * SX/(nx - 1)
MY <- sum(df[idy, 2] * f[idy])/sum(f[idy])
SY <- sum(df[idy, 2]^2 * f[idy])/sum(f[idy]) - MY^2
SY <- ny * SY/(ny - 1)
SXY <- sqrt((SX/nx) + (SY/ny))
(MX -MY)/SXY
}
Exp.mat.new <- Exp.mat.new %>%
# Perform bootstrap test
mutate(boot.fun = map(.x = data,
~boot::boot(data=.x, sim = "ordinary",
statistic = bootFun.tbl, R = 5,
stype = "f", strata = .x$cond)))
head(Exp.mat.new)
#> # A tibble: 5 x 3
#> # Groups: gene [5]
#> gene data boot.fun
#> <chr> <list> <list>
#> 1 Gene1 <tibble[,2] [12 × 2]> <boot>
#> 2 Gene2 <tibble[,2] [12 × 2]> <boot>
#> 3 Gene3 <tibble[,2] [12 × 2]> <boot>
#> 4 Gene4 <tibble[,2] [12 × 2]> <boot>
#> 5 Gene5 <tibble[,2] [12 × 2]> <boot>
head(Exp.mat.new$boot.fun)
#> [[1]]
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary",
#> stype = "f", strata = .x$cond)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* -6.729777e-16 -1.237088 2.189771
#>
#> [[2]]
#>
#> STRATIFIED BOOTSTRAP
#>
#>
#> Call:
#> boot::boot(data = .x, statistic = bootFun.tbl, R = 5, sim = "ordinary",
#> stype = "f", strata = .x$cond)
#>
#>
#> Bootstrap Statistics :
#> original bias std. error
#> t1* 5.264653e-16 0.4360011 0.4969442
#>
由 reprex package (v2.0.0)
于 2021-04-08 创建