R:自动生存分析
R: Automated Survival Analysis
下面是示例数据,其中在 genomicmatrix
中,每一行对应一个基因 ("sample"
),每个单元格对应患者的该基因的一个值,该列以该值命名(格式为 "TCGA-__-____-__"
)。 (问题在下面继续)
genomicmatrix <- data.frame("sample" = c("BIX","HEF","TUR","ZOP","VAG"),
"TCGA-K4-6303-01" = runif(5, -1, 1),
"TCGA-DM-A28E-01" = runif(5, -1, 1),
"TCGA-AY-6197-01" = runif(5, -1, 1),
"TCGA-F4-6703-01" = runif(5, -1, 1),
"TCGA-HB-KH8H-01" = runif(5, -1, 1),
"TCGA-Y7-PIK2-01" = runif(5, -1, 1),
"TCGA-A6-5657-01" = runif(5, -1, 1))
colnames(genomicmatrix) <- gsub("[.]", "_",colnames(genomicmatrix))
sample = NULL
sample <- genomicmatrix$sample
genomicmatrix$sample = NULL
means = NULL
for(z in 1:nrow(genomicmatrix)) {
means[z] <- rowMeans(genomicmatrix[z,])
}
genemeans <- data.frame(sample, means)
因此,在如上所述找到每一行(基因)的平均值之后,我提取了该基因的值大于该基因的平均值的患者姓名。每个基因的 "greater than" 患者进入该基因的列表中称为 high
的元素(例如,对于第四个基因,"greater than" 患者出现在 high[[4]]
中)。 "lesser than" 患者也是如此,它们转到名为 low
.
的元素
high = NULL
low = NULL
high <- list(list())
low <- list(list())
uplist = NULL
downlist = NULL
for (i in 1:nrow(genomicmatrix)) {
uplist = NULL
downlist = NULL
for (y in seq_along(genomicmatrix)) {
uplist[y] <- ifelse(genomicmatrix[i,y] > genemeans$means[i], names(genomicmatrix[y]), "")
downlist[y] <- ifelse(genomicmatrix[i,y] < genemeans$means[i], names(genomicmatrix[y]), "")
high[[i]] <- uplist
low[[i]] <- downlist
}
}
因此,对于每个基因,我将患者分为 "high expression" 和 "low expression" 类别。例如,出现在 low[[3]]
中的患者是那些第三个基因 ("TUR"
) 的表达低于该基因的平均值的患者。下面,我有一个 patientID 的转换 table - 以天为单位的生存时间。
survival = NULL
survival$sampleID <- c("TCGA-K4-6303-01", "TCGA-DM-A28E-01", "TCGA-AY-6197-01", "TCGA-F4-6703-01", "TCGA-HB-KH8H-01", "TCGA-Y7-PIK2-01", "TCGA-A6-5657-01")
survival$X_OS <- c(256, 26, 88, 491, 553, 177, 732)
survival$sampleID <- chartr("-", "_", survival$sampleID)
我想从该设置中提取每个基因的对数秩检验 p 值。也就是说,例如,对于基因 1 ("BIX"
),给定高表达与低表达的 Kaplan-Meier 生存曲线(即 high[[1]]
与 low[[1]]
),我希望提取相应的pvalue 来自这两个向量的对数秩检验(回答问题:对于该基因,高表达和低表达患者的生存结果是否存在显着差异?)。一旦推导出 pvalue,它当然应该转移到下一个基因。
(如果您只是寻求执行操作的工具或统计建议,那么 Whosebug 可能不适合回答此问题。)
尽管如此,我建议对您的数据格式和代码进行一些改进,这应该有助于实现您在 R 中的目标。如果您没有明显的内存限制,您可以转换您的 "genomicmatrix"转换为长格式 "data.frame":
longDF = reshape(genomicmatrix, direction = "long", idvar = "sample",
varying = list(2:8), times = colnames(genomicmatrix[-1]),
timevar = "ID", v.names = "value")
row.names(longDF) = NULL
head(longDF)
# sample ID value
#1 BIX TCGA_K4_6303_01 -0.4811441
#2 HEF TCGA_K4_6303_01 -0.2665017
#3 TUR TCGA_K4_6303_01 0.8367469
#4 ZOP TCGA_K4_6303_01 -0.5868480
#5 VAG TCGA_K4_6303_01 -0.0319600
#6 BIX TCGA_DM_A28E_01 0.3435170
然后你可以找出哪些患者的表达高于和低于平均水平并创建 "data.frame":
exprs = do.call(rbind,
lapply(split(longDF, longDF$sample),
function(x) {
x$expr = ifelse(findInterval(x$value, mean(x$value)) == 1,
"high",
"low")
x
}))
row.names(exprs) = NULL
head(exprs)
# sample ID value expr
#1 BIX TCGA_K4_6303_01 -0.4811441 low
#2 BIX TCGA_DM_A28E_01 0.3435170 high
#3 BIX TCGA_AY_6197_01 0.2269158 high
#4 BIX TCGA_F4_6703_01 -0.8283441 low
#5 BIX TCGA_HB_KH8H_01 0.4024671 high
#6 BIX TCGA_Y7_PIK2_01 -0.2979979 low
然后添加"survival$X_OS":
exprs$X_OS = survival$X_OS[match(exprs$ID, survival$sampleID)]
head(exprs)
# sample ID value expr X_OS
#1 BIX TCGA_K4_6303_01 -0.4811441 low 256
#2 BIX TCGA_DM_A28E_01 0.3435170 high 26
#3 BIX TCGA_AY_6197_01 0.2269158 high 88
#4 BIX TCGA_F4_6703_01 -0.8283441 low 491
#5 BIX TCGA_HB_KH8H_01 0.4024671 high 553
#6 BIX TCGA_Y7_PIK2_01 -0.2979979 low 177
然后,假设你有一个函数 log_rank_test
接受两个向量并输出一个 "p.value" 你可以使用类似的东西:
#lapply(split(exprs[c("expr", "X_OS")], exprs$sample),
# function(x) log_rank_test(x$X_OS[x$expr == "high"], x$X_OS[x$expr == "low"]))
我正在尝试一种 "data.table" 方法,尽管它可能不符合习惯或可以改进,因为我不熟悉它:
library(data.table)
library(reshape2)
DT = as.data.table(genomicmatrix)
longDT = melt(DT, "sample", variable.name = "ID")
longDT[, expr := ifelse(findInterval(value, mean(value)) == 1, "high", "low"), by = sample]
longDT[, X_OS := survival$X_OS[match(ID, survival$sampleID)]]
head(longDT)
# sample ID value expr X_OS
#1: BIX TCGA_K4_6303_01 -0.4811441 low 256
#2: HEF TCGA_K4_6303_01 -0.2665017 low 256
#3: TUR TCGA_K4_6303_01 0.8367469 high 256
#4: ZOP TCGA_K4_6303_01 -0.5868480 low 256
#5: VAG TCGA_K4_6303_01 -0.0319600 low 256
#6: BIX TCGA_DM_A28E_01 0.3435170 high 26
然后,运行 你的 log_rank_test
功能如下:
#longDT[, log_rank_test(X_OS[expr == "high"], X_OS[expr == "low"]), by = sample]
下面是示例数据,其中在 genomicmatrix
中,每一行对应一个基因 ("sample"
),每个单元格对应患者的该基因的一个值,该列以该值命名(格式为 "TCGA-__-____-__"
)。 (问题在下面继续)
genomicmatrix <- data.frame("sample" = c("BIX","HEF","TUR","ZOP","VAG"),
"TCGA-K4-6303-01" = runif(5, -1, 1),
"TCGA-DM-A28E-01" = runif(5, -1, 1),
"TCGA-AY-6197-01" = runif(5, -1, 1),
"TCGA-F4-6703-01" = runif(5, -1, 1),
"TCGA-HB-KH8H-01" = runif(5, -1, 1),
"TCGA-Y7-PIK2-01" = runif(5, -1, 1),
"TCGA-A6-5657-01" = runif(5, -1, 1))
colnames(genomicmatrix) <- gsub("[.]", "_",colnames(genomicmatrix))
sample = NULL
sample <- genomicmatrix$sample
genomicmatrix$sample = NULL
means = NULL
for(z in 1:nrow(genomicmatrix)) {
means[z] <- rowMeans(genomicmatrix[z,])
}
genemeans <- data.frame(sample, means)
因此,在如上所述找到每一行(基因)的平均值之后,我提取了该基因的值大于该基因的平均值的患者姓名。每个基因的 "greater than" 患者进入该基因的列表中称为 high
的元素(例如,对于第四个基因,"greater than" 患者出现在 high[[4]]
中)。 "lesser than" 患者也是如此,它们转到名为 low
.
high = NULL
low = NULL
high <- list(list())
low <- list(list())
uplist = NULL
downlist = NULL
for (i in 1:nrow(genomicmatrix)) {
uplist = NULL
downlist = NULL
for (y in seq_along(genomicmatrix)) {
uplist[y] <- ifelse(genomicmatrix[i,y] > genemeans$means[i], names(genomicmatrix[y]), "")
downlist[y] <- ifelse(genomicmatrix[i,y] < genemeans$means[i], names(genomicmatrix[y]), "")
high[[i]] <- uplist
low[[i]] <- downlist
}
}
因此,对于每个基因,我将患者分为 "high expression" 和 "low expression" 类别。例如,出现在 low[[3]]
中的患者是那些第三个基因 ("TUR"
) 的表达低于该基因的平均值的患者。下面,我有一个 patientID 的转换 table - 以天为单位的生存时间。
survival = NULL
survival$sampleID <- c("TCGA-K4-6303-01", "TCGA-DM-A28E-01", "TCGA-AY-6197-01", "TCGA-F4-6703-01", "TCGA-HB-KH8H-01", "TCGA-Y7-PIK2-01", "TCGA-A6-5657-01")
survival$X_OS <- c(256, 26, 88, 491, 553, 177, 732)
survival$sampleID <- chartr("-", "_", survival$sampleID)
我想从该设置中提取每个基因的对数秩检验 p 值。也就是说,例如,对于基因 1 ("BIX"
),给定高表达与低表达的 Kaplan-Meier 生存曲线(即 high[[1]]
与 low[[1]]
),我希望提取相应的pvalue 来自这两个向量的对数秩检验(回答问题:对于该基因,高表达和低表达患者的生存结果是否存在显着差异?)。一旦推导出 pvalue,它当然应该转移到下一个基因。
(如果您只是寻求执行操作的工具或统计建议,那么 Whosebug 可能不适合回答此问题。)
尽管如此,我建议对您的数据格式和代码进行一些改进,这应该有助于实现您在 R 中的目标。如果您没有明显的内存限制,您可以转换您的 "genomicmatrix"转换为长格式 "data.frame":
longDF = reshape(genomicmatrix, direction = "long", idvar = "sample",
varying = list(2:8), times = colnames(genomicmatrix[-1]),
timevar = "ID", v.names = "value")
row.names(longDF) = NULL
head(longDF)
# sample ID value
#1 BIX TCGA_K4_6303_01 -0.4811441
#2 HEF TCGA_K4_6303_01 -0.2665017
#3 TUR TCGA_K4_6303_01 0.8367469
#4 ZOP TCGA_K4_6303_01 -0.5868480
#5 VAG TCGA_K4_6303_01 -0.0319600
#6 BIX TCGA_DM_A28E_01 0.3435170
然后你可以找出哪些患者的表达高于和低于平均水平并创建 "data.frame":
exprs = do.call(rbind,
lapply(split(longDF, longDF$sample),
function(x) {
x$expr = ifelse(findInterval(x$value, mean(x$value)) == 1,
"high",
"low")
x
}))
row.names(exprs) = NULL
head(exprs)
# sample ID value expr
#1 BIX TCGA_K4_6303_01 -0.4811441 low
#2 BIX TCGA_DM_A28E_01 0.3435170 high
#3 BIX TCGA_AY_6197_01 0.2269158 high
#4 BIX TCGA_F4_6703_01 -0.8283441 low
#5 BIX TCGA_HB_KH8H_01 0.4024671 high
#6 BIX TCGA_Y7_PIK2_01 -0.2979979 low
然后添加"survival$X_OS":
exprs$X_OS = survival$X_OS[match(exprs$ID, survival$sampleID)]
head(exprs)
# sample ID value expr X_OS
#1 BIX TCGA_K4_6303_01 -0.4811441 low 256
#2 BIX TCGA_DM_A28E_01 0.3435170 high 26
#3 BIX TCGA_AY_6197_01 0.2269158 high 88
#4 BIX TCGA_F4_6703_01 -0.8283441 low 491
#5 BIX TCGA_HB_KH8H_01 0.4024671 high 553
#6 BIX TCGA_Y7_PIK2_01 -0.2979979 low 177
然后,假设你有一个函数 log_rank_test
接受两个向量并输出一个 "p.value" 你可以使用类似的东西:
#lapply(split(exprs[c("expr", "X_OS")], exprs$sample),
# function(x) log_rank_test(x$X_OS[x$expr == "high"], x$X_OS[x$expr == "low"]))
我正在尝试一种 "data.table" 方法,尽管它可能不符合习惯或可以改进,因为我不熟悉它:
library(data.table)
library(reshape2)
DT = as.data.table(genomicmatrix)
longDT = melt(DT, "sample", variable.name = "ID")
longDT[, expr := ifelse(findInterval(value, mean(value)) == 1, "high", "low"), by = sample]
longDT[, X_OS := survival$X_OS[match(ID, survival$sampleID)]]
head(longDT)
# sample ID value expr X_OS
#1: BIX TCGA_K4_6303_01 -0.4811441 low 256
#2: HEF TCGA_K4_6303_01 -0.2665017 low 256
#3: TUR TCGA_K4_6303_01 0.8367469 high 256
#4: ZOP TCGA_K4_6303_01 -0.5868480 low 256
#5: VAG TCGA_K4_6303_01 -0.0319600 low 256
#6: BIX TCGA_DM_A28E_01 0.3435170 high 26
然后,运行 你的 log_rank_test
功能如下:
#longDT[, log_rank_test(X_OS[expr == "high"], X_OS[expr == "low"]), by = sample]