做,生存分析和 dplyr

do, survivalanalysis and dplyr

我正在尝试计算按医院、疾病等级和年份分层的不同患者的中位生存期。在 plyr 中,这相当简单,但我试图使用 dplyr 来加快速度(因为它的速度)。然而,我对生存结果被放置在数据框内的列表中的输出感到困惑。如何将生成的 m6 数据帧转换为 "useful" 数据帧以供后续分析? dplyr中的do函数有教程吗?

编辑 1: m6$ty2<-unlist(m6$te) 似乎成功了

但如果我改为

m6<-ml %>% group_by(year,Grade,hospital) %>% do(te=survfit(Surv(time=surv, event=status=="Dead")~-1,data=.))

如何访问 survfit 对象的各个元素?

这是我写的另一个代码。

m6<- ml %>% group_by(year,Grade,hospital) %>% 
     do(te=summary(survfit(Surv(time=surv, event=status=="Dead")~-1,data=.))$table['median'])

数据

    > dput(ml)
structure(list(year = structure(c(12L, 12L, 2L, 7L, 5L, 1L, 1L, 
9L, 3L, 5L, 3L, 7L, 11L, 3L, 13L, 5L, 6L, 6L, 9L, 13L, 7L, 5L, 
11L, 8L, 1L, 2L, 11L, 4L, 6L, 13L, 7L, 4L, 8L, 13L, 7L, 8L, 1L, 
9L, 7L, 5L, 6L, 5L, 10L, 13L, 7L, 10L, 10L, 8L, 13L, 11L, 11L, 
12L, 10L, 6L, 7L, 11L, 2L, 11L, 6L, 11L, 2L, 1L, 12L, 11L, 11L, 
12L, 8L, 4L, 4L, 11L, 13L, 8L, 6L, 5L, 10L, 3L, 12L, 3L, 11L, 
1L, 5L, 11L, 4L, 5L, 5L, 7L, 9L, 7L, 11L, 13L, 7L, 12L, 6L, 6L, 
11L, 2L, 7L, 10L, 7L, 7L, 8L, 8L, 4L, 12L, 8L, 3L, 12L, 3L, 11L, 
11L, 13L, 3L, 5L, 6L, 4L, 10L, 12L, 9L, 10L, 5L, 12L, 10L, 7L, 
2L, 7L, 6L, 12L, 10L, 3L, 2L, 7L, 5L, 2L, 12L, 10L, 3L, 6L, 9L, 
2L, 7L, 4L, 6L, 3L, 8L, 5L, 10L, 8L, 10L, 11L, 10L, 13L, 10L, 
5L, 3L, 11L, 5L, 1L, 7L, 13L, 13L, 9L, 13L, 12L, 10L, 4L, 13L, 
7L, 4L, 12L, 12L, 11L, 5L, 11L, 3L, 6L, 10L, 3L, 3L, 13L, 1L, 
13L, 6L, 6L, 12L, 1L, 6L, 5L, 10L, 2L, 5L, 11L, 9L, 9L, 9L, 2L, 
7L, 5L, 12L, 8L, 2L), .Label = c("2003", "2004", "2005", "2006", 
"2007", "2008", "2009", "2010", "2011", "2012", "2013", "2014", 
"2015"), class = "factor"), Grade = structure(c(3L, 2L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 3L, 2L, 
3L, 3L, 1L, 3L, 2L, 1L, 2L, 3L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 3L, 
3L, 1L, 3L, 2L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 
2L, 3L, 3L, 3L, 1L, 1L, 3L, 2L, 1L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 
2L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 2L, 1L, 3L, 3L, 3L, 1L, 2L, 2L, 
1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 
2L, 3L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 2L, 2L, 3L, 3L, 2L, 1L, 1L, 
3L, 2L, 3L, 2L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 1L, 3L, 2L, 
1L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 1L, 1L, 
2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 3L, 2L, 3L, 1L, 1L, 3L, 1L, 3L, 
3L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 
1L, 1L, 3L, 3L, 2L, 3L, 3L, 2L, 3L, 3L, 1L, 2L, 3L, 3L, 3L, 3L, 
3L, 2L, 3L, 1L), .Label = c("II", "III", "IV"), class = "factor"), 
    hospital = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 
    1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 
    2L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 
    1L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
    1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 
    1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 
    2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 
    2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 
    1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 1L, 2L, 1L), .Label = c("A", 
    "B"), class = "factor"), surv = c(0.298425735797399, 0.235455167693361, 
    0.243668720054757, 1.98220396988364, 0.955509924709103, 0.386036960985626, 
    0.503764544832307, 1.19644079397673, 0.139630390143737, 7.82477754962355, 
    1.3990417522245, 2.48596851471595, 2.19849418206708, 1.0321697467488, 
    0.369609856262834, 5.88364134154689, 4.54757015742642, 0.334017796030116, 
    0.498288843258042, 0.621492128678987, 0.563997262149213, 
    0.128678986995209, 2.40109514031485, 1.61533196440794, 0.142368240930869, 
    5.08966461327858, 1.89733059548255, 7.31553730321698, 1.27310061601643, 
    0.561259411362081, 6.07529089664613, 0.369609856262834, 0.287474332648871, 
    0.251882272416153, 6.21492128678987, 0.670773442847365, 0.54757015742642, 
    4.10130047912389, 0.865160848733744, 0.468172484599589, 0.271047227926078, 
    0.843258042436687, 1.3990417522245, 0.0739219712525667, 0.470910335386721, 
    0.183436002737851, 0.832306639288159, 0.427104722792608, 
    0.457221081451061, 2.02874743326489, 0.616016427104723, 1.00479123887748, 
    2.86105407255305, 0.449007529089665, 0.0711841204654346, 
    2.19849418206708, 2.00958247775496, 1.82067077344285, 0.334017796030116, 
    1.95756331279945, 5.9958932238193, 0.903490759753593, 0.824093086926763, 
    0.684462696783025, 0.629705681040383, 1.62628336755647, 1.94934976043806, 
    0.490075290896646, 9.07323750855578, 0.991101984941821, 0.0520191649555099, 
    0.900752908966461, 0.476386036960986, 0.298425735797399, 
    0.213552361396304, 1.25941136208077, 0.158795345653662, 9.89459274469541, 
    0.295687885010267, 11.8576317590691, 0.889801505817933, 2.05338809034908, 
    2.39014373716632, 0.509240246406571, 8.16427104722793, 3.68240930869268, 
    0.720054757015743, 2.79808350444901, 0.974674880219028, 0.678986995208761, 
    0.525667351129363, 1.4757015742642, 7.2662559890486, 0.125941136208077, 
    0.884325804243669, 0.281998631074606, 0.435318275154004, 
    1.43463381245722, 2.19028062970568, 0.0657084188911704, 5.50581793292266, 
    0.70362765229295, 8.95824777549624, 0.37782340862423, 5.47022587268994, 
    0.555783709787817, 0.731006160164271, 0.287474332648871, 
    1.23477070499658, 2.40383299110198, 0.462696783025325, 0.65160848733744, 
    0.194387405886379, 0.48186173853525, 7.68514715947981, 3.07460643394935, 
    0.0164271047227926, 3.93429158110883, 0.928131416837782, 
    1.91649555099247, 0.331279945242984, 0.528405201916496, 0.87337440109514, 
    2.71321013004791, 1.57152635181383, 2.87200547570157, 0.188911704312115, 
    0.810403832991102, 1.16906228610541, 11.2553045859001, 0.147843942505133, 
    6.18480492813142, 10.9103353867214, 0.714579055441478, 0.380561259411362, 
    1.88090349075975, 0.958247775496235, 0.0328542094455852, 
    0.191649555099247, 1.05133470225873, 1.01848049281314, 0.720054757015743, 
    1.91649555099247, 0.670773442847365, 1.40177960301164, 3.50992470910335, 
    5.42642026009583, 3.62765229295003, 0.884325804243669, 1.06776180698152, 
    0.123203285420945, 0.826830937713895, 5.637234770705, 1.72210814510609, 
    2.43668720054757, 0.292950034223135, 0.375085557837098, 5.90828199863107, 
    0.54757015742642, -0.0164271047227926, 4.7337440109514, 0.136892539356605, 
    1.08145106091718, 0.353182751540041, 1.23203285420945, 0.229979466119097, 
    2.42847364818617, 0.0383299110198494, 1.57426420260096, 0.49555099247091, 
    1.90006844626968, 3.0444900752909, 0.342231348391513, 0.364134154688569, 
    0.0109514031485284, 3.53456536618754, 1.3990417522245, 1.41820670773443, 
    0.0355920602327173, 0.607802874743327, 0.0191649555099247, 
    6.80903490759754, 0.243668720054757, 1.15263518138261, 0.900752908966461, 
    3.13210130047912, 0.659822039698836, 0.514715947980835, 0.706365503080082, 
    0.194387405886379, 2.58726899383984, 0.484599589322382, 1.01026694045175, 
    0.145106091718001, 1.85352498288843, 0.87337440109514, 0.383299110198494, 
    1.3305954825462, 0.0465434633812457, 6.65297741273101), status = c("Dead", 
    "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", 
    "Alive", "Dead", "Dead", "Dead", "Dead", "Alive", "Dead", 
    "Dead", "Dead", "Dead", "Alive", "Dead", "Dead", "Alive", 
    "Dead", "Dead", "Dead", "Alive", "Dead", "Dead", "Alive", 
    "Alive", "Dead", "Dead", "Alive", "Alive", "Dead", "Dead", 
    "Alive", "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", 
    "Dead", "Dead", "Dead", "Dead", "Alive", "Alive", "Dead", 
    "Dead", "Dead", "Dead", "Dead", "Alive", "Dead", "Alive", 
    "Dead", "Alive", "Dead", "Dead", "Dead", "Dead", "Dead", 
    "Alive", "Dead", "Dead", "Alive", "Dead", "Dead", "Dead", 
    "Dead", "Dead", "Dead", "Dead", "Dead", "Alive", "Dead", 
    "Alive", "Dead", "Alive", "Dead", "Dead", "Dead", "Dead", 
    "Dead", "Dead", "Dead", "Alive", "Dead", "Dead", "Alive", 
    "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", "Alive", 
    "Dead", "Alive", "Dead", "Alive", "Dead", "Dead", "Dead", 
    "Dead", "Alive", "Alive", "Dead", "Dead", "Dead", "Dead", 
    "Alive", "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", 
    "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", "Alive", 
    "Dead", "Dead", "Alive", "Dead", "Dead", "Dead", "Dead", 
    "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", "Dead", 
    "Alive", "Alive", "Alive", "Dead", "Dead", "Alive", "Dead", 
    "Dead", "Dead", "Alive", "Dead", "Dead", "Dead", "Dead", 
    "Alive", "Alive", "Alive", "Alive", "Dead", "Dead", "Alive", 
    "Dead", "Dead", "Alive", "Dead", "Alive", "Dead", "Dead", 
    "Dead", "Dead", "Alive", "Dead", "Dead", "Alive", "Dead", 
    "Alive", "Alive", "Dead", "Alive", "Dead", "Dead", "Dead", 
    "Dead", "Dead", "Dead", "Alive", "Dead", "Dead", "Dead", 
    "Dead", "Dead", "Dead", "Alive", "Dead", "Dead")), .Names = c("year", 
"Grade", "hospital", "surv", "status"), class = c("tbl_dt", "tbl", 
"data.table", "data.frame"), row.names = c(NA, -200L))

这是我现在能做的。但是,我认为这已经接近你想要的了。我认为挑战在于从列表中提取必要的组件并创建数据框。我一直在玩 purrr 包,并认为它会对我们有所帮助。我在代码中调用了 ml mydf

library(dplyr)
library(purrr)

# Run the model
group_by(mydf, year, Grade, hospital) %>%
do(te = survfit(Surv(time = surv, event = status == "Dead") ~ -1, data = .)) -> foo

选中 foo,您会看到一个新列 te,其中包含列表。

#     year  Grade hospital           te
#   (fctr) (fctr)   (fctr)        (chr)
#1    2014     IV        A <S3:survfit>
#2    2014    III        A <S3:survfit>
#3    2004     IV        A <S3:survfit>
#4    2009     IV        A <S3:survfit>
#5    2007     IV        A <S3:survfit>

我使用 purrr 包中的 zip_n() 来排列列表。该函数的作用是为每个列表组件创建一个包含元素的列表。例如,您在 te 中有 surv 的值。这些值存储在所有列表中。该函数收集所有值并将它们放入列表中。我认为 运行 以下代码和查看数据结构是了解正在发生的事情的最佳方式。我还更改了名为 lower 的列表的名称以供以后操作。

foo$te %>% zip_n(.simplify = TRUE) -> whatever

names(whatever$lower) <- paste("model", seq(1:length(whatever$lower)), ".", sep = "")

新列表,whatever看起来像这样(显示其中的一部分)

# str(whatever)
#List of 13
#$ n        : int [1:68] 7 4 3 12 6 7 2 8 5 5 ...
#$ time     : num [1:194] 0.189 0.298 0.331 0.715 0.731 ...
#$ n.risk   : num [1:194] 7 6 5 4 3 2 1 4 3 2 ...
#$ n.event  : num [1:194] 1 1 1 1 1 1 1 1 1 0 ...
#$ n.censor : num [1:194] 0 0 0 0 0 0 0 0 0 1 ...
#$ surv     : num [1:194] 0.857 0.714 0.571 0.429 0.286 ...
#$ type     : chr [1:68] "right" "right" "right" "right" ...
#$ std.err  : num [1:194] 0.154 0.239 0.327 0.436 0.598 ...
#$ upper    : num [1:194] 1 1 1 1 0.922 ...
#$ lower    :List of 68
#..$ model1. : num [1:7] 0.6334 0.4471 0.3008 0.1822 0.0886 ...
#..$ model2. : num [1:4] 0.426 0.188 0.188 0.188

我无法确定哪些值是中位数。至少,我认为 upperlower 是您想要的两个东西。所以让我和他们一起安排一个数据框。

列表,lower in whatever 包含数字和逻辑。当您使用 as_vector() 时,似乎您必须只有一种类型。鉴于此,我将所有内容都转换为数字,然后应用该函数以创建一个向量。

lapply(whatever$lower, as.numeric) %>%
as_vector(.type = "numeric") -> lower

这是 lower 的一部分。

 model1.1   model1.2   model1.3   model1.4   model1.5   model1.6   model1.7   model2.1   model2.2   model2.3 
0.63344653 0.44709095 0.30084365 0.18219162 0.08856084 0.02327247         NA 0.42593227 0.18765893 0.18765893 

最后,您创建了一个新的数据框。

newdf <- data.frame(upper = unlist(whatever$upper),
                    lower = lower)

#             upper      lower
#model1.1 1.0000000 0.63344653
#model1.2 1.0000000 0.44709095
#model1.3 1.0000000 0.30084365
#model1.4 1.0000000 0.18219162
#model1.5 0.9217692 0.08856084
#model1.6 0.8769230 0.02327247

感谢您输入爵士乐,但我找到了 Broom - 和 halleluja...感谢 hadley 和 dgrtwo 的所有努力。

require (broom)
m6<-ml %>% group_by(year,Grade,hospital) %>% 
do(glance(survfit(Surv(time=surv, event=status=="Dead")~-1,data=.))