在 R 中构建(当前状态,下一状态)数据框的有效方法

Efficient way to build data frame of (current state, next state) in R

我正在处理一个记录患者健康状况随时间变化的数据集。我想构建一个数据框,记录从当前健康状态到下一个健康状态的转换。

在下面的示例中,健康状态仅通过 AFP 水平和体重来衡量。健康状态测量值可能如下所示:

# Example measurements: one row per patient (`id`) and `day`.
# The "death" row has no measurements, so `afp` and `weight` are NA there.
x <- data.frame(
  id = rep(c(1, 2), each = 3),
  day = rep(c(1, 2, 3), times = 2),
  event = c("status", "status", "death", "status", "status", "status"),
  afp = c(10, 50, NA, 20, 30, 40),
  weight = c(100, 105, NA, 200, 200, 200)
)

所需的输出如下所示:

# Desired result: one row per transition, holding the current measurements
# next to the event and measurements of the following row of the same `id`.
y <- data.frame(
  id = rep(c(1, 2), each = 2),
  current_afp = c(10, 50, 20, 30),
  current_weight = c(100, 105, 200, 200),
  next_event = c("status", "death", "status", "status"),
  next_afp = c(50, NA, 30, 40),
  next_weight = c(105, NA, 200, 200)
)

获取输出的一种低效方法是:

有没有更高效的获取输出的方法?

注:真实的测量数据框可能有 10 列以上,因此从代码行数的角度来看,像下面这样逐列显式地写出来效率并不高:

current_afp = x$afp[1:(n-1)],
next_afp = x$afp[2:n]
...

等等。

你可以试试:

library(dplyr)

# Build one row per (current state, next state) transition for each patient.
# Expects `x` to have the key columns `id` and `day` and an `event` column;
# every other column is treated as a measurement and gets both a `current_`
# and a `next_` version.
x %>%
  # lead() pairs each row with the one after it, so rows must be in day order.
  arrange(id, day) %>%
  group_by(id) %>%
  # Grouping by `id` stops lead() from pulling the next patient's first row
  # into this patient's last row; across() skips the grouping column itself.
  mutate(across(-day, lead, .names = "next_{.col}")) %>%
  # The last row of each patient has no successor, so it is not a transition.
  filter(day != last(day)) %>%
  ungroup() %>%
  # The event of the current row is not part of the desired output.
  select(-event) %>%
  rename_with(
    function(nm) paste0("current_", nm),
    .cols = -c(id, day, starts_with("next_"))
  )

给出:

#  id day current_afp current_weight next_event next_afp next_weight
#1  1   1          10            100     status       50         105
#2  1   2          50            105      death       NA          NA
#3  2   1          20            200     status       30         200
#4  2   2          30            200     status       40         200

也可以使用基础 R 的“拆分-应用-合并”(split-apply-combine)方法:

# Pair every row with the following row of the same patient.
#
# `x[-2]` drops the second column (`day`), so inside the function `y` holds one
# patient's rows with `id` as the first column. Each list element is a data
# frame with `id`, the `current_*` measurements, and the `next_*` event and
# measurements.
res <- lapply(split(x[-2], x$id), function(y) {
  # head()/tail() with a negative n return zero rows for a one-row group,
  # whereas 1:(nrow(y) - 1) would count down from 1 to 0 and emit spurious rows.
  current <- head(y, -1)
  following <- tail(y, -1)[-1]  # `id` is already supplied by `current`
  xx <- cbind(current, following)
  colnames(xx) <- c(
    "id",
    paste("current", colnames(y)[-1], sep = "_"),
    paste("next", colnames(y)[-1], sep = "_")
  )
  # Only the event of the *next* row is wanted in the result.
  xx[, colnames(xx) != "current_event", drop = FALSE]
})
# make.row.names = FALSE yields plain 1..n row names instead of "<id>.<row>".
do.call(rbind, c(res, make.row.names = FALSE))

  id current_afp current_weight next_event next_afp next_weight
1  1          10            100     status       50         105
2  1          50            105      death       NA          NA
3  2          20            200     status       30         200
4  2          30            200     status       40         200

或者,下面是日期并不连续的示例(患者 2 没有第 3 天的记录):

# Same example data, except that the second patient's last record is on day 4,
# so the days are no longer consecutive for that patient.
x <- data.frame(
  id = rep(c(1, 2), each = 3),
  day = c(1, 2, 3, 1, 2, 4),
  event = c("status", "status", "death", "status", "status", "status"),
  afp = c(10, 50, NA, 20, 30, 40),
  weight = c(100, 105, NA, 200, 200, 200)
)
x
  id day  event afp weight
1  1   1 status  10    100
2  1   2 status  50    105
3  1   3  death  NA     NA
4  2   1 status  20    200
5  2   2 status  30    200
6  2   4 status  40    200

此时部分转换会包含 NA,如有需要可以将其删除。

# Same transition table, but each patient's rows are first padded so that every
# day between the first and the last observation is present. Days without a
# record become all-NA rows and therefore show up as NA transitions.
res <- lapply(split(x, x$id), function(y) {
  # Span the observed day range rather than 1:max(day), so a patient whose
  # first record falls after day 1 does not get leading all-NA rows.
  all_days <- data.frame(
    id = unique(y$id),
    day = seq(min(y$day), max(y$day))
  )
  y <- merge(all_days, y, by = c("id", "day"), all.x = TRUE)
  # `day` was only needed for padding; drop it by name, not by position.
  y <- y[, colnames(y) != "day", drop = FALSE]
  # head()/tail() with a negative n return zero rows for a one-row group,
  # whereas 1:(nrow(y) - 1) would count down from 1 to 0 and emit spurious rows.
  current <- head(y, -1)
  following <- tail(y, -1)[-1]  # `id` is already supplied by `current`
  xx <- cbind(current, following)
  colnames(xx) <- c(
    "id",
    paste("current", colnames(y)[-1], sep = "_"),
    paste("next", colnames(y)[-1], sep = "_")
  )
  # Only the event of the *next* row is wanted in the result.
  xx[, colnames(xx) != "current_event", drop = FALSE]
})
do.call(rbind, res)
    id current_afp current_weight next_event next_afp next_weight
1.1  1          10            100     status       50         105
1.2  1          50            105      death       NA          NA
2.1  2          20            200     status       30         200
2.2  2          30            200       <NA>       NA          NA
2.3  2          NA             NA     status       40         200