R:事件的高效提取(连续增加一个变量)

R: Efficient extraction of events (continuous increase in a variable)

任务是有效地从此数据中提取事件:

data <- structure(
            list(i = c(1, 1, 1, 2, 2, 2), t = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
            .Names = c("i", "t", "x"), row.names = c(NA, -6L), class = "data.frame"
        )

> data
  i t x
1 1 1 1
2 1 2 1
3 1 3 2
4 2 1 1
5 2 3 2
6 2 4 3

我们称i个事实,t是时间,xit处的选择数。

一个事件是对一个事实的不间断选择序列。在整个 t=1 到 t=3 期间,事实 1 被选中,总计有 4 次选择。但事实 2 分为两个事件,第一个是从 t=1 到 t=1(sum=1),第二个是从 t=3 到 t=4(sum=5)。因此,事件数据框应该如下所示:

> event
  i from to sum
1 1    1  3   4
2 2    1  1   1
3 2    3  4   5

此代码执行所需的操作:

event <- structure(
             list(i = logical(0), from = logical(0), to = logical(0), sum = logical(0)),
             .Names = c("i", "from", "to", "sum"), row.names = integer(0),
             class = "data.frame"
         )
l <- nrow(data) # get rows of data frame
c <- 1 # set counter
d <- 1 # set initial row of data to start with
e <- 1 # set initial row of event to fill
repeat{
    event[e,1] <- data[d,1] # store "i" in event data frame
    event[e,2] <- data[d,2] # store "from" in event data frame
    while((data[d+1,1] == data[d,1]) & (data[d+1,2] == data[d,2]+1)){
        c <- c+1
        d <- d+1
        if(d >= l) break
    }
    event[e,3] <- data[d,2] # store "to" in event data frame
    event[e,4] <- sum(data[(d-c+1):d,3]) # store "sum" in event data frame
    c <- 1
    d <- d+1
    e <- e+1
}

问题是此代码需要 3 天时间从具有 100 万行的数据框中提取事件,而我的数据框有 500 万行。

如何提高效率?

P.S.: 我的代码中还有一个与终止相关的小错误。

P.P.S.:数据先按i排序,再按t排序。

假设数据框是按照data$t排序的,你可以试试这样

event <- NULL
for (i in unique(data$i)) {
    x <- data[data$i == i, ]
    ev <- cumsum(c(1, diff(x$t)) > 1)
    smry <- lapply(split(x, ev), function(z) c(i, range(z$t), sum(z$x)))
    event <- c(event, smry)
}
event <- do.call(rbind, event)
rownames(event) <- NULL
colnames(event) <- c('i', 'from', 'to', 'sum')

结果是矩阵,不是数据框。

如果这个 dplyr 实现更快,你能试试吗?

library(dplyr)

data <- structure(
    list(fact = c(1, 1, 1, 2, 2, 2), timing = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
    .Names = c("fact", "timing", "x"), row.names = c(NA, -6L), class = "data.frame"
)

group_by(data, fact) %>%
    mutate(fromto=cumsum(c(0, diff(timing) > 1))) %>%
    group_by(fact, fromto) %>%
    summarize(from=min(timing), to=max(timing), sumx=sum(x)) %>%
    select(-fromto) %>%
    ungroup()

这个data.table实施怎么样?

library(data.table)
data <- structure(
    list(fact = c(1, 1, 1, 2, 2, 2), timing = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
    .Names = c("fact", "timing", "x"), row.names = c(NA, -6L), class = "data.frame"
)
setDT(data)[, fromto:=cumsum(c(0, diff(timing) > 1)), by=fact]
event <- data[, .(from=min(timing), to=max(timing), sumx=sum(x)), by=c("fact", "fromto")][,fromto:=NULL]

##results when i enter event in the R console and my data.table package version is data.table_1.9.6
> event
   fact from to sumx
1:    1    1  3    4
2:    2    1  1    1
3:    2    3  4    5
> str(event)
Classes ‘data.table’ and 'data.frame':  3 obs. of  4 variables:
 $ fact: num  1 2 2
 $ from: num  1 1 3
 $ to  : num  3 1 4
 $ sumx: num  4 1 5
 - attr(*, ".internal.selfref")=<externalptr> 
> dput(event)
structure(list(fact = c(1, 2, 2), from = c(1, 1, 3), to = c(3, 
1, 4), sumx = c(4, 1, 5)), row.names = c(NA, -3L), class = c("data.table", 
"data.frame"), .Names = c("fact", "from", "to", "sumx"), .internal.selfref = <pointer: 0x0000000000120788>)

参考 detect intervals of the consequent integer sequences