R:Group-by 滞后变量生成不同的组内滞后值
R: Group-by lag variable generating different within-group lag values
我有数据按 id
变量分组,每个季度有多个独特的观察值,每个 id
具有不同的组大小:
library(dplyr)
library(data.table)
library(lubridate)
# Example data: three quarters, ids a/b with two rows per quarter and id c
# with one row per quarter. value2_t holds 15 random draws (unseeded).
v2 <- sample(1:100, 15)
df <- data.frame(
  qy = rep(c("2016-01-01", "2016-04-01", "2016-10-01"), each = 5),
  id = rep(c("a", "a", "b", "b", "c"), times = 3),
  value_t = c(0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1),
  value2_t = v2
)
# Parse the quarter labels into Date values, then sort by id and quarter.
df[["qy"]] <- ymd(df[["qy"]])
df <- arrange(df, id, qy)
> df
qy id value_t value2_t
1 2016-01-01 a 0 49
2 2016-01-01 a 0 4
3 2016-01-01 b 1 5
4 2016-01-01 b 1 48
5 2016-01-01 c 0 32
6 2016-04-01 a 1 81
7 2016-04-01 a 1 6
8 2016-04-01 b 0 71
9 2016-04-01 b 0 47
10 2016-04-01 c 0 78
11 2016-10-01 a 0 31
12 2016-10-01 a 0 10
13 2016-10-01 b 1 37
14 2016-10-01 b 1 63
15 2016-10-01 c 1 36
我尝试创建两个由 id
分组的滞后变量,滞后时间分别为 t-1 和 t-2:
# Attempt: within each id, with rows taken in qy order, add value_t1 and
# value_t2 as value_t shifted by 1 and 2 positions. shift() moves by rows,
# not by distinct qy values, so an id with several rows per quarter gets
# lags drawn from the same quarter (see the printed result below).
setDT(df)[order(qy), paste0('value_t', 1:2) := shift(value_t, 1:2) , by = id]
虽然我按 id
分组,但滞后并没有按季度对齐——滞后变量只是组内逐行的滚动滞后:
> df
qy id value_t value2_t value_t1 value_t2
1: 2016-01-01 a 0 49 NA NA
2: 2016-01-01 a 0 4 0 NA
3: 2016-04-01 a 1 81 0 0
4: 2016-04-01 a 1 6 1 0
5: 2016-10-01 a 0 31 1 1
6: 2016-10-01 a 0 10 0 1
7: 2016-01-01 b 1 5 NA NA
8: 2016-01-01 b 1 48 1 NA
9: 2016-04-01 b 0 71 1 1
10: 2016-04-01 b 0 47 0 1
11: 2016-10-01 b 1 37 0 0
12: 2016-10-01 b 1 63 1 0
13: 2016-01-01 c 0 32 NA NA
14: 2016-04-01 c 0 78 0 NA
15: 2016-10-01 c 1 36 0 0
我希望滞后变量尊重分组,尽管每个季度有多个观察结果如下:
> df
qy id value_t value2_t value_t1 value_t2
1 2016-01-01 a 0 49 NA NA
2 2016-01-01 a 0 4 NA NA
3 2016-04-01 a 1 81 0 NA
4 2016-04-01 a 1 6 0 NA
5 2016-10-01 a 0 31 1 0
6 2016-10-01 a 0 10 1 0
7 2016-01-01 b 1 5 NA NA
8 2016-01-01 b 1 48 NA NA
9 2016-04-01 b 0 71 1 NA
10 2016-04-01 b 0 47 1 NA
11 2016-10-01 b 1 37 0 1
12 2016-10-01 b 1 63 0 1
13 2016-01-01 c 0 32 NA NA
14 2016-04-01 c 0 78 0 NA
15 2016-10-01 c 1 36 0 0
特别是 data.table
或 dplyr
中的任何建议,我都将不胜感激!
更新:感谢大家的评论。我相信 David A. 是正确的,因为主要问题是不同的 id
组大小,我已经更新了问题以突出显示这一点。
我们可以基于唯一的 qy
和 id
创建数据框的子集,创建滞后列 value_t1
和 value_t2
,然后合并回原始数据框。
library(dplyr)
library(data.table)
library(lubridate)
# Create example data frame (seeded so value2_t is reproducible)
set.seed(123)
v2 <- sample(1:100, 15)
df <- data.frame(
  qy = c(rep("2016-01-01", 5), rep("2016-04-01", 5), rep("2016-10-01", 5)),
  id = rep(c("a", "a", "b", "b", "c"), 3),
  value_t = c(0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1),
  value2_t = v2
)
df$qy <- ymd(df$qy)
df <- df %>% arrange(id, qy)

# Process the data:
# 1. keep the first row of every (id, qy) pair,
# 2. lag value_t by one and two periods within each id,
# 3. join the lag columns back onto every original row of that (id, qy).
df2 <- df %>%
  distinct(id, qy, .keep_all = TRUE) %>%
  # lag() works on row order, so sort explicitly instead of relying on the
  # caller having sorted df beforehand.
  arrange(id, qy) %>%
  group_by(id) %>%
  mutate(
    value_t1 = lag(value_t, n = 1L),
    value_t2 = lag(value_t, n = 2L)
  ) %>%
  ungroup() %>%
  # Keep only the join keys and the new lag columns; any other column of df
  # left here would come back from the join duplicated with .x/.y suffixes.
  select(qy, id, value_t1, value_t2) %>%
  left_join(df, ., by = c("qy", "id"))
df2
# qy id value_t value2_t value_t1 value_t2
# 1 2016-01-01 a 0 29 NA NA
# 2 2016-01-01 a 0 79 NA NA
# 3 2016-04-01 a 1 5 0 NA
# 4 2016-04-01 a 1 50 0 NA
# 5 2016-10-01 a 0 87 1 0
# 6 2016-10-01 a 0 98 1 0
# 7 2016-01-01 b 1 41 NA NA
# 8 2016-01-01 b 1 86 NA NA
# 9 2016-04-01 b 0 83 1 NA
# 10 2016-04-01 b 0 51 1 NA
# 11 2016-10-01 b 1 60 0 1
# 12 2016-10-01 b 1 94 0 1
# 13 2016-01-01 c 0 91 NA NA
# 14 2016-04-01 c 0 42 0 NA
# 15 2016-10-01 c 1 9 0 0
您可以使用 rle
(游程编码,run-length encoding)编写自己的 time_lag
函数并将其应用于列:
library(dplyr)
#' Lag a vector by whole time periods rather than by single rows.
#'
#' The shift distance is the combined row count of the first `k` runs of
#' identical values in `time_var`.
#'
#' @param x Vector of values to lag.
#' @param time_var Time index aligned with `x`; must already be sorted. The
#'   shift is positional, so the result is only meaningful when later periods
#'   hold as many rows as the first `k` periods.
#' @param k Number of periods to lag by; a single non-negative number.
#' @return `x` shifted down by the computed number of rows, padded with
#'   leading `NA`s. All `NA` when `k` is at least the number of periods.
time_lag <- function(x, time_var, k = 1) {
  stopifnot(length(k) == 1L, !is.na(k), k >= 0)
  # rle() requires a plain atomic vector, so classed input such as Date is
  # converted to character first.
  run_lengths <- rle(as.character(time_var))$lengths
  # Cap k at the number of periods present: indexing past the end yields NA
  # lengths, which would make the sum NA and rep() fail.
  n_periods <- min(floor(k), length(run_lengths))
  shift_n <- sum(run_lengths[seq_len(n_periods)])
  c(rep(NA, shift_n), x[seq_len(length(x) - shift_n)])
}
# Within each id, lag value_t by one, two and three distinct qy periods.
df %>%
  group_by(id) %>%
  mutate(
    value_t1 = time_lag(value_t, qy, k = 1),
    value_t2 = time_lag(value_t, qy, k = 2),
    value_t3 = time_lag(value_t, qy, k = 3)
  )
结果:
# A tibble: 15 x 7
# Groups: id [3]
qy id value_t value2_t value_t1 value_t2 value_t3
<date> <fctr> <dbl> <int> <dbl> <dbl> <dbl>
1 2016-01-01 a 0 7 NA NA NA
2 2016-01-01 a 0 25 NA NA NA
3 2016-04-01 a 1 100 0 NA NA
4 2016-04-01 a 1 20 0 NA NA
5 2016-10-01 a 0 1 1 0 NA
6 2016-10-01 a 0 59 1 0 NA
7 2016-01-01 b 1 76 NA NA NA
8 2016-01-01 b 1 73 NA NA NA
9 2016-04-01 b 0 69 1 NA NA
10 2016-04-01 b 0 86 1 NA NA
11 2016-10-01 b 1 85 0 1 NA
12 2016-10-01 b 1 40 0 1 NA
13 2016-01-01 c 0 49 NA NA NA
14 2016-04-01 c 0 82 0 NA NA
15 2016-10-01 c 1 43 0 0 NA
备注:
time_lag
假定 time_var
已排序并且 k >= 0
time_lag
首先计算 time_var
的 rle,并将前 k
个唯一时间值的长度相加。我们称这个和为 shift_N
- 然后在向量
x
的开头附加 shift_N
个 NA
,并删除末尾的 shift_N
个元素
rle
需要一个原子向量作为输入,因此 as.character
- 当应用于
dplyr::group_by
时,自定义函数尊重分组,因此那里不需要额外的工作
我有数据按 id
变量分组,每个季度有多个独特的观察值,每个 id
具有不同的组大小:
library(dplyr)
library(data.table)
library(lubridate)
# Example data: three quarters, ids a/b with two rows per quarter and id c
# with one row per quarter. value2_t holds 15 random draws (unseeded).
v2 <- sample(1:100, 15)
df <- data.frame(
  qy = rep(c("2016-01-01", "2016-04-01", "2016-10-01"), each = 5),
  id = rep(c("a", "a", "b", "b", "c"), times = 3),
  value_t = c(0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1),
  value2_t = v2
)
# Parse the quarter labels into Date values, then sort by id and quarter.
df[["qy"]] <- ymd(df[["qy"]])
df <- arrange(df, id, qy)
> df
qy id value_t value2_t
1 2016-01-01 a 0 49
2 2016-01-01 a 0 4
3 2016-01-01 b 1 5
4 2016-01-01 b 1 48
5 2016-01-01 c 0 32
6 2016-04-01 a 1 81
7 2016-04-01 a 1 6
8 2016-04-01 b 0 71
9 2016-04-01 b 0 47
10 2016-04-01 c 0 78
11 2016-10-01 a 0 31
12 2016-10-01 a 0 10
13 2016-10-01 b 1 37
14 2016-10-01 b 1 63
15 2016-10-01 c 1 36
我尝试创建两个由 id
分组的滞后变量,滞后时间分别为 t-1 和 t-2:
# Attempt: within each id, with rows taken in qy order, add value_t1 and
# value_t2 as value_t shifted by 1 and 2 positions. shift() moves by rows,
# not by distinct qy values, so an id with several rows per quarter gets
# lags drawn from the same quarter (see the printed result below).
setDT(df)[order(qy), paste0('value_t', 1:2) := shift(value_t, 1:2) , by = id]
虽然我按 id
分组,但滞后并没有按季度对齐——滞后变量只是组内逐行的滚动滞后:
> df
qy id value_t value2_t value_t1 value_t2
1: 2016-01-01 a 0 49 NA NA
2: 2016-01-01 a 0 4 0 NA
3: 2016-04-01 a 1 81 0 0
4: 2016-04-01 a 1 6 1 0
5: 2016-10-01 a 0 31 1 1
6: 2016-10-01 a 0 10 0 1
7: 2016-01-01 b 1 5 NA NA
8: 2016-01-01 b 1 48 1 NA
9: 2016-04-01 b 0 71 1 1
10: 2016-04-01 b 0 47 0 1
11: 2016-10-01 b 1 37 0 0
12: 2016-10-01 b 1 63 1 0
13: 2016-01-01 c 0 32 NA NA
14: 2016-04-01 c 0 78 0 NA
15: 2016-10-01 c 1 36 0 0
我希望滞后变量尊重分组,尽管每个季度有多个观察结果如下:
> df
qy id value_t value2_t value_t1 value_t2
1 2016-01-01 a 0 49 NA NA
2 2016-01-01 a 0 4 NA NA
3 2016-04-01 a 1 81 0 NA
4 2016-04-01 a 1 6 0 NA
5 2016-10-01 a 0 31 1 0
6 2016-10-01 a 0 10 1 0
7 2016-01-01 b 1 5 NA NA
8 2016-01-01 b 1 48 NA NA
9 2016-04-01 b 0 71 1 NA
10 2016-04-01 b 0 47 1 NA
11 2016-10-01 b 1 37 0 1
12 2016-10-01 b 1 63 0 1
13 2016-01-01 c 0 32 NA NA
14 2016-04-01 c 0 78 0 NA
15 2016-10-01 c 1 36 0 0
特别是 data.table
或 dplyr
中的任何建议,我都将不胜感激!
更新:感谢大家的评论。我相信 David A. 是正确的,因为主要问题是不同的 id
组大小,我已经更新了问题以突出显示这一点。
我们可以基于唯一的 qy
和 id
创建数据框的子集,创建滞后列 value_t1
和 value_t2
,然后合并回原始数据框。
library(dplyr)
library(data.table)
library(lubridate)
# Create example data frame (seeded so value2_t is reproducible)
set.seed(123)
v2 <- sample(1:100, 15)
df <- data.frame(
  qy = c(rep("2016-01-01", 5), rep("2016-04-01", 5), rep("2016-10-01", 5)),
  id = rep(c("a", "a", "b", "b", "c"), 3),
  value_t = c(0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1),
  value2_t = v2
)
df$qy <- ymd(df$qy)
df <- df %>% arrange(id, qy)

# Process the data:
# 1. keep the first row of every (id, qy) pair,
# 2. lag value_t by one and two periods within each id,
# 3. join the lag columns back onto every original row of that (id, qy).
df2 <- df %>%
  distinct(id, qy, .keep_all = TRUE) %>%
  # lag() works on row order, so sort explicitly instead of relying on the
  # caller having sorted df beforehand.
  arrange(id, qy) %>%
  group_by(id) %>%
  mutate(
    value_t1 = lag(value_t, n = 1L),
    value_t2 = lag(value_t, n = 2L)
  ) %>%
  ungroup() %>%
  # Keep only the join keys and the new lag columns; any other column of df
  # left here would come back from the join duplicated with .x/.y suffixes.
  select(qy, id, value_t1, value_t2) %>%
  left_join(df, ., by = c("qy", "id"))
df2
# qy id value_t value2_t value_t1 value_t2
# 1 2016-01-01 a 0 29 NA NA
# 2 2016-01-01 a 0 79 NA NA
# 3 2016-04-01 a 1 5 0 NA
# 4 2016-04-01 a 1 50 0 NA
# 5 2016-10-01 a 0 87 1 0
# 6 2016-10-01 a 0 98 1 0
# 7 2016-01-01 b 1 41 NA NA
# 8 2016-01-01 b 1 86 NA NA
# 9 2016-04-01 b 0 83 1 NA
# 10 2016-04-01 b 0 51 1 NA
# 11 2016-10-01 b 1 60 0 1
# 12 2016-10-01 b 1 94 0 1
# 13 2016-01-01 c 0 91 NA NA
# 14 2016-04-01 c 0 42 0 NA
# 15 2016-10-01 c 1 9 0 0
您可以使用 rle
(游程编码,run-length encoding)编写自己的 time_lag
函数并将其应用于列:
library(dplyr)
#' Lag a vector by whole time periods rather than by single rows.
#'
#' The shift distance is the combined row count of the first `k` runs of
#' identical values in `time_var`.
#'
#' @param x Vector of values to lag.
#' @param time_var Time index aligned with `x`; must already be sorted. The
#'   shift is positional, so the result is only meaningful when later periods
#'   hold as many rows as the first `k` periods.
#' @param k Number of periods to lag by; a single non-negative number.
#' @return `x` shifted down by the computed number of rows, padded with
#'   leading `NA`s. All `NA` when `k` is at least the number of periods.
time_lag <- function(x, time_var, k = 1) {
  stopifnot(length(k) == 1L, !is.na(k), k >= 0)
  # rle() requires a plain atomic vector, so classed input such as Date is
  # converted to character first.
  run_lengths <- rle(as.character(time_var))$lengths
  # Cap k at the number of periods present: indexing past the end yields NA
  # lengths, which would make the sum NA and rep() fail.
  n_periods <- min(floor(k), length(run_lengths))
  shift_n <- sum(run_lengths[seq_len(n_periods)])
  c(rep(NA, shift_n), x[seq_len(length(x) - shift_n)])
}
# Within each id, lag value_t by one, two and three distinct qy periods.
df %>%
  group_by(id) %>%
  mutate(
    value_t1 = time_lag(value_t, qy, k = 1),
    value_t2 = time_lag(value_t, qy, k = 2),
    value_t3 = time_lag(value_t, qy, k = 3)
  )
结果:
# A tibble: 15 x 7
# Groups: id [3]
qy id value_t value2_t value_t1 value_t2 value_t3
<date> <fctr> <dbl> <int> <dbl> <dbl> <dbl>
1 2016-01-01 a 0 7 NA NA NA
2 2016-01-01 a 0 25 NA NA NA
3 2016-04-01 a 1 100 0 NA NA
4 2016-04-01 a 1 20 0 NA NA
5 2016-10-01 a 0 1 1 0 NA
6 2016-10-01 a 0 59 1 0 NA
7 2016-01-01 b 1 76 NA NA NA
8 2016-01-01 b 1 73 NA NA NA
9 2016-04-01 b 0 69 1 NA NA
10 2016-04-01 b 0 86 1 NA NA
11 2016-10-01 b 1 85 0 1 NA
12 2016-10-01 b 1 40 0 1 NA
13 2016-01-01 c 0 49 NA NA NA
14 2016-04-01 c 0 82 0 NA NA
15 2016-10-01 c 1 43 0 0 NA
备注:
time_lag
假定time_var
已排序并且k >= 0
time_lag
首先计算time_var
的 rle,并将前k
个唯一时间值的长度相加。我们称这个和为shift_N
- 然后在向量
x
的开头附加 shift_N
个 NA
,并删除末尾的 shift_N
个元素
- rle
需要一个原子向量作为输入,因此使用 as.character
- 当应用于
dplyr::group_by
时,自定义函数尊重分组,因此那里不需要额外的工作