在 R 中使用具有年份差距的滞后函数计算变化
Calculate Change Using Lag Function with Year Gaps in R
我看到的所有滞后(lag)示例都使用连续的时间序列。我想计算每年的百分比变化,但如果两个年份之间存在间隔,计算变化就没有意义。
即我不想计算 2001 年到 2004 年之间的百分比变化,只关心相邻两年之间的变化。数据输入示例:
# Example input: one row per (ID, Year) observation; the years within an ID
# are not always consecutive. The value is assigned to `df` because the other
# snippets in this file refer to the data by that name.
df <- structure(
  list(
    ID = structure(
      c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
      .Label = c("A", "B"), class = "factor"
    ),
    Year = c(
      2000L, 2001L, 2004L, 2005L, 2006L, 2007L,
      1990L, 2000L, 2001L, 2005L, 2006L, 2007L, 2009L
    ),
    Value = c(4L, 10L, 7L, 4L, 7L, 5L, 2L, 7L, 10L, 6L, 9L, 2L, 9L)
  ),
  .Names = c("ID", "Year", "Value"),
  class = "data.frame",
  row.names = c(NA, -13L)
)
# Attempt from the question: fractional change in Value versus the previous
# row of the same ID. Year is not consulted, so gaps between years are ignored.
df <- df %>%
  group_by(ID) %>%
  mutate(delta = (Value - lag(Value)) / lag(Value))
上面的代码没有返回我想要的输出:我希望在年份出现跳跃的地方不计算变化。
期望的输出:
# Desired result: the input columns plus `Change`, the fractional change in
# Value relative to the previous row of the same ID, with NA where no change
# should be reported.
# NOTE(review): the 8th Change entry (ID B, Year 2000) is 2.5, i.e. (7 - 2) / 2
# taken across the 1990 -> 2000 gap; the answers' printed output shows NA for
# that row, so this entry looks like a slip in the expected data -- confirm.
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
Year = c(2000L, 2001L, 2004L, 2005L, 2006L, 2007L, 1990L,
2000L, 2001L, 2005L, 2006L, 2007L, 2009L), Value = c(4L,
10L, 7L, 4L, 7L, 5L, 2L, 7L, 10L, 6L, 9L, 2L, 9L), Change = c(NA,
1.5, NA, -0.428571429, 0.75, -0.285714286, NA, 2.5, 0.428571429,
NA, 0.5, -0.777777778, NA)), .Names = c("ID", "Year", "Value",
"Change"), class = "data.frame", row.names = c(NA, -13L))
这是一个 data.table 解决方案:
# Attach data.table and convert df to a data.table.
library(data.table)
setDT(df)
# Per ID: fractional change versus the previous row, kept only when that row
# is exactly one year earlier; every other row (including the first row of
# each ID) gets NA.
df[, Change := {
  prev_value <- shift(Value)
  ifelse(Year - shift(Year) == 1, (Value - prev_value) / prev_value, NA)
}, by = ID]
其返回结果为:
df
ID Year Value Change
1: A 2000 4 NA
2: A 2001 10 1.5000000
3: A 2004 7 NA
4: A 2005 4 -0.4285714
5: A 2006 7 0.7500000
6: A 2007 5 -0.2857143
7: B 1990 2 NA
8: B 2000 7 NA
9: B 2001 10 0.4285714
10: B 2005 6 NA
11: B 2006 9 0.5000000
12: B 2007 2 -0.7777778
13: B 2009 9 NA
这里使用了 ifelse,对于非常庞大的数据集来说可能会比较慢,但如果数据集只有数千个观测值,则几乎察觉不到。
使用 dplyr:
# Per ID: NA when the previous row is more than one year earlier, otherwise
# the fractional change in Value versus that previous row.
df %>%
  group_by(ID) %>%
  mutate(
    delta = ifelse(
      Year - lag(Year) > 1,
      NA,
      (Value - lag(Value)) / lag(Value)
    )
  )
下面是使用 diff 函数的一种可能的解决方案。
library(dplyr)
# Fractional change in Value versus the previous row of the same ID.
df <- df %>%
  group_by(ID) %>%
  mutate(delta = (Value - lag(Value)) / lag(Value))
# Year step to the previous row, computed separately for each ID so the result
# does not depend on the rows of an ID being contiguous; the first row of each
# ID gets 0.
yeardiff <- ave(df$Year, df$ID, FUN = function(y) c(0L, diff(y)))
# Keep delta only where the previous row is exactly one year earlier.
df$delta[yeardiff != 1] <- NA
我们也可以只使用 base R 函数来得到输出:
# Previous row's Value and Year within each ID (NA for the first row of an ID).
lv <- ave(df$Value, df$ID, FUN = function(v) c(NA, head(v, -1L)))
ly <- ave(df$Year, df$ID, FUN = function(y) c(NA, head(y, -1L)))
# Fractional change from the previous row, set to NA when the two rows are
# more than one year apart.
df$Change <- ifelse(df$Year - ly > 1, NA, (df$Value - lv) / lv)
df$Change
#[1] NA 1.5000000 NA -0.4285714 0.7500000
#[6] -0.2857143 NA NA 0.4285714 NA
#[11] 0.5000000 -0.7777778 NA
我看到的所有滞后(lag)示例都使用连续的时间序列。我想计算每年的百分比变化,但如果两个年份之间存在间隔,计算变化就没有意义。即我不想计算 2001 年到 2004 年之间的百分比变化,只关心相邻两年之间的变化。数据输入示例:
# Example input: one row per (ID, Year) observation; the years within an ID
# are not always consecutive. The value is assigned to `df` because the other
# snippets in this file refer to the data by that name.
df <- structure(
  list(
    ID = structure(
      c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L),
      .Label = c("A", "B"), class = "factor"
    ),
    Year = c(
      2000L, 2001L, 2004L, 2005L, 2006L, 2007L,
      1990L, 2000L, 2001L, 2005L, 2006L, 2007L, 2009L
    ),
    Value = c(4L, 10L, 7L, 4L, 7L, 5L, 2L, 7L, 10L, 6L, 9L, 2L, 9L)
  ),
  .Names = c("ID", "Year", "Value"),
  class = "data.frame",
  row.names = c(NA, -13L)
)
# Attempt from the question: fractional change in Value versus the previous
# row of the same ID. Year is not consulted, so gaps between years are ignored.
df <- df %>%
  group_by(ID) %>%
  mutate(delta = (Value - lag(Value)) / lag(Value))
上面的代码没有返回我想要的输出:我希望在年份出现跳跃的地方不计算变化。期望的输出:
# Desired result: the input columns plus `Change`, the fractional change in
# Value relative to the previous row of the same ID, with NA where no change
# should be reported.
# NOTE(review): the 8th Change entry (ID B, Year 2000) is 2.5, i.e. (7 - 2) / 2
# taken across the 1990 -> 2000 gap; the answers' printed output shows NA for
# that row, so this entry looks like a slip in the expected data -- confirm.
structure(list(ID = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L), .Label = c("A", "B"), class = "factor"),
Year = c(2000L, 2001L, 2004L, 2005L, 2006L, 2007L, 1990L,
2000L, 2001L, 2005L, 2006L, 2007L, 2009L), Value = c(4L,
10L, 7L, 4L, 7L, 5L, 2L, 7L, 10L, 6L, 9L, 2L, 9L), Change = c(NA,
1.5, NA, -0.428571429, 0.75, -0.285714286, NA, 2.5, 0.428571429,
NA, 0.5, -0.777777778, NA)), .Names = c("ID", "Year", "Value",
"Change"), class = "data.frame", row.names = c(NA, -13L))
这是一个 data.table 解决方案:
# Attach data.table and convert df to a data.table.
library(data.table)
setDT(df)
# Per ID: fractional change versus the previous row, kept only when that row
# is exactly one year earlier; every other row (including the first row of
# each ID) gets NA.
df[, Change := {
  prev_value <- shift(Value)
  ifelse(Year - shift(Year) == 1, (Value - prev_value) / prev_value, NA)
}, by = ID]
其返回结果为:
df
ID Year Value Change
1: A 2000 4 NA
2: A 2001 10 1.5000000
3: A 2004 7 NA
4: A 2005 4 -0.4285714
5: A 2006 7 0.7500000
6: A 2007 5 -0.2857143
7: B 1990 2 NA
8: B 2000 7 NA
9: B 2001 10 0.4285714
10: B 2005 6 NA
11: B 2006 9 0.5000000
12: B 2007 2 -0.7777778
13: B 2009 9 NA
这里使用了 ifelse,对于非常庞大的数据集来说可能会比较慢,但如果数据集只有数千个观测值,则几乎察觉不到。
使用 dplyr:
# Per ID: NA when the previous row is more than one year earlier, otherwise
# the fractional change in Value versus that previous row.
df %>%
  group_by(ID) %>%
  mutate(
    delta = ifelse(
      Year - lag(Year) > 1,
      NA,
      (Value - lag(Value)) / lag(Value)
    )
  )
下面是使用 diff 函数的一种可能的解决方案。
library(dplyr)
# Fractional change in Value versus the previous row of the same ID.
df <- df %>%
  group_by(ID) %>%
  mutate(delta = (Value - lag(Value)) / lag(Value))
# Year step to the previous row, computed separately for each ID so the result
# does not depend on the rows of an ID being contiguous; the first row of each
# ID gets 0.
yeardiff <- ave(df$Year, df$ID, FUN = function(y) c(0L, diff(y)))
# Keep delta only where the previous row is exactly one year earlier.
df$delta[yeardiff != 1] <- NA
我们也可以只使用 base R 函数来得到输出:
# Previous row's Value and Year within each ID (NA for the first row of an ID).
lv <- ave(df$Value, df$ID, FUN = function(v) c(NA, head(v, -1L)))
ly <- ave(df$Year, df$ID, FUN = function(y) c(NA, head(y, -1L)))
# Fractional change from the previous row, set to NA when the two rows are
# more than one year apart.
df$Change <- ifelse(df$Year - ly > 1, NA, (df$Value - lv) / lv)
df$Change
#[1] NA 1.5000000 NA -0.4285714 0.7500000
#[6] -0.2857143 NA NA 0.4285714 NA
#[11] 0.5000000 -0.7777778 NA