创建一个指示变量,用于确定将来 x 是否发生在 y 上
Create an indicator variable for if x happens to y in the future
我正在尝试根据 A 行 1 年内的行是否出现 x 来在 A 行中创建一个虚拟变量。
我相信这可能是一个常见问题,并且已经发布了类似的问题(this is the most similar I found). Unfortunately the zoo package doesn't fit well since it doesn't deal well with irregular spaced dates (I don't want to aggregate rows and my data is too large to deal with this well) and I have been trying unsuccessfully 找出数据表方法来做到这一点,但根据我的经验我更喜欢 tidyverse。
dates <- rep(as.Date(c('2015-01-01', '2015-02-02', '2015-03-03', '2016-02-02'), '%Y-%m-%d'), 3)
names <- c(rep('John', 4), rep('Phil', 4), rep('Ty', 4))
df <- data.frame(Name = names, Date = dates,
did_y = c(0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0),
did_x = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1))
Name Date did_y did_x
John 2015-01-01 0 1
John 2015-02-02 1 0
John 2015-03-03 1 0
John 2016-02-02 0 0
Phil 2015-01-01 1 0
Phil 2015-02-02 1 1
Phil 2015-03-03 0 1
Phil 2016-02-02 0 0
Ty 2015-01-01 0 0
Ty 2015-02-02 0 0
Ty 2015-03-03 0 0
Ty 2016-02-02 0 1
我想要的是
dffinal <- data.frame(Name = names, Date = dates,
did_y = c(0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0),
did_x = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1),
did_x_within_year = c(1, 1, 1, NA, 1, 1, 1, 1, 0, 1, 1, 1),
did_x_next_year = c(0, 0, 0, NA, 1, 1, 0, NA, 0, 1, 1, NA))
Name Date did_y did_x did_x_within_year did_x_next_year
John 2015-01-01 0 1 1 0
John 2015-02-02 1 0 1 0
John 2015-03-03 1 0 1 0
John 2016-02-02 0 0 NA NA
Phil 2015-01-01 1 0 1 1
Phil 2015-02-02 1 1 1 1
Phil 2015-03-03 0 1 1 0
Phil 2016-02-02 0 0 1 NA
Ty 2015-01-01 0 0 0 0
Ty 2015-02-02 0 0 1 1
Ty 2015-03-03 0 0 1 1
Ty 2016-02-02 0 1 1 NA
所以我想要两列,如果 x 发生在 A 行的 1 年内(无论之前还是之后),另一列如果它发生在未来 1 年内。
我试验过 RcppRoll,但它似乎只在日期上向后看,也就是说,如果某件事发生在它的前一年,它将是虚拟的,但如果它会在未来一年发生,则不会。
df$did_x_next_year <- roll_max(df$did_x, 365, fill = NA)
编辑:基于其他问题的尝试解决方案
我已经尝试实现 this solution (1b),不幸的是我的 dataframe/datatable 中没有任何实际改变。即使我在将函数应用到我的数据时保持示例中的原样,它也不会更新。
library(zoo)
library(data.table)
df$Year <- lubridate::year(df$Date)
df$Month <- lubridate::month(df$Date)
df$did_x_next_year <- df$did_x
DT <- as.data.table(df)
k <- 12 # prior 12 months
# inputs zoo object x, subsets it to specified window and sums
Max2 <- function(x) {
w <- window(x, start = end(x) - k/12, end = end(x) - 1/12)
if (length(w) == 0 || all(is.na(w))) NA_real_ else max(w, na.rm = TRUE)
}
nms <- names(DT)[7]
setkey(DT, Name, Year, Month) # sort
# create zoo object from arguments and run rollapplyr using Sum2
roll2 <- function(x, year, month) {
z <- zoo(x, as.yearmon(year + (month - 1)/12))
coredata(rollapplyr(z, k+1, Max2, coredata = FALSE, partial = TRUE))
}
DT <- DT[, nms := lapply(.SD, roll2, Year, Month), .SDcols = nms, by = "Name"]
经过朋友的建议,我想出了以下办法:
# Filtering to the obs I care about
dfadd <- df %>% filter(did_x == 1) %>% select(Name, Date) %>% rename(x_date = Date)
# Converting to character since in dcast it screws up the dates
dfadd$x_date <- as.character(dfadd$x_date)
# Merging data
df <- plyr::join(df, dfadd, by = 'Name')
# Creating new column used for dcasting
df <- df %>% group_by(Name, Date) %>% mutate(x_date_index = seq(from = 1, to = n()))
df$x_date_index <- paste0('x_date_',df$x_date_index)
#casting the data wide
df <- reshape2::dcast(df,
Name + Date + did_y + did_x ~ x_date_index,
value.var = "x_date",
fill = NA)
# Converting to back to date
df$x_date_1 <- as.Date(df$x_date_1)
df$x_date_2 <- as.Date(df$x_date_2)
# Creating dummy variables
df$did_x_within_year <- 0
df$did_x_within_year <- ifelse((df$x_date_1 - df$Date) <= 366, 1,
df$did_x_within_year)
df$did_x_next_year <- 0
df$did_x_next_year <- ifelse(((df$x_date_1 > df$Date) & (df$x_date_1 - df$Date<= 365)),
1, df$did_x_next_year)
# Can extend to account for x_date_2, x_date_3, etc
# Changing the last entry to NA as desired
df <- df %>% group_by(Name) %>% mutate(did_x_next_year = c(did_x_next_year[-n()], NA))
我正在尝试根据 A 行 1 年内的行是否出现 x 来在 A 行中创建一个虚拟变量。 我相信这可能是一个常见问题,并且已经发布了类似的问题(this is the most similar I found). Unfortunately the zoo package doesn't fit well since it doesn't deal well with irregular spaced dates (I don't want to aggregate rows and my data is too large to deal with this well) and I have been trying unsuccessfully 找出数据表方法来做到这一点,但根据我的经验我更喜欢 tidyverse。
dates <- rep(as.Date(c('2015-01-01', '2015-02-02', '2015-03-03', '2016-02-02'), '%Y-%m-%d'), 3)
names <- c(rep('John', 4), rep('Phil', 4), rep('Ty', 4))
df <- data.frame(Name = names, Date = dates,
did_y = c(0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0),
did_x = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1))
Name Date did_y did_x
John 2015-01-01 0 1
John 2015-02-02 1 0
John 2015-03-03 1 0
John 2016-02-02 0 0
Phil 2015-01-01 1 0
Phil 2015-02-02 1 1
Phil 2015-03-03 0 1
Phil 2016-02-02 0 0
Ty 2015-01-01 0 0
Ty 2015-02-02 0 0
Ty 2015-03-03 0 0
Ty 2016-02-02 0 1
我想要的是
dffinal <- data.frame(Name = names, Date = dates,
did_y = c(0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0),
did_x = c(1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1),
did_x_within_year = c(1, 1, 1, NA, 1, 1, 1, 1, 0, 1, 1, 1),
did_x_next_year = c(0, 0, 0, NA, 1, 1, 0, NA, 0, 1, 1, NA))
Name Date did_y did_x did_x_within_year did_x_next_year
John 2015-01-01 0 1 1 0
John 2015-02-02 1 0 1 0
John 2015-03-03 1 0 1 0
John 2016-02-02 0 0 NA NA
Phil 2015-01-01 1 0 1 1
Phil 2015-02-02 1 1 1 1
Phil 2015-03-03 0 1 1 0
Phil 2016-02-02 0 0 1 NA
Ty 2015-01-01 0 0 0 0
Ty 2015-02-02 0 0 1 1
Ty 2015-03-03 0 0 1 1
Ty 2016-02-02 0 1 1 NA
所以我想要两列,如果 x 发生在 A 行的 1 年内(无论之前还是之后),另一列如果它发生在未来 1 年内。
我试验过 RcppRoll,但它似乎只在日期上向后看,也就是说,如果某件事发生在它的前一年,它将是虚拟的,但如果它会在未来一年发生,则不会。
df$did_x_next_year <- roll_max(df$did_x, 365, fill = NA)
编辑:基于其他问题的尝试解决方案
我已经尝试实现 this solution (1b),不幸的是我的 dataframe/datatable 中没有任何实际改变。即使我在将函数应用到我的数据时保持示例中的原样,它也不会更新。
library(zoo)
library(data.table)
df$Year <- lubridate::year(df$Date)
df$Month <- lubridate::month(df$Date)
df$did_x_next_year <- df$did_x
DT <- as.data.table(df)
k <- 12 # prior 12 months
# inputs zoo object x, subsets it to specified window and sums
Max2 <- function(x) {
w <- window(x, start = end(x) - k/12, end = end(x) - 1/12)
if (length(w) == 0 || all(is.na(w))) NA_real_ else max(w, na.rm = TRUE)
}
nms <- names(DT)[7]
setkey(DT, Name, Year, Month) # sort
# create zoo object from arguments and run rollapplyr using Sum2
roll2 <- function(x, year, month) {
z <- zoo(x, as.yearmon(year + (month - 1)/12))
coredata(rollapplyr(z, k+1, Max2, coredata = FALSE, partial = TRUE))
}
DT <- DT[, nms := lapply(.SD, roll2, Year, Month), .SDcols = nms, by = "Name"]
经过朋友的建议,我想出了以下办法:
# Filtering to the obs I care about
dfadd <- df %>% filter(did_x == 1) %>% select(Name, Date) %>% rename(x_date = Date)
# Converting to character since in dcast it screws up the dates
dfadd$x_date <- as.character(dfadd$x_date)
# Merging data
df <- plyr::join(df, dfadd, by = 'Name')
# Creating new column used for dcasting
df <- df %>% group_by(Name, Date) %>% mutate(x_date_index = seq(from = 1, to = n()))
df$x_date_index <- paste0('x_date_',df$x_date_index)
#casting the data wide
df <- reshape2::dcast(df,
Name + Date + did_y + did_x ~ x_date_index,
value.var = "x_date",
fill = NA)
# Converting to back to date
df$x_date_1 <- as.Date(df$x_date_1)
df$x_date_2 <- as.Date(df$x_date_2)
# Creating dummy variables
df$did_x_within_year <- 0
df$did_x_within_year <- ifelse((df$x_date_1 - df$Date) <= 366, 1,
df$did_x_within_year)
df$did_x_next_year <- 0
df$did_x_next_year <- ifelse(((df$x_date_1 > df$Date) & (df$x_date_1 - df$Date<= 365)),
1, df$did_x_next_year)
# Can extend to account for x_date_2, x_date_3, etc
# Changing the last entry to NA as desired
df <- df %>% group_by(Name) %>% mutate(did_x_next_year = c(did_x_next_year[-n()], NA))