根据出现时间插补分类变量的 NA
Impute NA of a Categorical Variable based on the Timing of Appearance
我有一个如下所示的数据集,其中每条观测对应唯一的公司-年份组合。但由于之前的数据合并,变量 IndustryCode 存在一些 NA。
stkcd date industrycode
10 2002 .
10 2003 .
10 2004 E22
10 2005 E22
10 2006 E22
10 2007 E22
10 2008 G45
10 2009 G45
10 2010 .
10 2011 .
11 2001 .
11 2002 .
11 2003 D23
11 2004 D23
....
我想用同一公司最近年份的值来填补 NA。例如,对于公司 10(stkcd=10),2004 年之前的 IndustryCode 应改为 E22(即 2004 年的值),2009 年之后的 NA 应改为 G45(即 2009 年的值)。
我如何在 R 中实现这一点?
可以连续两次使用 zoo 包中的 na.locf 函数。
该函数会用前面最近的非缺失值替换 NA;由于数据开头也有 NA,还需要从后往前再填充一次。
一些数据:
# Example data: a single company (stkcd 10) observed yearly, with the
# industry code missing at both the start and the end of the series.
dat <- data.frame(
  stkcd = rep(10, 10),
  year = 2002:2011,
  type = c(NA, NA, "E22", "E22", "E22", "E22", "G45", "G45", NA, NA)
)
library(zoo)
# Forward pass: replace each NA with the last observed value before it.
# na.rm = FALSE keeps the leading NAs so the vector length is unchanged.
dat$type <- na.locf(dat$type, na.rm = FALSE)
# Backward pass: the series starts with NAs, which the forward pass cannot
# fill, so carry the next observed value backwards as well.
dat$type <- na.locf(dat$type, na.rm = FALSE, fromLast = TRUE)
# output:
# stkcd year type
# 1 10 2002 E22
# 2 10 2003 E22
# 3 10 2004 E22
# 4 10 2005 E22
# 5 10 2006 E22
# 6 10 2007 E22
# 7 10 2008 G45
# 8 10 2009 G45
# 9 10 2010 G45
# 10 10 2011 G45
如果有多家公司,需要先用 dplyr 的 group_by 按公司分组:
library(dplyr)
library(zoo)
# Fill within each company so that a value is never carried over from one
# company's series into another's.
dat %>%
  group_by(stkcd) %>% # the variable used for the company name
  mutate(
    # Forward fill, keeping leading NAs in place.
    type = na.locf(type, na.rm = FALSE),
    # Backward fill for the NAs left at the start of each company's series.
    type = na.locf(type, na.rm = FALSE, fromLast = TRUE)
  )
例如有 2 家公司:
# Example data: two companies (stkcd 10 and 20), each observed 2002-2011
# with the industry code missing at both ends of its series.
dat <- data.frame(
  stkcd = c(rep(10, 10), rep(20, 10)),
  year = rep(2002:2011, 2),
  type = c(
    NA, NA, "E22", "E22", "E22", "E22", "G45", "G45", NA, NA,
    NA, NA, "E22", "E22", "E22", "E22", "G45", "G45", NA, NA
  )
)
# Fill per company: forward first, then backward for the leading NAs.
dat %>%
  group_by(stkcd) %>% # the variable used for the company name
  mutate(
    type = na.locf(type, na.rm = FALSE),
    type = na.locf(type, na.rm = FALSE, fromLast = TRUE)
  )
# A tibble: 20 x 3
# Groups: stkcd [2]
# stkcd year type
# <dbl> <int> <fct>
# 1 10 2002 E22
# 2 10 2003 E22
# 3 10 2004 E22
# 4 10 2005 E22
# 5 10 2006 E22
# 6 10 2007 E22
# 7 10 2008 G45
# 8 10 2009 G45
# 9 10 2010 G45
# 10 10 2011 G45
# 11 20 2002 E22
# 12 20 2003 E22
# 13 20 2004 E22
# 14 20 2005 E22
# 15 20 2006 E22
# 16 20 2007 E22
# 17 20 2008 G45
# 18 20 2009 G45
# 19 20 2010 G45
# 20 20 2011 G45
当然,需要按公司 group_by!
对于给定的公司,这有效:
# For a single company's rows in df1: replace each missing industry code
# with the code from the row whose date is closest.
# Rows whose industry code is missing.
inds <- is.na(df1$industrycode)
# For every missing row, the position (within the non-missing rows) of the
# closest date. which.min() returns the first minimum, so on a tie the row
# that comes first in df1 wins. vapply() always yields an integer vector,
# so this also works when there is nothing to fill.
resInds <- vapply(
  df1$date[inds],
  function(x) which.min(abs(x - df1$date[!inds])),
  integer(1)
)
df1$industrycode[inds] <- df1$industrycode[!inds][resInds]
# stkcd date industrycode
#1 10 2002 E22
#2 10 2003 E22
#3 10 2004 E22
#4 10 2005 E22
#5 10 2006 E22
#6 10 2007 E22
#7 10 2008 G45
#8 10 2009 G45
#9 10 2010 G45
#10 10 2011 G45
使用data.table
# fread() and setDF() come from data.table, which must be attached first.
library(data.table)
# Sample data for two companies. fread() reads the literal "NA" tokens as
# missing values; setDF() then converts the result to a plain data.frame.
df1 <-
  fread("stkcd date industrycode
10 2002 NA
10 2003 NA
10 2004 E22
10 2005 E22
10 2006 E22
10 2007 E22
10 2008 G45
10 2009 G45
10 2010 NA
10 2011 NA
11 2002 NA
11 2003 NA
11 2004 sE22
11 2005 sE22
11 2006 NA
11 2007 sE22
11 2008 sG45
11 2009 sG45
11 2010 NA
11 2011 NA") %>% setDF
# Fill missing values of one column with the value from the nearest date.
#
# data:     a data.frame (or data.table) with a numeric `date` column.
# variable: name (string) of the column whose NAs should be filled.
#
# Returns the filled column as a vector of the same length as the input.
# Each NA takes the value of the non-missing row whose `date` is closest;
# on a tie the non-missing row that comes first in `data` wins, because
# which.min() returns the first minimum. If the column has no NAs, or has
# nothing but NAs, it is returned unchanged.
na_fill <- function(data, variable) {
  values <- data[[variable]]
  inds <- is.na(values)
  # Nothing to fill, or no observed value to fill from.
  if (!any(inds) || all(inds)) {
    return(values)
  }
  known_dates <- data$date[!inds]
  # vapply() guarantees an integer index vector regardless of input size.
  res_inds <- vapply(
    data$date[inds],
    function(x) which.min(abs(x - known_dates)),
    integer(1)
  )
  values[inds] <- values[!inds][res_inds]
  values
}
# Convert df1 to a data.table, then run na_fill() once per company on that
# company's rows (.SD). The unnamed result column is printed as V1; the
# trailing [] makes the result print.
setDT(df1)
df1[, na_fill(.SD, "industrycode"), by = "stkcd"][]
# stkcd V1
# 1: 10 E22
# 2: 10 E22
# 3: 10 E22
# 4: 10 E22
# 5: 10 E22
# 6: 10 E22
# 7: 10 G45
# 8: 10 G45
# 9: 10 G45
#10: 10 G45
#11: 11 sE22
#12: 11 sE22
#13: 11 sE22
#14: 11 sE22
#15: 11 sE22
#16: 11 sE22
#17: 11 sG45
#18: 11 sG45
#19: 11 sG45
#20: 11 sG45
我有一个如下所示的数据集,其中每条观测对应唯一的公司-年份组合。但由于之前的数据合并,变量 IndustryCode 存在一些 NA。
stkcd date industrycode
10 2002 .
10 2003 .
10 2004 E22
10 2005 E22
10 2006 E22
10 2007 E22
10 2008 G45
10 2009 G45
10 2010 .
10 2011 .
11 2001 .
11 2002 .
11 2003 D23
11 2004 D23
....
我想用同一公司最近年份的值来填补 NA。例如,对于公司 10(stkcd=10),2004 年之前的 IndustryCode 应改为 E22(即 2004 年的值),2009 年之后的 NA 应改为 G45(即 2009 年的值)。
我如何在 R 中实现这一点?
可以连续两次使用 zoo 包中的 na.locf 函数。
该函数会用前面最近的非缺失值替换 NA;由于数据开头也有 NA,还需要从后往前再填充一次。
一些数据:
# Example data: a single company (stkcd 10) observed yearly, with the
# industry code missing at both the start and the end of the series.
dat <- data.frame(
  stkcd = rep(10, 10),
  year = 2002:2011,
  type = c(NA, NA, "E22", "E22", "E22", "E22", "G45", "G45", NA, NA)
)
library(zoo)
# Forward pass: replace each NA with the last observed value before it.
# na.rm = FALSE keeps the leading NAs so the vector length is unchanged.
dat$type <- na.locf(dat$type, na.rm = FALSE)
# Backward pass: the series starts with NAs, which the forward pass cannot
# fill, so carry the next observed value backwards as well.
dat$type <- na.locf(dat$type, na.rm = FALSE, fromLast = TRUE)
# output:
# stkcd year type
# 1 10 2002 E22
# 2 10 2003 E22
# 3 10 2004 E22
# 4 10 2005 E22
# 5 10 2006 E22
# 6 10 2007 E22
# 7 10 2008 G45
# 8 10 2009 G45
# 9 10 2010 G45
# 10 10 2011 G45
如果有多家公司,需要先用 dplyr 的 group_by 按公司分组:
library(dplyr)
library(zoo)
# Fill within each company so that a value is never carried over from one
# company's series into another's.
dat %>%
  group_by(stkcd) %>% # the variable used for the company name
  mutate(
    # Forward fill, keeping leading NAs in place.
    type = na.locf(type, na.rm = FALSE),
    # Backward fill for the NAs left at the start of each company's series.
    type = na.locf(type, na.rm = FALSE, fromLast = TRUE)
  )
例如有 2 家公司:
# Example data: two companies (stkcd 10 and 20), each observed 2002-2011
# with the industry code missing at both ends of its series.
dat <- data.frame(
  stkcd = c(rep(10, 10), rep(20, 10)),
  year = rep(2002:2011, 2),
  type = c(
    NA, NA, "E22", "E22", "E22", "E22", "G45", "G45", NA, NA,
    NA, NA, "E22", "E22", "E22", "E22", "G45", "G45", NA, NA
  )
)
# Fill per company: forward first, then backward for the leading NAs.
dat %>%
  group_by(stkcd) %>% # the variable used for the company name
  mutate(
    type = na.locf(type, na.rm = FALSE),
    type = na.locf(type, na.rm = FALSE, fromLast = TRUE)
  )
# A tibble: 20 x 3
# Groups: stkcd [2]
# stkcd year type
# <dbl> <int> <fct>
# 1 10 2002 E22
# 2 10 2003 E22
# 3 10 2004 E22
# 4 10 2005 E22
# 5 10 2006 E22
# 6 10 2007 E22
# 7 10 2008 G45
# 8 10 2009 G45
# 9 10 2010 G45
# 10 10 2011 G45
# 11 20 2002 E22
# 12 20 2003 E22
# 13 20 2004 E22
# 14 20 2005 E22
# 15 20 2006 E22
# 16 20 2007 E22
# 17 20 2008 G45
# 18 20 2009 G45
# 19 20 2010 G45
# 20 20 2011 G45
当然,需要按公司 group_by!
对于给定的公司,这有效:
# For a single company's rows in df1: replace each missing industry code
# with the code from the row whose date is closest.
# Rows whose industry code is missing.
inds <- is.na(df1$industrycode)
# For every missing row, the position (within the non-missing rows) of the
# closest date. which.min() returns the first minimum, so on a tie the row
# that comes first in df1 wins. vapply() always yields an integer vector,
# so this also works when there is nothing to fill.
resInds <- vapply(
  df1$date[inds],
  function(x) which.min(abs(x - df1$date[!inds])),
  integer(1)
)
df1$industrycode[inds] <- df1$industrycode[!inds][resInds]
# stkcd date industrycode
#1 10 2002 E22
#2 10 2003 E22
#3 10 2004 E22
#4 10 2005 E22
#5 10 2006 E22
#6 10 2007 E22
#7 10 2008 G45
#8 10 2009 G45
#9 10 2010 G45
#10 10 2011 G45
使用data.table
# fread() and setDF() come from data.table, which must be attached first.
library(data.table)
# Sample data for two companies. fread() reads the literal "NA" tokens as
# missing values; setDF() then converts the result to a plain data.frame.
df1 <-
  fread("stkcd date industrycode
10 2002 NA
10 2003 NA
10 2004 E22
10 2005 E22
10 2006 E22
10 2007 E22
10 2008 G45
10 2009 G45
10 2010 NA
10 2011 NA
11 2002 NA
11 2003 NA
11 2004 sE22
11 2005 sE22
11 2006 NA
11 2007 sE22
11 2008 sG45
11 2009 sG45
11 2010 NA
11 2011 NA") %>% setDF
# Fill missing values of one column with the value from the nearest date.
#
# data:     a data.frame (or data.table) with a numeric `date` column.
# variable: name (string) of the column whose NAs should be filled.
#
# Returns the filled column as a vector of the same length as the input.
# Each NA takes the value of the non-missing row whose `date` is closest;
# on a tie the non-missing row that comes first in `data` wins, because
# which.min() returns the first minimum. If the column has no NAs, or has
# nothing but NAs, it is returned unchanged.
na_fill <- function(data, variable) {
  values <- data[[variable]]
  inds <- is.na(values)
  # Nothing to fill, or no observed value to fill from.
  if (!any(inds) || all(inds)) {
    return(values)
  }
  known_dates <- data$date[!inds]
  # vapply() guarantees an integer index vector regardless of input size.
  res_inds <- vapply(
    data$date[inds],
    function(x) which.min(abs(x - known_dates)),
    integer(1)
  )
  values[inds] <- values[!inds][res_inds]
  values
}
# Convert df1 to a data.table, then run na_fill() once per company on that
# company's rows (.SD). The unnamed result column is printed as V1; the
# trailing [] makes the result print.
setDT(df1)
df1[, na_fill(.SD, "industrycode"), by = "stkcd"][]
# stkcd V1
# 1: 10 E22
# 2: 10 E22
# 3: 10 E22
# 4: 10 E22
# 5: 10 E22
# 6: 10 E22
# 7: 10 G45
# 8: 10 G45
# 9: 10 G45
#10: 10 G45
#11: 11 sE22
#12: 11 sE22
#13: 11 sE22
#14: 11 sE22
#15: 11 sE22
#16: 11 sE22
#17: 11 sG45
#18: 11 sG45
#19: 11 sG45
#20: 11 sG45