根据出现时间插补分类变量的 NA

Question

我有一个如下所示的数据集，其中每一行都是唯一的“公司-年度”观测值。但由于之前做过数据合并，变量 industrycode 中存在一些 NA（下面用 . 表示）。

stkcd date industrycode
10    2002   .
10    2003   .
10    2004   E22
10    2005   E22
10    2006   E22
10    2007   E22
10    2008   G45
10    2009   G45
10    2010   .
10    2011   .
11    2001   .
11    2002   .
11    2003   D23
11    2004   D23
....

我想用同一公司时间上最接近的年份的值来填补 NA。例如对于公司 10（stkcd = 10），2004 年之前的 industrycode 应改为 E22（即 2004 年的值），2009 年之后的 NA 应改为 G45（即 2009 年的值）。

我如何在 R 中实现这一点？

Answer 1

可以试试把 zoo 包中的 na.locf 函数应用两次。

简单来说，它会用前面最近的一个非 NA 值替换 NA；由于数据开头也是 NA，还需要再从后往前填充一次。

一些数据：

# Example panel for a single firm (stkcd 10) covering 2002-2011. The industry
# code is missing in the first two and the last two years.
dat <- data.frame(
  stkcd = rep(10, times = 10),
  year = 2002:2011,
  type = rep(c(NA, "E22", "G45", NA), times = c(2, 4, 2, 2))
)


library(zoo)
# Forward pass: replace each NA with the last non-NA value before it.
# na.rm = FALSE keeps the leading NAs so the column length is unchanged.
dat$type <- na.locf(dat$type, na.rm = FALSE)
# Backward pass: the series starts with NAs that have no earlier value to
# inherit, so fill them from the next observed value instead.
dat$type <- na.locf(dat$type, na.rm = FALSE, fromLast = TRUE)

# output:

# stkcd year type
# 1     10 2002  E22
# 2     10 2003  E22
# 3     10 2004  E22
# 4     10 2005  E22
# 5     10 2006  E22
# 6     10 2007  E22
# 7     10 2008  G45
# 8     10 2009  G45
# 9     10 2010  G45
# 10    10 2011  G45

如果有多家公司，需要先用 dplyr 的 group_by 按公司分组：

library(dplyr)
library(zoo)

# Fill within each company so values never leak from one firm to another.
dat %>%
  group_by(stkcd) %>% # the variable used for the company name
  mutate(
    # forward fill first, keeping leading NAs in place
    type = na.locf(type, na.rm = FALSE),
    # then fill the remaining leading NAs from the next observed value
    type = na.locf(type, na.rm = FALSE, fromLast = TRUE)
  )

例如有 2 家公司：

# Example panel with two firms (stkcd 10 and 20), each observed 2002-2011 and
# each missing the industry code in its first two and last two years.
dat <- data.frame(
  stkcd = rep(c(10, 20), each = 10),
  year = rep(2002:2011, times = 2),
  type = rep(
    rep(c(NA, "E22", "G45", NA), times = c(2, 4, 2, 2)),
    times = 2
  )
)

# Fill within each company so values never leak from one firm to another.
dat %>%
  group_by(stkcd) %>% # the variable used for the company name
  mutate(
    # forward fill first, keeping leading NAs in place
    type = na.locf(type, na.rm = FALSE),
    # then fill the remaining leading NAs from the next observed value
    type = na.locf(type, na.rm = FALSE, fromLast = TRUE)
  )

# A tibble: 20 x 3
# Groups:   stkcd [2]
# stkcd  year type 
# <dbl> <int> <fct>
#   1    10  2002 E22  
# 2    10  2003 E22  
# 3    10  2004 E22  
# 4    10  2005 E22  
# 5    10  2006 E22  
# 6    10  2007 E22  
# 7    10  2008 G45  
# 8    10  2009 G45  
# 9    10  2010 G45  
# 10    10  2011 G45  
# 11    20  2002 E22  
# 12    20  2003 E22  
# 13    20  2004 E22  
# 14    20  2005 E22  
# 15    20  2006 E22  
# 16    20  2007 E22  
# 17    20  2008 G45  
# 18    20  2009 G45  
# 19    20  2010 G45  
# 20    20  2011 G45

Answer 2

当然，这需要配合按公司分组（例如 group_by）一起使用！

对于单个公司，下面的代码可以实现：

# Rows whose industry code is missing.
inds <- is.na(df1$industrycode)

# Only fill when there is something to fill and at least one observed value to
# copy from; otherwise the column is left untouched instead of raising an
# "invalid subscript" error.
if (any(inds) && !all(inds)) {
  # For every missing row, the position (among the observed rows) of the
  # closest date. which.min() returns the first minimum, so on a tie the
  # observed row that appears first wins.
  resInds <- vapply(
    df1$date[inds],
    function(x) which.min(abs(x - df1$date[!inds])),
    integer(1)
  )

  df1$industrycode[inds] <- df1$industrycode[!inds][resInds]
}

#   stkcd date industrycode
#1     10 2002          E22
#2     10 2003          E22
#3     10 2004          E22
#4     10 2005          E22
#5     10 2006          E22
#6     10 2007          E22
#7     10 2008          G45
#8     10 2009          G45
#9     10 2010          G45
#10    10 2011          G45

使用data.table

# fread() and setDF() are provided by data.table.
library(data.table)

# Two-company example data parsed from inline text, converted to a plain
# data.frame. Company 11 also has a gap (2006) between observed values.
df1 <- setDF(fread("stkcd date industrycode
10    2002   NA
10    2003   NA
10    2004   E22
10    2005   E22
10    2006   E22
10    2007   E22
10    2008   G45
10    2009   G45
10    2010   NA
10    2011   NA
11    2002   NA
11    2003   NA
11    2004   sE22
11    2005   sE22
11    2006   NA
11    2007   sE22
11    2008   sG45
11    2009   sG45
11    2010   NA
11    2011   NA"))

#' Fill missing values with the value observed at the nearest time point
#'
#' Each `NA` in `data[[variable]]` is replaced by the non-missing value whose
#' time (taken from `data[[time_var]]`) is closest. On a tie, the observed row
#' that appears first in `data` is used.
#'
#' @param data A data.frame-like object (also works with data.table's `.SD`).
#' @param variable Name of the column to fill, as a string.
#' @param time_var Name of the column holding the time index, as a string.
#'   Defaults to `"date"`.
#' @return The filled column as a vector of the same length as the input.
#'   It is returned unchanged when it has no `NA`, or when every value is
#'   `NA` and there is nothing to fill from.
na_fill <- function(data, variable, time_var = "date") {
  values <- data[[variable]]
  times <- data[[time_var]]
  missing <- is.na(values)

  # Nothing to fill, or no observed value to copy from.
  if (!any(missing) || all(missing)) {
    return(values)
  }

  known_times <- times[!missing]
  # vapply() guarantees an integer index vector; sapply() would return a list
  # for empty input and break the subsetting below.
  nearest <- vapply(
    times[missing],
    function(x) which.min(abs(x - known_times)),
    integer(1)
  )

  # Work on the extracted vector rather than assigning into `data`.
  values[missing] <- values[!missing][nearest]
  values
}

# Convert df1 to a data.table by reference, then run na_fill() separately for
# each company. The filled values come back in an unnamed column (V1) next to
# the grouping key; the trailing [] forces the result to print.
setDT(df1)
df1[, na_fill(.SD, variable = "industrycode"), by = "stkcd"][]

#    stkcd   V1
# 1:    10  E22
# 2:    10  E22
# 3:    10  E22
# 4:    10  E22
# 5:    10  E22
# 6:    10  E22
# 7:    10  G45
# 8:    10  G45
# 9:    10  G45
#10:    10  G45
#11:    11 sE22
#12:    11 sE22
#13:    11 sE22
#14:    11 sE22
#15:    11 sE22
#16:    11 sE22
#17:    11 sG45
#18:    11 sG45
#19:    11 sG45
#20:    11 sG45

根据出现时间插补分类变量的 NA

Impute NA of a Categorical Variable based on the Timing of Appearance

r

time-series

na

imputation