根据条件将内插行添加到 data.table

Question

我有一个数据集如下：

# Example data (dput output): 25 rows, all for Kansas, with columns year,
# State, State_Abbr, Party, Percentage and cyear. In every row listed here
# cyear equals year - 1. The object is rebuilt with structure() and given the
# class c("data.table", "data.frame").
DT <- structure(list(year = structure(c(1993, 1993, 1993, 1997, 1997, 
1997, 1999, 1999, 1999, 2003, 2003, 2005, 2005, 2005, 2009, 2009, 
2009, 2011, 2011, 2011, 2015, 2015, 2017, 2017, 2017), comment = "year"), 
    State = structure(c("Kansas", "Kansas", "Kansas", "Kansas", 
    "Kansas", "Kansas", "Kansas", "Kansas", "Kansas", "Kansas", 
    "Kansas", "Kansas", "Kansas", "Kansas", "Kansas", "Kansas", 
    "Kansas", "Kansas", "Kansas", "Kansas", "Kansas", "Kansas", 
    "Kansas", "Kansas", "Kansas"), class = "AsIs", comment = "state"), 
    State_Abbr = structure(c("KS", "KS", "KS", "KS", "KS", "KS", 
    "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", 
    "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS"), class = "AsIs", comment = "state_po"), 
    Party = structure(c("republican", "democrat", "Other", "republican", 
    "democrat", "Other", "republican", "democrat", "Other", "republican", 
    "Other", "democrat", "republican", "Other", "republican", 
    "democrat", "Other", "democrat", "republican", "Other", "republican", 
    "Other", "democrat", "republican", "Other"), class = "AsIs", comment = "party"), 
    Percentage = structure(c(0.626967802302283, 0.310289787269175, 
    0.0627424104285421, 0.620238525135418, 0.344369476385061, 
    0.035391998479521, 0.652661584410013, 0.315878201849193, 
    0.0314602137407939, 0.825223659651155, 0.174776340348845, 
    0.274872411697912, 0.691627798218281, 0.033499790083807, 
    0.600583964516102, 0.364584658335329, 0.0348313771485682, 
    0.263785496339944, 0.70094378363408, 0.0352707200259761, 
    0.531464769317622, 0.468535230682378, 0.322381278217064, 
    0.621752543886607, 0.0558661778963293), comment = "totalvotes"), 
    cyear = structure(c(1992L, 1992L, 1992L, 1996L, 1996L, 1996L, 
    1998L, 1998L, 1998L, 2002L, 2002L, 2004L, 2004L, 2004L, 2008L, 
    2008L, 2008L, 2010L, 2010L, 2010L, 2014L, 2014L, 2016L, 2016L, 
    2016L), comment = "year")), row.names = c(NA, -25L), class = c("data.table", 
"data.frame"))

部分数据如下所示：

虽然参议院选举通常每两年举行一次，但堪萨斯州在 1994 年 (cyear) 没有举行选举。尽管如此，我还是想得到 1994 年各政党得票比例的估计值，因此想为这一年添加一行。为了添加这一行，我尝试了如下方法：

编辑：

# Flag, within each State/Party group, the rows whose year is more than two
# years after the previous row's year; these mark where a row should be added.
# setDT() converts DT by reference, so Numbers and DT are the same table.
Numbers <- setDT(DT)
Numbers[, newcolumn := as.integer(year - shift(year) > 2), by = .(State, Party)]

# Replace every flag (1) with the row number of the flagged row. The first row
# of each group stays NA because shift() has no previous year to compare with.
flagged <- Numbers[, which(newcolumn == 1L)]
Numbers[flagged, newcolumn := flagged]

# X is a data.table of the rows that should be added (flagged rows only)
X <- Numbers[!is.na(newcolumn) & newcolumn != 0]
# sub is a vector of the row numbers where those rows should be added
sub <- X[["newcolumn"]]

# Insert `newrow` into `existingDF` immediately before row number `r`.
#
# Arguments:
#   existingDF  data.frame or data.table to insert into.
#   newrow      row(s) to insert; must have as many columns as existingDF.
#               Columns are matched by position, not by name.
#   r           a single row number; the new row(s) end up just before the
#               row currently at position r. An NA position sorts last, so
#               the new row(s) stay at the end.
# Returns: the combined table with the new row(s) in place and row names
#          renumbered 1..n. The inputs are not modified.
insertRow2 <- function(existingDF, newrow, r) {
  stopifnot(is.numeric(r), length(r) == 1L)

  is_dt <- inherits(existingDF, "data.table")
  n_existing <- nrow(existingDF)

  if (is_dt) {
    # use.names is understood by data.table's rbind method only.
    combined <- rbind(existingDF, newrow, use.names = FALSE)
  } else {
    # Base rbind() would bind `use.names = FALSE` as one more row of data,
    # so positional binding is obtained by aligning the column names instead.
    stopifnot(length(newrow) == length(existingDF))
    names(newrow) <- names(existingDF)
    combined <- rbind(existingDF, newrow)
  }

  # Existing rows keep the keys 1..n; every appended row gets r - 0.5, which
  # sorts between rows r - 1 and r. Counting the appended rows (instead of
  # assuming exactly one) keeps the key vector the right length for empty or
  # multi-row input.
  n_added <- nrow(combined) - n_existing
  sort_keys <- c(seq_len(n_existing), rep(r - 0.5, n_added))
  new_order <- order(sort_keys)

  combined <- if (is_dt) {
    combined[new_order, ]
  } else {
    # drop = FALSE keeps a one-column data.frame from collapsing to a vector
    combined[new_order, , drop = FALSE]
  }

  row.names(combined) <- seq_len(nrow(combined))
  combined
}

# Insert a copy of every flagged row just before its original position.
# The loop runs over the *indices* of sub, so X[i, ] and sub[i] refer to the
# same flagged row (looping over the values of sub would use row numbers as
# indices and run past the end of X and sub).
# It goes from the last position to the first so that an insertion never
# shifts a row position that is still waiting to be processed.
for (i in rev(seq_along(sub))) {
  newrow <- X[i, ]
  Numbers <- insertRow2(Numbers, newrow, sub[i])
}

我想我最后一个 for 循环还有问题。谁能帮帮我？

期望的结果：

Answer 1

如果我没理解错的话，你想要 (a) 补全缺失的年份，以及 (b) 用上一年的信息向前填充（carry forward）这些缺失年份的数据。这是插值的一种形式，此外还有许多其他形式。

为此，您可以：

使用 State-Party-Year 的所有可能组合创建一个“矩形”数据集。
将您的数据集合并到该矩形数据中
按 State-Party-Year 排序
使用 zoo 包中的 na.locf 函数，将上一条观测值向前填充（carry forward）到新增的行中。

library(data.table)
library(zoo)

# Rectangular data: one row for every year in the observed range, crossed
# with every State and every Party present in DT.
year_span <- range(DT$year)
dat <- CJ(
  year = year_span[1L]:year_span[2L],
  State = unique(DT$State),
  Party = unique(DT$Party)
)

# Merge: keep every grid row (all.x = TRUE); rows without a match in DT get NA.
dat <- merge(dat, DT, all.x = TRUE)

# Sort by State, Party, year (setkey reorders dat by reference).
setkey(dat, State, Party, year)

# Carry-forward: within each State/Party group, fill NA values with the last
# non-missing value above them; na.rm = FALSE keeps leading NAs in place.
dat <- dat[, lapply(.SD, na.locf, na.rm = FALSE), by = .(State, Party)]

# print
head(dat)
#>    year  State Party State_Abbr Percentage cyear
#> 1: 1993 Kansas Other         KS 0.06274241  1992
#> 2: 1994 Kansas Other         KS 0.06274241  1992
#> 3: 1995 Kansas Other         KS 0.06274241  1992
#> 4: 1996 Kansas Other         KS 0.06274241  1992
#> 5: 1997 Kansas Other         KS 0.03539200  1996
#> 6: 1998 Kansas Other         KS 0.03539200  1996

根据条件将内插行添加到 data.table

Adding interpolated rows to a data.table based on a condition

interpolation

r

rows

conditional-statements

data.table