根据条件将内插行添加到 data.table
Adding interpolated rows to a data.table based on a condition
我有一个数据集如下:
# Example data (appears to be dput() output): a 25-row table in which every
# State value is "Kansas", with columns year, State, State_Abbr, Party,
# Percentage and cyear. Each column carries a `comment` attribute and the
# character columns are stored with class "AsIs".
# The "data.table" class is only set through structure() here; the question's
# code below calls setDT(DT) before using data.table syntax on it.
DT <- structure(list(year = structure(c(1993, 1993, 1993, 1997, 1997,
1997, 1999, 1999, 1999, 2003, 2003, 2005, 2005, 2005, 2009, 2009,
2009, 2011, 2011, 2011, 2015, 2015, 2017, 2017, 2017), comment = "year"),
State = structure(c("Kansas", "Kansas", "Kansas", "Kansas",
"Kansas", "Kansas", "Kansas", "Kansas", "Kansas", "Kansas",
"Kansas", "Kansas", "Kansas", "Kansas", "Kansas", "Kansas",
"Kansas", "Kansas", "Kansas", "Kansas", "Kansas", "Kansas",
"Kansas", "Kansas", "Kansas"), class = "AsIs", comment = "state"),
State_Abbr = structure(c("KS", "KS", "KS", "KS", "KS", "KS",
"KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS",
"KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS"), class = "AsIs", comment = "state_po"),
Party = structure(c("republican", "democrat", "Other", "republican",
"democrat", "Other", "republican", "democrat", "Other", "republican",
"Other", "democrat", "republican", "Other", "republican",
"democrat", "Other", "democrat", "republican", "Other", "republican",
"Other", "democrat", "republican", "Other"), class = "AsIs", comment = "party"),
Percentage = structure(c(0.626967802302283, 0.310289787269175,
0.0627424104285421, 0.620238525135418, 0.344369476385061,
0.035391998479521, 0.652661584410013, 0.315878201849193,
0.0314602137407939, 0.825223659651155, 0.174776340348845,
0.274872411697912, 0.691627798218281, 0.033499790083807,
0.600583964516102, 0.364584658335329, 0.0348313771485682,
0.263785496339944, 0.70094378363408, 0.0352707200259761,
0.531464769317622, 0.468535230682378, 0.322381278217064,
0.621752543886607, 0.0558661778963293), comment = "totalvotes"),
cyear = structure(c(1992L, 1992L, 1992L, 1996L, 1996L, 1996L,
1998L, 1998L, 1998L, 2002L, 2002L, 2004L, 2004L, 2004L, 2008L,
2008L, 2008L, 2010L, 2010L, 2010L, 2014L, 2014L, 2016L, 2016L,
2016L), comment = "year")), row.names = c(NA, -25L), class = c("data.table",
"data.frame"))
部分数据如下所示:
虽然参议院选举通常每两年举行一次,但堪萨斯州在 1994 年(cyear)没有举行选举。尽管如此,我还是想得到 1994 年各政党得票比例的估计值,因此我想添加一行。我尝试按如下方式添加这些行:
编辑:
# Flag, within each State/Party series, every row that follows a gap of more
# than two years, then replace each flag (1) by that row's position in the table.
# setDT() converts DT by reference, so Numbers and DT refer to the same table.
setDT(DT)
Numbers <- DT[, newcolumn := as.integer(year - shift(year) > 2), by = .(State, Party)]
Numbers[, uniqueID := .I]
Numbers[newcolumn == 1, newcolumn := uniqueID]
Numbers[, uniqueID := NULL]
# X is a data.table of the flagged rows, i.e. the rows that should be added
X <- Numbers[!is.na(newcolumn) & newcolumn != 0]
# sub is a vector of the row numbers where those rows should be added
sub <- X[, newcolumn]
# Insert `newrow` into `existingDF` so that it ends up directly in front of
# row number `r`.
#
# existingDF: table to insert into. `use.names` is an argument of data.table's
#             rbind method, so this is presumably always a data.table.
# newrow:     row(s) to insert; columns are matched by position, not by name.
# r:          row number(s) of `existingDF` in front of which the new row(s)
#             are placed -- one value per row of `newrow`.
# Returns the combined table with the new row(s) moved into place.
insertRow2 <- function(existingDF, newrow, r) {
  # Count the rows *before* binding: seq_len() stays correct for an empty
  # table (1:(n - 1) would yield c(1, 0)) and for more than one new row.
  n_existing <- nrow(existingDF)
  combined <- rbind(existingDF, newrow, use.names = FALSE)
  # Existing rows keep the sort keys 1..n; each appended row gets r - 0.5 and
  # therefore sorts immediately in front of existing row r.
  combined <- combined[order(c(seq_len(n_existing), r - 0.5)), ]
  row.names(combined) <- seq_len(nrow(combined))
  combined
}
# Insert a copy of every flagged row of X into Numbers, directly in front of
# the row number stored at the same position of `sub`.
# The loop runs over the *positions* of `sub`: iterating over its values (the
# row numbers themselves) and then using them as indices into X and `sub`
# selects the wrong -- or non-existent -- elements.
# It also runs from the last position to the first, so that an insertion never
# shifts the row numbers that still have to be processed.
for (i in rev(seq_along(sub))) {
  newrow <- X[i, ]
  Numbers <- insertRow2(Numbers, newrow, sub[i])
}
我想我最后一个 for 循环还有问题。谁能帮帮我?
期望的结果:
如果我没理解错的话,你想要 (a) 补全(fill-in)缺失的年份,以及 (b) 把上一年的信息向后结转(carry-forward),用来填充这些缺失年份的数据。这是插值的一种形式,此外还有许多其他形式。
为此,您可以:
- 使用 State-Party-Year 的所有可能组合创建一个“矩形”数据集。
- 将您的数据集合并到该矩形数据中
- 按 State-Party-Year 排序
- 使用 zoo 包中的 na.locf 函数,把上一条观测的信息结转(carry-forward)到新补全(fill-in)的行中。
library(data.table)
library(zoo)
# Rectangular grid: one row for every year in the observed range, crossed
# with every State and every Party
dat <- CJ(
  year = min(DT$year):max(DT$year),
  State = unique(DT$State),
  Party = unique(DT$Party)
)
# Left-join the observed data onto the grid; combinations without an
# observation are kept and get NA in the remaining columns
dat <- merge(dat, DT, by = c("year", "State", "Party"), all.x = TRUE)
# Sort (and key) by State, Party, year so each series is in chronological order
setkey(dat, State, Party, year)
# Within each State/Party series, carry the last observed value forward into
# the NA rows; na.rm = FALSE keeps leading NAs instead of dropping them
dat <- dat[,
  lapply(.SD, na.locf, na.rm = FALSE),
  by = .(State, Party)
]
# Show the first rows
head(dat)
#> year State Party State_Abbr Percentage cyear
#> 1: 1993 Kansas Other KS 0.06274241 1992
#> 2: 1994 Kansas Other KS 0.06274241 1992
#> 3: 1995 Kansas Other KS 0.06274241 1992
#> 4: 1996 Kansas Other KS 0.06274241 1992
#> 5: 1997 Kansas Other KS 0.03539200 1996
#> 6: 1998 Kansas Other KS 0.03539200 1996
我有一个数据集如下:
# Example data (appears to be dput() output): a 25-row table in which every
# State value is "Kansas", with columns year, State, State_Abbr, Party,
# Percentage and cyear. Each column carries a `comment` attribute and the
# character columns are stored with class "AsIs".
# The "data.table" class is only set through structure() here; the question's
# code below calls setDT(DT) before using data.table syntax on it.
DT <- structure(list(year = structure(c(1993, 1993, 1993, 1997, 1997,
1997, 1999, 1999, 1999, 2003, 2003, 2005, 2005, 2005, 2009, 2009,
2009, 2011, 2011, 2011, 2015, 2015, 2017, 2017, 2017), comment = "year"),
State = structure(c("Kansas", "Kansas", "Kansas", "Kansas",
"Kansas", "Kansas", "Kansas", "Kansas", "Kansas", "Kansas",
"Kansas", "Kansas", "Kansas", "Kansas", "Kansas", "Kansas",
"Kansas", "Kansas", "Kansas", "Kansas", "Kansas", "Kansas",
"Kansas", "Kansas", "Kansas"), class = "AsIs", comment = "state"),
State_Abbr = structure(c("KS", "KS", "KS", "KS", "KS", "KS",
"KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS",
"KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS", "KS"), class = "AsIs", comment = "state_po"),
Party = structure(c("republican", "democrat", "Other", "republican",
"democrat", "Other", "republican", "democrat", "Other", "republican",
"Other", "democrat", "republican", "Other", "republican",
"democrat", "Other", "democrat", "republican", "Other", "republican",
"Other", "democrat", "republican", "Other"), class = "AsIs", comment = "party"),
Percentage = structure(c(0.626967802302283, 0.310289787269175,
0.0627424104285421, 0.620238525135418, 0.344369476385061,
0.035391998479521, 0.652661584410013, 0.315878201849193,
0.0314602137407939, 0.825223659651155, 0.174776340348845,
0.274872411697912, 0.691627798218281, 0.033499790083807,
0.600583964516102, 0.364584658335329, 0.0348313771485682,
0.263785496339944, 0.70094378363408, 0.0352707200259761,
0.531464769317622, 0.468535230682378, 0.322381278217064,
0.621752543886607, 0.0558661778963293), comment = "totalvotes"),
cyear = structure(c(1992L, 1992L, 1992L, 1996L, 1996L, 1996L,
1998L, 1998L, 1998L, 2002L, 2002L, 2004L, 2004L, 2004L, 2008L,
2008L, 2008L, 2010L, 2010L, 2010L, 2014L, 2014L, 2016L, 2016L,
2016L), comment = "year")), row.names = c(NA, -25L), class = c("data.table",
"data.frame"))
部分数据如下所示:
虽然参议院选举通常每两年举行一次,但堪萨斯州在 1994 年(cyear)没有举行选举。尽管如此,我还是想得到 1994 年各政党得票比例的估计值,因此我想添加一行。我尝试按如下方式添加这些行:
编辑:
# Flag, within each State/Party series, every row that follows a gap of more
# than two years, then replace each flag (1) by that row's position in the table.
# setDT() converts DT by reference, so Numbers and DT refer to the same table.
setDT(DT)
Numbers <- DT[, newcolumn := as.integer(year - shift(year) > 2), by = .(State, Party)]
Numbers[, uniqueID := .I]
Numbers[newcolumn == 1, newcolumn := uniqueID]
Numbers[, uniqueID := NULL]
# X is a data.table of the flagged rows, i.e. the rows that should be added
X <- Numbers[!is.na(newcolumn) & newcolumn != 0]
# sub is a vector of the row numbers where those rows should be added
sub <- X[, newcolumn]
# Insert `newrow` into `existingDF` so that it ends up directly in front of
# row number `r`.
#
# existingDF: table to insert into. `use.names` is an argument of data.table's
#             rbind method, so this is presumably always a data.table.
# newrow:     row(s) to insert; columns are matched by position, not by name.
# r:          row number(s) of `existingDF` in front of which the new row(s)
#             are placed -- one value per row of `newrow`.
# Returns the combined table with the new row(s) moved into place.
insertRow2 <- function(existingDF, newrow, r) {
  # Count the rows *before* binding: seq_len() stays correct for an empty
  # table (1:(n - 1) would yield c(1, 0)) and for more than one new row.
  n_existing <- nrow(existingDF)
  combined <- rbind(existingDF, newrow, use.names = FALSE)
  # Existing rows keep the sort keys 1..n; each appended row gets r - 0.5 and
  # therefore sorts immediately in front of existing row r.
  combined <- combined[order(c(seq_len(n_existing), r - 0.5)), ]
  row.names(combined) <- seq_len(nrow(combined))
  combined
}
# Insert a copy of every flagged row of X into Numbers, directly in front of
# the row number stored at the same position of `sub`.
# The loop runs over the *positions* of `sub`: iterating over its values (the
# row numbers themselves) and then using them as indices into X and `sub`
# selects the wrong -- or non-existent -- elements.
# It also runs from the last position to the first, so that an insertion never
# shifts the row numbers that still have to be processed.
for (i in rev(seq_along(sub))) {
  newrow <- X[i, ]
  Numbers <- insertRow2(Numbers, newrow, sub[i])
}
我想我最后一个 for 循环还有问题。谁能帮帮我?
期望的结果:
如果我没理解错的话,你想要 (a) 补全(fill-in)缺失的年份,以及 (b) 把上一年的信息向后结转(carry-forward),用来填充这些缺失年份的数据。这是插值的一种形式,此外还有许多其他形式。
为此,您可以:
- 使用 State-Party-Year 的所有可能组合创建一个“矩形”数据集。
- 将您的数据集合并到该矩形数据中
- 按 State-Party-Year 排序
- 使用 zoo 包中的 na.locf 函数,把上一条观测的信息结转(carry-forward)到新补全(fill-in)的行中。
library(data.table)
library(zoo)
# Rectangular grid: one row for every year in the observed range, crossed
# with every State and every Party
dat <- CJ(
  year = min(DT$year):max(DT$year),
  State = unique(DT$State),
  Party = unique(DT$Party)
)
# Left-join the observed data onto the grid; combinations without an
# observation are kept and get NA in the remaining columns
dat <- merge(dat, DT, by = c("year", "State", "Party"), all.x = TRUE)
# Sort (and key) by State, Party, year so each series is in chronological order
setkey(dat, State, Party, year)
# Within each State/Party series, carry the last observed value forward into
# the NA rows; na.rm = FALSE keeps leading NAs instead of dropping them
dat <- dat[,
  lapply(.SD, na.locf, na.rm = FALSE),
  by = .(State, Party)
]
# Show the first rows
head(dat)
#> year State Party State_Abbr Percentage cyear
#> 1: 1993 Kansas Other KS 0.06274241 1992
#> 2: 1994 Kansas Other KS 0.06274241 1992
#> 3: 1995 Kansas Other KS 0.06274241 1992
#> 4: 1996 Kansas Other KS 0.06274241 1992
#> 5: 1997 Kansas Other KS 0.03539200 1996
#> 6: 1998 Kansas Other KS 0.03539200 1996