如何用 R 中的时间顺序列表填充 NA?
How to fill NAs with chronological list in R?
假设我有一个按时间顺序排列的家庭收入列表和一个包含家庭收入的城镇 ID 数据框,但我想填写一些 NA。
# Reference list of household incomes in chronological order
# (10000, 20000, ..., 70000).
HouseholdIncome_list <- seq(10000, 70000, by = 10000)

# Sample data: four rows for town "A" followed by five rows for town "B";
# NA marks the incomes that still have to be filled in.
Town_ID <- rep(c("A", "B"), times = c(4, 5))
HouseholdIncome <- c(
  10000, 40000, 50000, NA,
  20000, 40000, NA, NA, 60000
)
df <- data.frame(Town_ID = Town_ID, HouseholdIncome = HouseholdIncome)
Town_ID HouseholdIncome
1 A 10000
2 A 40000
3 A 50000
4 A NA
5 B 20000
6 B 40000
7 B NA
8 B NA
9 B 60000
如何填充数据框中的 NA,使每个缺失值取列表中紧跟在同一城镇上一个已知值之后的那个值?结果应如下面的 df 所示:
Town_ID HouseholdIncome
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000
我花了时间搜索某种 na 填充选项,但找不到真正符合给定列表的选项
这是一个糟糕的解决方案,但它可以帮助您完成工作。
library(tidyr)
library(dplyr)

# Fill each missing HouseholdIncome with the entry of HouseholdIncome_list that
# follows the last observed income of the same town.
#
# Expects `df` (columns Town_ID, HouseholdIncome) and the ordered reference
# vector `HouseholdIncome_list` to exist. Returns a tibble with the columns
# Town_ID and HouseholdIncome.
#
# A missing value stays NA when the town has no earlier observed income, or
# when the last observed income is not in the list or is its final entry.
df %>%
  # Work on a copy so the original column still tells us which rows were NA.
  mutate(Previous = HouseholdIncome) %>%
  # Carry the last observed income forward within each town only, so a town
  # that starts with NA never inherits a value from the town above it.
  group_by(Town_ID) %>%
  fill(Previous, .direction = "down") %>%
  ungroup() %>%
  mutate(
    # match() gives the position of the carried value in the reference list
    # (NA if absent); + 1 selects the next entry, and indexing past the end
    # of the list yields NA.
    Income = HouseholdIncome_list[match(Previous, HouseholdIncome_list) + 1],
    # Keep observed incomes; use the looked-up value only where missing.
    HouseholdIncome = coalesce(HouseholdIncome, Income)
  ) %>%
  select(Town_ID, HouseholdIncome)
返回:
# A tibble: 9 x 2
Town_ID HouseholdIncome
<chr> <dbl>
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000
如果您的第一项是 NA,这将不起作用。
这是另一种基于连接(join)的方法;当某个组的第一个值缺失时,它也能对该值进行插补:
# Join-based fill: translate each income into its position in the reference
# list, move that position for rows with a missing income, then translate the
# position back into an income.
library(tidyverse)
# Lookup table: one row per reference income, with its position `rn` stored as
# a double.
rdf <- data.frame(HouseholdIncome_list = c(10000, 20000, 30000,40000,50000, 60000, 70000)) %>%
dplyr::mutate(rn = as.double(dplyr::row_number()))
# Sample data; NA marks the incomes to be filled in.
df <- data.frame(Town_ID = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
HouseholdIncome = c(10000, 40000, 50000, NA, 20000, 40000, NA, NA, 60000))
df %>%
# Attach the list position of every income; rows without a match (including
# rows whose income is NA) get rn = NA.
dplyr::left_join(rdf, by = c("HouseholdIncome" = "HouseholdIncome_list")) %>%
dplyr::group_by(Town_ID) %>%
# Within each town, rows with NA rn first take the rn of the nearest row above;
# any rows still NA (at the start of the town) then take the rn of the nearest
# row below.
tidyr::fill(rn, .direction = "down") %>%
tidyr::fill(rn, .direction = "up") %>%
# rn2 is the row's position inside its town (1 = first row of the town).
dplyr::mutate(rn2 = dplyr::row_number()) %>%
dplyr::ungroup() %>%
# Adjust rn only for rows whose income was missing; branches are evaluated in
# order, so the first matching one wins:
#   1. first row of a town, rn already at the lowest list position -> keep rn
#   2. first row of a town otherwise -> step back one list position
#   3. any other missing row below the highest list position -> step forward one
#   4. everything else (observed incomes, or already at the top) -> keep rn
# NOTE(review): a missing row at the start of a town that is not in row 1
# (e.g. the second of two leading NAs) falls into branch 3 and steps forward
# from the up-filled rn - confirm this is intended.
dplyr::mutate(rn = case_when(is.na(HouseholdIncome) & rn2 == 1 & rn == min(rdf$rn) ~ rn,
is.na(HouseholdIncome) & rn2 == 1 ~ rn - 1,
is.na(HouseholdIncome) & rn < max(rdf$rn) ~ rn + 1,
TRUE ~ rn)) %>%
# Translate the (possibly adjusted) position back into an income and expose it
# under the original column name.
dplyr::left_join(rdf, by = "rn") %>%
select(Town_ID, HouseholdIncome = HouseholdIncome_list)
# A tibble: 9 x 2
Town_ID HouseholdIncome
<chr> <dbl>
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000
我会使用 tidyverse 来“作弊”。显然,家庭收入以 10,000 为间隔,因此我们可以利用这一点:
# Shortcut that relies on the reference incomes being exactly 10000 apart:
# carry the last observed income forward and add one 10000 step to the rows
# that were missing.
#
# Expects `df` with the columns Town_ID and HouseholdIncome, and dplyr/tidyr to
# be attached. Returns a data.frame with the same columns. A missing income
# with no earlier observed value in its town stays NA.
df %>%
  mutate(
    # 10000 for rows whose income is missing, 0 for observed rows. This must be
    # computed before fill() overwrites the NAs.
    is_na = as.numeric(is.na(HouseholdIncome)) * 10000
  ) %>%
  # Fill within each town only, so the last income of one town is never
  # carried into a leading NA of the next town.
  group_by(Town_ID) %>%
  fill(HouseholdIncome, .direction = "down") %>%
  ungroup() %>%
  mutate(
    # Observed rows add 0; previously missing rows add one step.
    HouseholdIncome = HouseholdIncome + is_na,
    is_na = NULL
  ) %>%
  # group_by() turned the result into a tibble; hand back a plain data.frame,
  # which is what the ungrouped pipeline produced.
  as.data.frame()
首先我们检查 NA:若为 TRUE,则 is_na = 1 * 10000;然后我们使用 fill 将最后一个已知值向下填充。
最后我们把辅助变量 is_na 与 HouseholdIncome 相加,得到下一个 HouseholdIncome 区间。
输出如下,
Town_ID HouseholdIncome
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000
一个可能的基础 R 选项
# Base R option: within each town, replace every missing income with the most
# recent observed income of that town plus one 10000 step (the spacing of the
# reference list).
#
# Expects `df` with the columns Town_ID and HouseholdIncome; returns a
# data.frame of the same shape. A missing income with no earlier observed value
# in its town stays NA.
transform(
  df,
  HouseholdIncome = ave(
    HouseholdIncome,
    Town_ID,
    FUN = function(x) {
      is_missing <- is.na(x)
      # For every position, the index of the latest observed value at or
      # before it (0 while nothing has been observed yet). Observed positions
      # contribute their own index, missing ones contribute 0, and cummax()
      # carries the latest index forward.
      last_seen <- cummax(seq_along(x) * !is_missing)
      # Only fill rows that are missing AND have an earlier observed value;
      # this avoids indexing with 0 when a town starts with NA.
      fillable <- is_missing & last_seen > 0
      x[fillable] <- x[last_seen[fillable]] + 1e4
      x
    }
  )
)
结果为:
Town_ID HouseholdIncome
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000
假设我有一个按时间顺序排列的家庭收入列表和一个包含家庭收入的城镇 ID 数据框,但我想填写一些 NA。
# Reference list of household incomes in chronological order
# (10000, 20000, ..., 70000).
HouseholdIncome_list <- seq(10000, 70000, by = 10000)

# Sample data: four rows for town "A" followed by five rows for town "B";
# NA marks the incomes that still have to be filled in.
Town_ID <- rep(c("A", "B"), times = c(4, 5))
HouseholdIncome <- c(
  10000, 40000, 50000, NA,
  20000, 40000, NA, NA, 60000
)
df <- data.frame(Town_ID = Town_ID, HouseholdIncome = HouseholdIncome)
Town_ID HouseholdIncome
1 A 10000
2 A 40000
3 A 50000
4 A NA
5 B 20000
6 B 40000
7 B NA
8 B NA
9 B 60000
如何填充数据框中的 NA,使每个缺失值取列表中紧跟在同一城镇上一个已知值之后的那个值?结果应如下面的 df 所示:
Town_ID HouseholdIncome
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000
我花了时间搜索某种 na 填充选项,但找不到真正符合给定列表的选项
这是一个糟糕的解决方案,但它可以帮助您完成工作。
library(tidyr)
library(dplyr)

# Fill each missing HouseholdIncome with the entry of HouseholdIncome_list that
# follows the last observed income of the same town.
#
# Expects `df` (columns Town_ID, HouseholdIncome) and the ordered reference
# vector `HouseholdIncome_list` to exist. Returns a tibble with the columns
# Town_ID and HouseholdIncome.
#
# A missing value stays NA when the town has no earlier observed income, or
# when the last observed income is not in the list or is its final entry.
df %>%
  # Work on a copy so the original column still tells us which rows were NA.
  mutate(Previous = HouseholdIncome) %>%
  # Carry the last observed income forward within each town only, so a town
  # that starts with NA never inherits a value from the town above it.
  group_by(Town_ID) %>%
  fill(Previous, .direction = "down") %>%
  ungroup() %>%
  mutate(
    # match() gives the position of the carried value in the reference list
    # (NA if absent); + 1 selects the next entry, and indexing past the end
    # of the list yields NA.
    Income = HouseholdIncome_list[match(Previous, HouseholdIncome_list) + 1],
    # Keep observed incomes; use the looked-up value only where missing.
    HouseholdIncome = coalesce(HouseholdIncome, Income)
  ) %>%
  select(Town_ID, HouseholdIncome)
返回:
# A tibble: 9 x 2
Town_ID HouseholdIncome
<chr> <dbl>
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000
如果您的第一项是 NA,这将不起作用。
这是另一种基于连接(join)的方法;当某个组的第一个值缺失时,它也能对该值进行插补:
# Join-based fill: translate each income into its position in the reference
# list, move that position for rows with a missing income, then translate the
# position back into an income.
library(tidyverse)
# Lookup table: one row per reference income, with its position `rn` stored as
# a double.
rdf <- data.frame(HouseholdIncome_list = c(10000, 20000, 30000,40000,50000, 60000, 70000)) %>%
dplyr::mutate(rn = as.double(dplyr::row_number()))
# Sample data; NA marks the incomes to be filled in.
df <- data.frame(Town_ID = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
HouseholdIncome = c(10000, 40000, 50000, NA, 20000, 40000, NA, NA, 60000))
df %>%
# Attach the list position of every income; rows without a match (including
# rows whose income is NA) get rn = NA.
dplyr::left_join(rdf, by = c("HouseholdIncome" = "HouseholdIncome_list")) %>%
dplyr::group_by(Town_ID) %>%
# Within each town, rows with NA rn first take the rn of the nearest row above;
# any rows still NA (at the start of the town) then take the rn of the nearest
# row below.
tidyr::fill(rn, .direction = "down") %>%
tidyr::fill(rn, .direction = "up") %>%
# rn2 is the row's position inside its town (1 = first row of the town).
dplyr::mutate(rn2 = dplyr::row_number()) %>%
dplyr::ungroup() %>%
# Adjust rn only for rows whose income was missing; branches are evaluated in
# order, so the first matching one wins:
#   1. first row of a town, rn already at the lowest list position -> keep rn
#   2. first row of a town otherwise -> step back one list position
#   3. any other missing row below the highest list position -> step forward one
#   4. everything else (observed incomes, or already at the top) -> keep rn
# NOTE(review): a missing row at the start of a town that is not in row 1
# (e.g. the second of two leading NAs) falls into branch 3 and steps forward
# from the up-filled rn - confirm this is intended.
dplyr::mutate(rn = case_when(is.na(HouseholdIncome) & rn2 == 1 & rn == min(rdf$rn) ~ rn,
is.na(HouseholdIncome) & rn2 == 1 ~ rn - 1,
is.na(HouseholdIncome) & rn < max(rdf$rn) ~ rn + 1,
TRUE ~ rn)) %>%
# Translate the (possibly adjusted) position back into an income and expose it
# under the original column name.
dplyr::left_join(rdf, by = "rn") %>%
select(Town_ID, HouseholdIncome = HouseholdIncome_list)
# A tibble: 9 x 2
Town_ID HouseholdIncome
<chr> <dbl>
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000
我会使用 tidyverse 来“作弊”。显然,家庭收入以 10,000 为间隔,因此我们可以利用这一点:
# Shortcut that relies on the reference incomes being exactly 10000 apart:
# carry the last observed income forward and add one 10000 step to the rows
# that were missing.
#
# Expects `df` with the columns Town_ID and HouseholdIncome, and dplyr/tidyr to
# be attached. Returns a data.frame with the same columns. A missing income
# with no earlier observed value in its town stays NA.
df %>%
  mutate(
    # 10000 for rows whose income is missing, 0 for observed rows. This must be
    # computed before fill() overwrites the NAs.
    is_na = as.numeric(is.na(HouseholdIncome)) * 10000
  ) %>%
  # Fill within each town only, so the last income of one town is never
  # carried into a leading NA of the next town.
  group_by(Town_ID) %>%
  fill(HouseholdIncome, .direction = "down") %>%
  ungroup() %>%
  mutate(
    # Observed rows add 0; previously missing rows add one step.
    HouseholdIncome = HouseholdIncome + is_na,
    is_na = NULL
  ) %>%
  # group_by() turned the result into a tibble; hand back a plain data.frame,
  # which is what the ungrouped pipeline produced.
  as.data.frame()
首先我们检查 NA:若为 TRUE,则 is_na = 1 * 10000;然后我们使用 fill 将最后一个已知值向下填充。
最后我们把辅助变量 is_na 与 HouseholdIncome 相加,得到下一个 HouseholdIncome 区间。
输出如下,
Town_ID HouseholdIncome
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000
一个可能的基础 R 选项
# Base R option: within each town, replace every missing income with the most
# recent observed income of that town plus one 10000 step (the spacing of the
# reference list).
#
# Expects `df` with the columns Town_ID and HouseholdIncome; returns a
# data.frame of the same shape. A missing income with no earlier observed value
# in its town stays NA.
transform(
  df,
  HouseholdIncome = ave(
    HouseholdIncome,
    Town_ID,
    FUN = function(x) {
      is_missing <- is.na(x)
      # For every position, the index of the latest observed value at or
      # before it (0 while nothing has been observed yet). Observed positions
      # contribute their own index, missing ones contribute 0, and cummax()
      # carries the latest index forward.
      last_seen <- cummax(seq_along(x) * !is_missing)
      # Only fill rows that are missing AND have an earlier observed value;
      # this avoids indexing with 0 when a town starts with NA.
      fillable <- is_missing & last_seen > 0
      x[fillable] <- x[last_seen[fillable]] + 1e4
      x
    }
  )
)
结果为:
Town_ID HouseholdIncome
1 A 10000
2 A 40000
3 A 50000
4 A 60000
5 B 20000
6 B 40000
7 B 50000
8 B 50000
9 B 60000