按行检查 NA,然后聚合它们的列名
Checking for NAs by Row, then Aggregating their Column Names
我目前正在尝试逐行找出哪些变量(列)的值是 NA。下面展示了我的部分数据的示例,以及对应的 dput 输出。
head(big_test)
# A tibble: 3 x 19
id ctr_n ctr yr mn nvvi ENP_nat ENP_avg ENP_wght inflation1 inflation2 inflation3 inflation4 PSNS PSNS_s PSNS_w
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
1 1854 Isra~ 376 2019 4 1 3.50 3.50 3.50 NA NA NA NA NA 0.962 NA
2 1855 Isra~ 376 2019 9 1 2.51 2.51 2.51 NA NA NA NA NA 0.992 NA
3 1856 Isra~ 376 2020 3 1 3.78 3.78 3.78 NA NA NA NA NA 0.999 NA
# ... with 3 more variables: PSNS_sw <chr>, local_E <dbl>, cst_tot <dbl>
dput(big_test)
structure(list(id = c(1854, 1855, 1856), ctr_n = c("Israel",
"Israel", "Israel"), ctr = c(376, 376, 376), yr = c(2019, 2019,
2020), mn = c(4, 9, 3), nvvi = c(1, 1, 1), ENP_nat = c(3.50348063163162,
2.51319610127466, 3.78468892335972), ENP_avg = c(3.50348063163162,
2.51319610127466, 3.78468892335972), ENP_wght = c(3.50348063163162,
2.51319610127466, 3.78468892335972), inflation1 = c("NA", "NA",
"NA"), inflation2 = c("NA", "NA", "NA"), inflation3 = c("NA",
"NA", "NA"), inflation4 = c("NA", "NA", "NA"), PSNS = c("NA",
"NA", "NA"), PSNS_s = c(0.961748183147869, 0.992275075925835,
0.998547438416594), PSNS_w = c("NA", "NA", "NA"), PSNS_sw = c("NA",
"NA", "NA"), local_E = c(1, 1, 1), cst_tot = c(1, 1, 1)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
编辑:这里的 NAs 用引号引起来,这是不合适的。我认为问题出在写入 .xlsx;下面的 dput
输出中列出了正确的版本,NA 上没有引号。
如您所见,数据在国家层面通过选举分开,这里的每一行都应该是唯一的(即以色列,2019 年,第 4 个月)。 我想创建一个字符列,列出此输出中缺少的变量。这是所需列的示例:
# Illustrative target: a one-row tibble whose single character column lists
# the missing variable names separated by semicolons.
desired_output <- tibble(
  missing_vars = paste(
    c("inflation1", "inflation2", "inflation3", "inflation4", "etc"),
    collapse = ";"
  )
)
head(desired_output)
# A tibble: 1 x 1
missing_vars
<chr>
1 inflation1;inflation2;inflation3;inflation4;etc
因此,我想知道是否有某种循环可以按每次选举(即每一行)切分数据,检查哪些列缺失,然后把这些缺失列的名称合并成一个字符串?这一步必须能够自动化,因为对于同一个国家/年份,可能有些变量缺失而其他变量存在。我曾尝试统计缺失值的个数,但不知道如何把这些列名列成一个字符列。
感谢您的帮助。谢谢!
如 @akrun 所述,您的数据中是字符串 "NA",而不是真正的缺失值 NA。修复之后,您可以定义一个名为 missing 的函数,并将其应用于每一行来创建一个新变量:
# Example data with real (unquoted) NA values. The repeated vectors are built
# once inside local() so that no helper variables leak into the workspace.
big_test <- local({
  enp <- c(3.50348063163162, 2.51319610127466, 3.78468892335972)
  all_na <- rep(NA, 3)
  ones <- rep(1, 3)
  structure(
    list(
      id = c(1854, 1855, 1856),
      ctr_n = rep("Israel", 3),
      ctr = rep(376, 3),
      yr = c(2019, 2019, 2020),
      mn = c(4, 9, 3),
      nvvi = ones,
      ENP_nat = enp,
      ENP_avg = enp,
      ENP_wght = enp,
      inflation1 = all_na,
      inflation2 = all_na,
      inflation3 = all_na,
      inflation4 = all_na,
      PSNS = all_na,
      PSNS_s = c(0.961748183147869, 0.992275075925835, 0.998547438416594),
      PSNS_w = all_na,
      PSNS_sw = all_na,
      local_E = ones,
      cst_tot = ones
    ),
    row.names = c(NA, -3L),
    class = c("tbl_df", "tbl", "data.frame")
  )
})
#' Collect the names of the missing (NA) entries of one row.
#'
#' Column names are taken from the row itself rather than from a global data
#' frame, so the function works with any data passed through `apply(df, 1, ...)`.
#'
#' @param x A named vector or list holding the values of a single row, such as
#'   the named vector that `apply(df, 1, ...)` passes for each row.
#' @return A length-one character string with the names of the NA entries
#'   joined by ", "; the empty string "" when nothing is missing.
# NOTE: this name shadows base::missing() for code evaluated in the same
# environment; it is kept so that existing calls keep working.
missing <- function(x) {
  row_values <- unlist(x)
  paste(names(row_values)[is.na(row_values)], collapse = ", ")
}
# Run missing() over every row (MARGIN = 1) and store the resulting strings
# in a new column, then display that column.
big_test[["missing"]] <- apply(big_test, MARGIN = 1, FUN = missing)
big_test[["missing"]]
#> [1] "inflation1, inflation2, inflation3, inflation4, PSNS, PSNS_w, PSNS_sw"
#> [2] "inflation1, inflation2, inflation3, inflation4, PSNS, PSNS_w, PSNS_sw"
#> [3] "inflation1, inflation2, inflation3, inflation4, PSNS, PSNS_w, PSNS_sw"
这是一个使用 tidyverse 的方案:先用 pivot_longer 把数据重塑为 "long"(长)格式,再按 row_number() 分组,然后把 value 列中为缺失值的那些行所对应的列名拼接(paste,此处用 str_c)成一个字符串。
library(dplyr)
library(tidyr)
library(stringr)
# Long-format approach: reshape the inflation columns to one row per
# (original row, column) pair, then collapse the names of the columns whose
# cell is NA back into a single ";"-separated string per original row.
big_test %>%
  select(starts_with("inflation")) %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols = -rn, names_to = "column", values_to = "cell") %>%
  group_by(rn) %>%
  summarise(
    missing_vars = str_c(column[is.na(cell)], collapse = ";"),
    .groups = "drop"
  ) %>%
  select(-rn)
如果不想重塑数据,另一个方案是使用 rowwise 配合 c_across:
# Row-wise approach without reshaping: for each row, keep the names of the
# inflation columns whose value equals the literal string "NA" and join them
# with ";".
# NOTE(review): cur_data() is deprecated as of dplyr 1.1.0 in favour of
# pick(); confirm the installed dplyr version before modernising.
big_test %>%
  rowwise() %>%
  transmute(
    missing_vars = str_c(
      names(select(cur_data(), starts_with("inflation")))[
        which(c_across(starts_with("inflation")) == "NA")
      ],
      collapse = ";"
    )
  )
注意:这里是用 == 与字符串 "NA" 进行比较。如果数据中是真正的缺失值 NA,请改用 is.na(),而不是 ==。
我目前正在尝试逐行找出哪些变量(列)的值是 NA。下面展示了我的部分数据的示例,以及对应的 dput 输出。
head(big_test)
# A tibble: 3 x 19
id ctr_n ctr yr mn nvvi ENP_nat ENP_avg ENP_wght inflation1 inflation2 inflation3 inflation4 PSNS PSNS_s PSNS_w
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
1 1854 Isra~ 376 2019 4 1 3.50 3.50 3.50 NA NA NA NA NA 0.962 NA
2 1855 Isra~ 376 2019 9 1 2.51 2.51 2.51 NA NA NA NA NA 0.992 NA
3 1856 Isra~ 376 2020 3 1 3.78 3.78 3.78 NA NA NA NA NA 0.999 NA
# ... with 3 more variables: PSNS_sw <chr>, local_E <dbl>, cst_tot <dbl>
dput(big_test)
structure(list(id = c(1854, 1855, 1856), ctr_n = c("Israel",
"Israel", "Israel"), ctr = c(376, 376, 376), yr = c(2019, 2019,
2020), mn = c(4, 9, 3), nvvi = c(1, 1, 1), ENP_nat = c(3.50348063163162,
2.51319610127466, 3.78468892335972), ENP_avg = c(3.50348063163162,
2.51319610127466, 3.78468892335972), ENP_wght = c(3.50348063163162,
2.51319610127466, 3.78468892335972), inflation1 = c("NA", "NA",
"NA"), inflation2 = c("NA", "NA", "NA"), inflation3 = c("NA",
"NA", "NA"), inflation4 = c("NA", "NA", "NA"), PSNS = c("NA",
"NA", "NA"), PSNS_s = c(0.961748183147869, 0.992275075925835,
0.998547438416594), PSNS_w = c("NA", "NA", "NA"), PSNS_sw = c("NA",
"NA", "NA"), local_E = c(1, 1, 1), cst_tot = c(1, 1, 1)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame"))
编辑:这里的 NAs 用引号引起来,这是不合适的。我认为问题出在写入 .xlsx;下面的 dput
输出中列出了正确的版本,NA 上没有引号。
如您所见,数据在国家层面通过选举分开,这里的每一行都应该是唯一的(即以色列,2019 年,第 4 个月)。 我想创建一个字符列,列出此输出中缺少的变量。这是所需列的示例:
# Illustrative target: a one-row tibble whose single character column lists
# the missing variable names separated by semicolons.
desired_output <- tibble(
  missing_vars = paste(
    c("inflation1", "inflation2", "inflation3", "inflation4", "etc"),
    collapse = ";"
  )
)
head(desired_output)
# A tibble: 1 x 1
missing_vars
<chr>
1 inflation1;inflation2;inflation3;inflation4;etc
因此,我想知道是否有某种循环可以按每次选举(即每一行)切分数据,检查哪些列缺失,然后把这些缺失列的名称合并成一个字符串?这一步必须能够自动化,因为对于同一个国家/年份,可能有些变量缺失而其他变量存在。我曾尝试统计缺失值的个数,但不知道如何把这些列名列成一个字符列。
感谢您的帮助。谢谢!
如 @akrun 所述,您的数据中是字符串 "NA",而不是真正的缺失值 NA。修复之后,您可以定义一个名为 missing 的函数,并将其应用于每一行来创建一个新变量:
# Example data with real (unquoted) NA values. The repeated vectors are built
# once inside local() so that no helper variables leak into the workspace.
big_test <- local({
  enp <- c(3.50348063163162, 2.51319610127466, 3.78468892335972)
  all_na <- rep(NA, 3)
  ones <- rep(1, 3)
  structure(
    list(
      id = c(1854, 1855, 1856),
      ctr_n = rep("Israel", 3),
      ctr = rep(376, 3),
      yr = c(2019, 2019, 2020),
      mn = c(4, 9, 3),
      nvvi = ones,
      ENP_nat = enp,
      ENP_avg = enp,
      ENP_wght = enp,
      inflation1 = all_na,
      inflation2 = all_na,
      inflation3 = all_na,
      inflation4 = all_na,
      PSNS = all_na,
      PSNS_s = c(0.961748183147869, 0.992275075925835, 0.998547438416594),
      PSNS_w = all_na,
      PSNS_sw = all_na,
      local_E = ones,
      cst_tot = ones
    ),
    row.names = c(NA, -3L),
    class = c("tbl_df", "tbl", "data.frame")
  )
})
#' Collect the names of the missing (NA) entries of one row.
#'
#' Column names are taken from the row itself rather than from a global data
#' frame, so the function works with any data passed through `apply(df, 1, ...)`.
#'
#' @param x A named vector or list holding the values of a single row, such as
#'   the named vector that `apply(df, 1, ...)` passes for each row.
#' @return A length-one character string with the names of the NA entries
#'   joined by ", "; the empty string "" when nothing is missing.
# NOTE: this name shadows base::missing() for code evaluated in the same
# environment; it is kept so that existing calls keep working.
missing <- function(x) {
  row_values <- unlist(x)
  paste(names(row_values)[is.na(row_values)], collapse = ", ")
}
# Run missing() over every row (MARGIN = 1) and store the resulting strings
# in a new column, then display that column.
big_test[["missing"]] <- apply(big_test, MARGIN = 1, FUN = missing)
big_test[["missing"]]
#> [1] "inflation1, inflation2, inflation3, inflation4, PSNS, PSNS_w, PSNS_sw"
#> [2] "inflation1, inflation2, inflation3, inflation4, PSNS, PSNS_w, PSNS_sw"
#> [3] "inflation1, inflation2, inflation3, inflation4, PSNS, PSNS_w, PSNS_sw"
这是一个使用 tidyverse 的方案:先用 pivot_longer 把数据重塑为 "long"(长)格式,再按 row_number() 分组,然后把 value 列中为缺失值的那些行所对应的列名拼接(paste,此处用 str_c)成一个字符串。
library(dplyr)
library(tidyr)
library(stringr)
# Long-format approach: reshape the inflation columns to one row per
# (original row, column) pair, then collapse the names of the columns whose
# cell is NA back into a single ";"-separated string per original row.
big_test %>%
  select(starts_with("inflation")) %>%
  mutate(rn = row_number()) %>%
  pivot_longer(cols = -rn, names_to = "column", values_to = "cell") %>%
  group_by(rn) %>%
  summarise(
    missing_vars = str_c(column[is.na(cell)], collapse = ";"),
    .groups = "drop"
  ) %>%
  select(-rn)
如果不想重塑数据,另一个方案是使用 rowwise 配合 c_across:
# Row-wise approach without reshaping: for each row, keep the names of the
# inflation columns whose value equals the literal string "NA" and join them
# with ";".
# NOTE(review): cur_data() is deprecated as of dplyr 1.1.0 in favour of
# pick(); confirm the installed dplyr version before modernising.
big_test %>%
  rowwise() %>%
  transmute(
    missing_vars = str_c(
      names(select(cur_data(), starts_with("inflation")))[
        which(c_across(starts_with("inflation")) == "NA")
      ],
      collapse = ";"
    )
  )
注意:这里是用 == 与字符串 "NA" 进行比较。如果数据中是真正的缺失值 NA,请改用 is.na(),而不是 ==。