返回某行第二大值的列名
Return the column name of the second largest value of a row
# Example data: one row per ID; columns a-d hold the values to rank,
# with NA marking a missing observation.
df <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  a = c(0, 2, 0, 1, 0),
  b = c(0, 3, 2, NA, 0),
  c = c(0, 4, NA, NA, 1),
  d = c(2, 5, 4, NA, 1)
)
# Build a function that returns the position of the n-th largest element
# of a vector. order() sorts NA last by default, so when fewer than n
# values are non-missing the returned position refers to an NA element.
maxn <- function(n) {
  function(x) {
    ranked <- order(x, decreasing = TRUE)
    ranked[n]
  }
}
# For every row, look at columns 2 to 5 and store the name of the column
# that maxn(2) picks as the second largest.
df <- df %>%
  mutate(
    second_largest = apply(.[2:5], 1, function(row_vals) {
      col_pos <- maxn(2)(row_vals)
      names(row_vals)[col_pos]
    })
  )
我用上面的 R 代码得到了 a、b、c、d 中第二大值的列名。对于 ID=4,因为 b、c、d 都是缺失值,所以第二大值的列名应该是 NA。但是,代码返回的是 b。我应该如何排除缺失值?
我认为您可以使用以下解决方案。我测试了一些可能的数字配置并且有效:
library(dplyr)
library(purrr)
# For each row, find the name of the column holding the second largest
# value among all columns except the first (ID). Rows with fewer than two
# non-missing values get NA.
df %>%
  mutate(Name = pmap_chr(., ~ {
    # Drop the first element (ID); the remaining values keep their column
    # names because pmap passes the columns as named arguments.
    x <- c(...)[-1]
    if (sum(!is.na(x)) < 2) {
      # No second largest value exists; return a typed NA for pmap_chr.
      NA_character_
    } else {
      ind <- which(x == max(x, na.rm = TRUE))
      if (length(ind) > 1) {
        # The maximum is tied: the second tied column is the runner-up.
        names(x)[ind[2]]
      } else {
        # sort() drops NA, so the element before the last one is the
        # second largest observed value.
        sorted <- sort(x)
        names(x)[which(x == sorted[length(sorted) - 1])][1]
      }
    }
  }))
ID a b c d Name
1 1 0 0 0 2 a
2 2 2 3 4 5 c
3 3 0 2 NA 4 b
4 4 1 NA NA NA <NA>
5 5 0 0 1 1 d
我们可以把函数改成-
# Build a function that returns the position of the n-th largest
# non-missing element of a vector, or NA when fewer than n values are
# non-missing.
maxn <- function(n) {
  function(x) {
    # na.last = NA makes order() drop missing values instead of ranking
    # them last. Masking the result with !is.na(x) would be wrong: that
    # mask is aligned with x, not with the permutation order() returns.
    order(x, decreasing = TRUE, na.last = NA)[n]
  }
}
该代码将适用于您的方法 -
library(dplyr)
# For every row, look at columns 2 to 5 and report the name of the column
# that maxn(2) picks as the second largest.
df %>%
  mutate(
    second_largest = apply(.[2:5], 1, function(row_vals) {
      col_pos <- maxn(2)(row_vals)
      names(row_vals)[col_pos]
    })
  )
# ID a b c d second_largest
#1 1 0 0 0 2 a
#2 2 2 3 4 5 c
#3 3 0 2 NA 4 b
#4 4 1 NA NA NA <NA>
#5 5 0 0 1 1 d
另一种方法
# Example data: one row per ID; columns a-d hold the values to rank,
# with NA marking a missing observation.
df <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  a = c(0, 2, 0, 1, 0),
  b = c(0, 3, 2, NA, 0),
  c = c(0, 4, NA, NA, 1),
  d = c(2, 5, 4, NA, 1)
)
library(dplyr, warn.conflicts = FALSE)
# Row by row, name the column holding the second largest value among the
# columns other than the grouping column ID. Rows with fewer than two
# non-missing values get NA.
df %>%
  group_by(ID) %>%
  rowwise() %>%
  mutate(name = {
    x <- c_across(everything())
    if (sum(!is.na(x)) >= 2) {
      # order() ranks NA last, so with at least two observed values the
      # second position always refers to a real value.
      # NOTE: cur_data() is deprecated as of dplyr 1.1.0; on newer
      # versions use pick(everything()) instead.
      names(cur_data())[order(x, decreasing = TRUE)][2]
    } else {
      NA_character_
    }
  })
#> # A tibble: 5 x 6
#> # Rowwise: ID
#> ID a b c d name
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 1 0 0 0 2 a
#> 2 2 2 3 4 5 c
#> 3 3 0 2 NA 4 b
#> 4 4 1 NA NA NA <NA>
#> 5 5 0 0 1 1 d
如果您只需要对其中几列执行此操作:
# Columns to compare; defined once so that the selection and the name
# lookup below cannot drift apart.
target_cols <- c("a", "c")
# Row by row, name the column holding the second largest value among
# target_cols. Rows with fewer than two non-missing values get NA.
df %>%
  group_by(ID) %>%
  rowwise() %>%
  mutate(name = {
    x <- c_across(all_of(target_cols))
    if (sum(!is.na(x)) >= 2) {
      # order() ranks NA last, so with at least two observed values the
      # second position always refers to a real value.
      target_cols[order(x, decreasing = TRUE)][2]
    } else {
      NA_character_
    }
  })
# A tibble: 5 x 6
# Rowwise: ID
ID a b c d name
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 0 0 0 2 c
2 2 2 3 4 5 a
3 3 0 2 NA 4 NA
4 4 1 NA NA NA NA
5 5 0 0 1 1 a
# Example data: one row per ID; columns a-d hold the values to rank,
# with NA marking a missing observation.
df <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  a = c(0, 2, 0, 1, 0),
  b = c(0, 3, 2, NA, 0),
  c = c(0, 4, NA, NA, 1),
  d = c(2, 5, 4, NA, 1)
)
# Build a function that returns the position of the n-th largest element
# of a vector. order() sorts NA last by default, so when fewer than n
# values are non-missing the returned position refers to an NA element.
maxn <- function(n) {
  function(x) {
    ranked <- order(x, decreasing = TRUE)
    ranked[n]
  }
}
# For every row, look at columns 2 to 5 and store the name of the column
# that maxn(2) picks as the second largest.
df <- df %>%
  mutate(
    second_largest = apply(.[2:5], 1, function(row_vals) {
      col_pos <- maxn(2)(row_vals)
      names(row_vals)[col_pos]
    })
  )
我用上面的 R 代码得到了 a、b、c、d 中第二大值的列名。对于 ID=4,因为 b、c、d 都是缺失值,所以第二大值的列名应该是 NA。但是,代码返回的是 b。我应该如何排除缺失值?
我认为您可以使用以下解决方案。我测试了一些可能的数字配置并且有效:
library(dplyr)
library(purrr)
# For each row, find the name of the column holding the second largest
# value among all columns except the first (ID). Rows with fewer than two
# non-missing values get NA.
df %>%
  mutate(Name = pmap_chr(., ~ {
    # Drop the first element (ID); the remaining values keep their column
    # names because pmap passes the columns as named arguments.
    x <- c(...)[-1]
    if (sum(!is.na(x)) < 2) {
      # No second largest value exists; return a typed NA for pmap_chr.
      NA_character_
    } else {
      ind <- which(x == max(x, na.rm = TRUE))
      if (length(ind) > 1) {
        # The maximum is tied: the second tied column is the runner-up.
        names(x)[ind[2]]
      } else {
        # sort() drops NA, so the element before the last one is the
        # second largest observed value.
        sorted <- sort(x)
        names(x)[which(x == sorted[length(sorted) - 1])][1]
      }
    }
  }))
ID a b c d Name
1 1 0 0 0 2 a
2 2 2 3 4 5 c
3 3 0 2 NA 4 b
4 4 1 NA NA NA <NA>
5 5 0 0 1 1 d
我们可以把函数改成-
# Build a function that returns the position of the n-th largest
# non-missing element of a vector, or NA when fewer than n values are
# non-missing.
maxn <- function(n) {
  function(x) {
    # na.last = NA makes order() drop missing values instead of ranking
    # them last. Masking the result with !is.na(x) would be wrong: that
    # mask is aligned with x, not with the permutation order() returns.
    order(x, decreasing = TRUE, na.last = NA)[n]
  }
}
该代码将适用于您的方法 -
library(dplyr)
# For every row, look at columns 2 to 5 and report the name of the column
# that maxn(2) picks as the second largest.
df %>%
  mutate(
    second_largest = apply(.[2:5], 1, function(row_vals) {
      col_pos <- maxn(2)(row_vals)
      names(row_vals)[col_pos]
    })
  )
# ID a b c d second_largest
#1 1 0 0 0 2 a
#2 2 2 3 4 5 c
#3 3 0 2 NA 4 b
#4 4 1 NA NA NA <NA>
#5 5 0 0 1 1 d
另一种方法
# Example data: one row per ID; columns a-d hold the values to rank,
# with NA marking a missing observation.
df <- data.frame(
  ID = c(1, 2, 3, 4, 5),
  a = c(0, 2, 0, 1, 0),
  b = c(0, 3, 2, NA, 0),
  c = c(0, 4, NA, NA, 1),
  d = c(2, 5, 4, NA, 1)
)
library(dplyr, warn.conflicts = FALSE)
# Row by row, name the column holding the second largest value among the
# columns other than the grouping column ID. Rows with fewer than two
# non-missing values get NA.
df %>%
  group_by(ID) %>%
  rowwise() %>%
  mutate(name = {
    x <- c_across(everything())
    if (sum(!is.na(x)) >= 2) {
      # order() ranks NA last, so with at least two observed values the
      # second position always refers to a real value.
      # NOTE: cur_data() is deprecated as of dplyr 1.1.0; on newer
      # versions use pick(everything()) instead.
      names(cur_data())[order(x, decreasing = TRUE)][2]
    } else {
      NA_character_
    }
  })
#> # A tibble: 5 x 6
#> # Rowwise: ID
#> ID a b c d name
#> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 1 0 0 0 2 a
#> 2 2 2 3 4 5 c
#> 3 3 0 2 NA 4 b
#> 4 4 1 NA NA NA <NA>
#> 5 5 0 0 1 1 d
如果您只需要对其中几列执行此操作:
# Columns to compare; defined once so that the selection and the name
# lookup below cannot drift apart.
target_cols <- c("a", "c")
# Row by row, name the column holding the second largest value among
# target_cols. Rows with fewer than two non-missing values get NA.
df %>%
  group_by(ID) %>%
  rowwise() %>%
  mutate(name = {
    x <- c_across(all_of(target_cols))
    if (sum(!is.na(x)) >= 2) {
      # order() ranks NA last, so with at least two observed values the
      # second position always refers to a real value.
      target_cols[order(x, decreasing = TRUE)][2]
    } else {
      NA_character_
    }
  })
# A tibble: 5 x 6
# Rowwise: ID
ID a b c d name
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 1 0 0 0 2 c
2 2 2 3 4 5 a
3 3 0 2 NA 4 NA
4 4 1 NA NA NA NA
5 5 0 0 1 1 a