在 R 中,寻找更好的方法从子组(组内组)中获取最大值
In R, looking for better way to get the max value out of subgroups (groups within groups)
对于包含 X、Y、Z 组的给定 DF,我想将包含最大值的行保留在多个列的每一列中(A_Column、C_Column、N_Column) 如果该行有相应的 PI 值。例如,对于 X 组,应为各自列中的 PI 值 C 和 A 保留最大值。下面是我的尝试 - 是否有更短/更优雅的方法来实现相同的结果?
我的首发DF:
> DF
Group H PI PC A_Column C_Column N_Column
1 X AA001 C C 2 -0.10 22
2 X A16 A C 3 -0.12 13
3 X A2015_01 C C 5 -0.80 51
4 X AA1 C A 2 -0.30 32
5 Y AAA-16-04 A A 5 -0.20 15
6 Y A01_2009 O A 8 -0.40 28
7 Z AA02 A A 17 -0.30 12
8 Z AAD003-2014 A N 3 -0.13 43
9 Z AD4 N N 5 -0.60 45
我最终得到的输出:
> DF_max
Group H PI PC A_Column C_Column N_Column
1 X AA001 C C 2 -0.10 22
2 X A16 A C 3 -0.12 13
5 Y AAA-16-04 A A 5 -0.20 15
7 Z AA02 A A 17 -0.30 12
9 Z AD4 N N 5 -0.60 45
我的代码:
library(dplyr)
library(tidyverse)
# toy example to get the maximum values out of every group
Group <- c("X","X","X","X","Y","Y","Z","Z","Z")
H <-c("AA001","A16","A2015_01","AA1","AAA-16-04","A01_2009","AA02","AAD003-2014","AD4")
PI <- c("C","A","C","C","A","O","A","A","N")
PC <- c("C","C","C","A","A","A","A","N","N")
A_Column <- c(2,3,5,2,5,8,17,3,5)
C_Column <- -c(.1,.12,.8,.3,.2,.4,.3,.13,.6)
N_Column <- c(22,13,51,32,15,28,12,43,45)
DF <- data.frame(Group, H, PI, PC, A_Column, C_Column, N_Column)
DF
# tidy data column Values-Labels before using dplyr
gather_DF <- gather(DF, key = Col_labels, value = Obs, -Group, -H, -PI, -PC)
gather_DF
# look for value label matches within each group
gather_DF$Col_labels_match <- gather_DF$Col_labels
map = setNames(c("A", "C", "N"), c("A_Column", "C_Column", "N_Column"))
gather_DF$Col_labels_match <- map[unlist(gather_DF$Col_labels_match)]
# get max values per group, where PI equals Col_labels
max_DF <- gather_DF %>% filter(Col_labels_match==PI) %>% group_by(Group, PI) %>% top_n(1, Obs)
max_ID <- unique(max_DF$H)
DF_max <- DF[which(DF$H %in% max_ID),] # pull max values out of DF to get original formatting
DF_max
更新:
下面的代码使用@Arun 的 data.table 方法 - 但是如果 PI 没有放在引号中(见下文;我使用 "PI" 而不是 PI 更正)并且输出没有正确的最大值对于组 gY(即将出现 G=6 而不是 G=17)。
df <- structure(list(Group = c("gX", "gX", "gY", "gY", "gY", "gZ",
"gW", "gW", "gV", "gV", "gV", "gT", "gR", "gR", "gR", "gR", "gR",
"gS", "gQ", "gL"), PI = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 1L), .Label = c("C",
"G", "O"), class = "factor"), H = 1:20, PC = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L,
2L, 2L, 1L), .Label = c("C", "G", "I", "O"), class = "factor"),
C = c(NA, NA, NA, NA, NA, NA, 3, 1, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 2), I = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1), G = c(16,
10, 6, 17, 12, 14, 13, 11, NA, NA, NA, 9, 5, 2, 15, 3, 1,
7, 8, 4), N = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), O = c(NA, NA, NA, NA, NA,
NA, NA, NA, 3, 2, 1, NA, 8, 4, 5, 7, 6, NA, NA, NA)), .Names = c("Group",
"PI", "H", "PC", "C", "I", "G", "N", "O"), row.names = c(NA,
-20L), class = "data.frame")
df
dt <- data.table(Group=df$Group, PI=df$PI, H=df$H, PC=df$PC,
C = df$C,
I = df$I,
G = df$G,
N = df$N,
O = df$O)
> Maxdt <- dt[PI %in% c("C", "I", "G", "N", "O"),
> + .SD[which.max(get(PI))],
> + by=.(Group, PI)]
Error in get(PI) : invalid first argument
Maxdt <- dt[PI %in% c("C", "I", "G", "N", "O"),
.SD[which.max(get("PI"))],
by=.(Group, PI)]
> Maxdt
Group PI H PC C I G N O
1: gX G 1 G NA NA 16 NA NA
2: gY G 3 G NA NA 6 NA NA
3: gZ G 6 G NA NA 14 NA NA
4: gW C 7 C 3 NA 13 NA NA
5: gV O 9 O NA NA NA NA 3
6: gT G 12 I NA NA 9 NA NA
7: gR O 13 O NA NA 5 NA 8
8: gS G 18 G NA NA 7 NA NA
9: gQ G 19 G NA NA 8 NA NA
10: gL C 20 C 2 1 4 NA NA
您在找这样的东西吗?
DF %>%
group_by(Group, PI) %>%
mutate(sel = (A_Column == max(A_Column)) + (C_Column == max(C_Column)) + (N_Column == max(N_Column))) %>%
filter(sel == max(sel)) %>%
select(-sel)
以上代码的结果为:
# Source: local data frame [6 x 7]
# Groups: Group, PI [6]
#
# Group H PI PC A_Column C_Column N_Column
# <fctr> <fctr> <fctr> <fctr> <dbl> <dbl> <dbl>
# 1 X A16 A C 3 -0.12 13
# 2 X A2015_01 C C 5 -0.80 51
# 3 Y AAA-16-04 A A 5 -0.20 15
# 4 Y A01_2009 O A 8 -0.40 28
# 5 Z AAD003-2014 A N 3 -0.13 43
# 6 Z AD4 N N 5 -0.60 45
不确定这是更短还是更优雅,但是 data.table
这样做的方法是:
# Convert to data.table, rename columns and convert PI from factor to character
library(data.table)
setDT(DF)
setnames(DF, c("A_Column", "C_Column", "N_Column"), c("A", "C", "N"))
DF[, PI := as.character(PI)]
# Melt A, C and N columns to allow filtering (variable == PI) when finding
# max value by group and PI
dt_melt <- melt(DF, id.vars = c("Group", "H", "PI", "PC"))
dt_max <- dt_melt[variable == PI, .(value = max(value)),
by = .(Group, PI, variable)]
# Use Group, PI and max values as index to pull original rows from melt, and
# unique to get rid of ties
dt_idx <- unique(dt_melt[dt_max[, .(Group, PI, variable, value)],
on = c("Group", "PI", "variable", "value")])
# Use Group, H, PI, and PC to pull original rows from DF
out <- DF[dt_idx[, .(Group, H, PI, PC)], on = c("Group", "H", "PI", "PC")]
setnames(out, c("A", "C", "N"), c("A_Column", "C_Column", "N_Column"))
# > out
# Group H PI PC A_Column C_Column N_Column
# 1: X A16 A C 3 -0.12 13
# 2: Y AAA-16-04 A A 5 -0.20 15
# 3: Z AA02 A A 17 -0.30 12
# 4: X AA001 C C 2 -0.10 22
# 5: Z AD4 N N 5 -0.60 45
这是使用 data.table 的另一种方法:
dt <- data.table(Group, H, PI, PC, A=A_Column, C=C_Column, N=N_Column)
dt[PI %in% c("A", "C", "N"),
.SD[which.max(get(PI))],
by=.(Group, PI)]
# Group PI H PC A C N
# 1: X C AA001 C 2 -0.10 22
# 2: X A A16 C 3 -0.12 13
# 3: Y A AAA-16-04 A 5 -0.20 15
# 4: Z A AA02 A 17 -0.30 12
# 5: Z N AD4 N 5 -0.60 45
跟踪正在发生的事情应该非常简单。在 PI
匹配 A,C,N
的任何行上,我们按 Group, PI
分组。.SD
包含 S 子集 [=每组 33=]Data..
get(PI)
returns返回PI
中存储的字符值对应的值和PI
指向的那一列的最大值对应的行每个 Group, PI
.
例如,对于第一个组合,Group=X, PI=C
。 get(PI)
== get("C")
which returns c(-0.1, -0.8, -0.3)
对于 which.max
returns 索引 1.
的那个组
对于包含 X、Y、Z 组的给定 DF,我想将包含最大值的行保留在多个列的每一列中(A_Column、C_Column、N_Column) 如果该行有相应的 PI 值。例如,对于 X 组,应为各自列中的 PI 值 C 和 A 保留最大值。下面是我的尝试 - 是否有更短/更优雅的方法来实现相同的结果?
我的首发DF:
> DF
Group H PI PC A_Column C_Column N_Column
1 X AA001 C C 2 -0.10 22
2 X A16 A C 3 -0.12 13
3 X A2015_01 C C 5 -0.80 51
4 X AA1 C A 2 -0.30 32
5 Y AAA-16-04 A A 5 -0.20 15
6 Y A01_2009 O A 8 -0.40 28
7 Z AA02 A A 17 -0.30 12
8 Z AAD003-2014 A N 3 -0.13 43
9 Z AD4 N N 5 -0.60 45
我最终得到的输出:
> DF_max
Group H PI PC A_Column C_Column N_Column
1 X AA001 C C 2 -0.10 22
2 X A16 A C 3 -0.12 13
5 Y AAA-16-04 A A 5 -0.20 15
7 Z AA02 A A 17 -0.30 12
9 Z AD4 N N 5 -0.60 45
我的代码:
library(dplyr)
library(tidyverse)
# toy example to get the maximum values out of every group
Group <- c("X","X","X","X","Y","Y","Z","Z","Z")
H <-c("AA001","A16","A2015_01","AA1","AAA-16-04","A01_2009","AA02","AAD003-2014","AD4")
PI <- c("C","A","C","C","A","O","A","A","N")
PC <- c("C","C","C","A","A","A","A","N","N")
A_Column <- c(2,3,5,2,5,8,17,3,5)
C_Column <- -c(.1,.12,.8,.3,.2,.4,.3,.13,.6)
N_Column <- c(22,13,51,32,15,28,12,43,45)
DF <- data.frame(Group, H, PI, PC, A_Column, C_Column, N_Column)
DF
# tidy data column Values-Labels before using dplyr
gather_DF <- gather(DF, key = Col_labels, value = Obs, -Group, -H, -PI, -PC)
gather_DF
# look for value label matches within each group
gather_DF$Col_labels_match <- gather_DF$Col_labels
map = setNames(c("A", "C", "N"), c("A_Column", "C_Column", "N_Column"))
gather_DF$Col_labels_match <- map[unlist(gather_DF$Col_labels_match)]
# get max values per group, where PI equals Col_labels
max_DF <- gather_DF %>% filter(Col_labels_match==PI) %>% group_by(Group, PI) %>% top_n(1, Obs)
max_ID <- unique(max_DF$H)
DF_max <- DF[which(DF$H %in% max_ID),] # pull max values out of DF to get original formatting
DF_max
更新: 下面的代码使用@Arun 的 data.table 方法 - 但是如果 PI 没有放在引号中(见下文;我使用 "PI" 而不是 PI 更正)并且输出没有正确的最大值对于组 gY(即将出现 G=6 而不是 G=17)。
df <- structure(list(Group = c("gX", "gX", "gY", "gY", "gY", "gZ",
"gW", "gW", "gV", "gV", "gV", "gT", "gR", "gR", "gR", "gR", "gR",
"gS", "gQ", "gL"), PI = structure(c(2L, 2L, 2L, 2L, 2L, 2L, 1L,
1L, 3L, 3L, 3L, 2L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 1L), .Label = c("C",
"G", "O"), class = "factor"), H = 1:20, PC = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 4L, 4L, 4L, 3L, 4L, 4L, 4L, 4L, 4L,
2L, 2L, 1L), .Label = c("C", "G", "I", "O"), class = "factor"),
C = c(NA, NA, NA, NA, NA, NA, 3, 1, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 2), I = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1), G = c(16,
10, 6, 17, 12, 14, 13, 11, NA, NA, NA, 9, 5, 2, 15, 3, 1,
7, 8, 4), N = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_, NA_real_,
NA_real_, NA_real_, NA_real_), O = c(NA, NA, NA, NA, NA,
NA, NA, NA, 3, 2, 1, NA, 8, 4, 5, 7, 6, NA, NA, NA)), .Names = c("Group",
"PI", "H", "PC", "C", "I", "G", "N", "O"), row.names = c(NA,
-20L), class = "data.frame")
df
dt <- data.table(Group=df$Group, PI=df$PI, H=df$H, PC=df$PC,
C = df$C,
I = df$I,
G = df$G,
N = df$N,
O = df$O)
> Maxdt <- dt[PI %in% c("C", "I", "G", "N", "O"),
> + .SD[which.max(get(PI))],
> + by=.(Group, PI)]
Error in get(PI) : invalid first argument
Maxdt <- dt[PI %in% c("C", "I", "G", "N", "O"),
.SD[which.max(get("PI"))],
by=.(Group, PI)]
> Maxdt
Group PI H PC C I G N O
1: gX G 1 G NA NA 16 NA NA
2: gY G 3 G NA NA 6 NA NA
3: gZ G 6 G NA NA 14 NA NA
4: gW C 7 C 3 NA 13 NA NA
5: gV O 9 O NA NA NA NA 3
6: gT G 12 I NA NA 9 NA NA
7: gR O 13 O NA NA 5 NA 8
8: gS G 18 G NA NA 7 NA NA
9: gQ G 19 G NA NA 8 NA NA
10: gL C 20 C 2 1 4 NA NA
您在找这样的东西吗?
DF %>%
group_by(Group, PI) %>%
mutate(sel = (A_Column == max(A_Column)) + (C_Column == max(C_Column)) + (N_Column == max(N_Column))) %>%
filter(sel == max(sel)) %>%
select(-sel)
以上代码的结果为:
# Source: local data frame [6 x 7]
# Groups: Group, PI [6]
#
# Group H PI PC A_Column C_Column N_Column
# <fctr> <fctr> <fctr> <fctr> <dbl> <dbl> <dbl>
# 1 X A16 A C 3 -0.12 13
# 2 X A2015_01 C C 5 -0.80 51
# 3 Y AAA-16-04 A A 5 -0.20 15
# 4 Y A01_2009 O A 8 -0.40 28
# 5 Z AAD003-2014 A N 3 -0.13 43
# 6 Z AD4 N N 5 -0.60 45
不确定这是更短还是更优雅,但是 data.table
这样做的方法是:
# Convert to data.table, rename columns and convert PI from factor to character
library(data.table)
setDT(DF)
setnames(DF, c("A_Column", "C_Column", "N_Column"), c("A", "C", "N"))
DF[, PI := as.character(PI)]
# Melt A, C and N columns to allow filtering (variable == PI) when finding
# max value by group and PI
dt_melt <- melt(DF, id.vars = c("Group", "H", "PI", "PC"))
dt_max <- dt_melt[variable == PI, .(value = max(value)),
by = .(Group, PI, variable)]
# Use Group, PI and max values as index to pull original rows from melt, and
# unique to get rid of ties
dt_idx <- unique(dt_melt[dt_max[, .(Group, PI, variable, value)],
on = c("Group", "PI", "variable", "value")])
# Use Group, H, PI, and PC to pull original rows from DF
out <- DF[dt_idx[, .(Group, H, PI, PC)], on = c("Group", "H", "PI", "PC")]
setnames(out, c("A", "C", "N"), c("A_Column", "C_Column", "N_Column"))
# > out
# Group H PI PC A_Column C_Column N_Column
# 1: X A16 A C 3 -0.12 13
# 2: Y AAA-16-04 A A 5 -0.20 15
# 3: Z AA02 A A 17 -0.30 12
# 4: X AA001 C C 2 -0.10 22
# 5: Z AD4 N N 5 -0.60 45
这是使用 data.table 的另一种方法:
dt <- data.table(Group, H, PI, PC, A=A_Column, C=C_Column, N=N_Column)
dt[PI %in% c("A", "C", "N"),
.SD[which.max(get(PI))],
by=.(Group, PI)]
# Group PI H PC A C N
# 1: X C AA001 C 2 -0.10 22
# 2: X A A16 C 3 -0.12 13
# 3: Y A AAA-16-04 A 5 -0.20 15
# 4: Z A AA02 A 17 -0.30 12
# 5: Z N AD4 N 5 -0.60 45
跟踪正在发生的事情应该非常简单。在 PI
匹配 A,C,N
的任何行上,我们按 Group, PI
分组。.SD
包含 S 子集 [=每组 33=]Data..
get(PI)
returns返回PI
中存储的字符值对应的值和PI
指向的那一列的最大值对应的行每个 Group, PI
.
例如,对于第一个组合,Group=X, PI=C
。 get(PI)
== get("C")
which returns c(-0.1, -0.8, -0.3)
对于 which.max
returns 索引 1.