R中的特定组排名
Specific group rankings in R
我有数据框 "Category"、"ID"、"Score(t)",我想得到 "Rank(t)":
Category ID Score.08.2007 Score.09.2007 Rank.08.2007 Rank.09.2007 ...
Orange FSGBR070N3 0.16 ... 5 ...
Orange FSGBR070N3 0.05 ... 7 ...
Orange FSGBR070N3 0.11 6
Orange FS00008L4G 0.28 1
Orange FS00008VLD 0.27 2
Orange FS00008VLD 0.27 2
Orange FS00008VLD 0.27 2
Orange FS00009SQX -2.03 8
Orange FS00009SQX NA
Orange FSUSA0A1KW NA
Orange FSUSA0A1KW NA
Orange FSUSA0A1KX NA
Orange FSUSA0A1KY NA
Orange FS0000B389 NA
Banana FS000092GP 96.25 1
Banana FS000092GP 96.25 1
Banana FS000092GP 96.25 1
Banana FS000092GP 52.33 4
Banana FS0000ATLN 31.73 5
Banana FSUSA0AVMF 1.38 7
Banana FSGBR058O8 1.37 8
Banana FSGBR05845 2.24 6
排名是在每个 "Category" 内按 "Score" 降序确定的。我难以实现的附加要求是:当同一 ID 有多个相同的分数时,它们共享同一排名;而下一个不同的分数所得到的排名,等于前一个 ID 的排名加上共享该排名的分数个数(示例中的 Rank 输出列应该能说明这一点)。
NA 不应获得排名:
na.last = NA
我已经开始为排名创建一个矩阵,之后可能需要用到 sort(),但我很难针对时间序列和上述附加要求实现它……也没有找到如此具体的现有问题。感谢帮助!
# Asker's sketch of a starting point. Not runnable as written: the trailing
# `...` is a placeholder for the remaining period labels.
# Period labels; presumably the suffixes of the Score.<MM.YYYY> columns.
time_series <- c("08.2007","09.2007","10.2007",...)
# Zero-filled placeholder for the ranks, one column per period.
# NOTE(review): `ID` is not defined in this snippet; nrow() needs a data
# frame or matrix here (it returns NULL for a plain vector) -- confirm.
abs_ranks_mat <- as.data.frame(mat.or.vec(nrow(ID),length(time_series)))
您的数据:
# Sample data from the question: 14 "Orange" rows followed by 8 "Banana"
# rows, with one score column per period. NA marks a missing score.
df <- data.frame(
  Category = rep(c("Orange", "Banana"), times = c(14L, 8L)),
  ID = c(
    rep("FSGBR070N3", 3L), "FS00008L4G", rep("FS00008VLD", 3L),
    rep("FS00009SQX", 2L), rep("FSUSA0A1KW", 2L), "FSUSA0A1KX",
    "FSUSA0A1KY", "FS0000B389", rep("FS000092GP", 4L), "FS0000ATLN",
    "FSUSA0AVMF", "FSGBR058O8", "FSGBR05845"
  ),
  Score.08.2007 = c(
    0.16, 0.05, 0.11, 0.28, rep(0.27, 3L), -2.03, rep(NA, 6L),
    rep(96.25, 3L), 52.33, 31.73, 1.38, 1.37, 2.24
  ),
  Score.09.2007 = c(
    0.16, 0.05, 0.14, 0.22, 0.23, 0.27, 0.27, -2.03, NA, NA, 0.14, NA,
    0.56, NA, 96.25, 93.25, 96.25, 51.33, 31.73, 1.38, 1.37, 2.24
  ),
  stringsAsFactors = FALSE
)
循环分数并在每个类别中生成排名:
# For every score column, add a "rank<column>" column holding the rank of
# each score within its Category, highest score first.
#
# - Ties share the smallest rank of their run and the next distinct score
#   skips ahead by the size of the tie (1, 1, 1, 4, ...), which is the
#   numbering asked for in the question.
# - NA scores stay NA instead of receiving a rank.
# - ave() writes each group's ranks back to that group's own rows, so the
#   result is correct even when the rows of a Category are not contiguous.
for (i in grep("Score", names(df), value = TRUE)) {
  df[[paste0("rank", i)]] <- ave(
    df[[i]],
    df$Category,
    # Negate so that the largest score gets rank 1.
    FUN = function(x) rank(-x, na.last = "keep", ties.method = "min")
  )
}
df
一个使用 dplyr 的解决方案。`df` 是 @trosendal 回答中的示例数据,`df3` 是最终输出。
关键是使用 `min_rank` 函数创建排名。`mutate_at` 允许我们指定要(或不要)进行排名的列。之后,我们可以更改列名并与原始数据框合并。
library(dplyr)

# Give every row a stable key so the ranks can be joined back afterwards.
# row_number() is used instead of 1:n(), which misbehaves on zero rows.
df <- df %>% mutate(RowID = row_number())

# Rank the scores within each Category, highest first. min_rank() gives tied
# scores the same rank, skips ahead after a tie, and leaves NA scores as NA.
# Only the Score columns are ranked and kept, so unrelated columns are
# neither overwritten with ranks nor duplicated by the join below.
# A plain function replaces the deprecated funs() helper.
df2 <- df %>%
  group_by(Category) %>%
  mutate_at(vars(starts_with("Score")), function(x) min_rank(desc(x))) %>%
  ungroup() %>%
  select(RowID, starts_with("Score")) %>%
  setNames(., sub("^Score", "Rank", colnames(.)))

# Attach the Rank.* columns to the original rows and drop the helper key.
df3 <- df %>%
  left_join(df2, by = "RowID") %>%
  select(-RowID)
我有数据框 "Category"、"ID"、"Score(t)",我想得到 "Rank(t)":
Category ID Score.08.2007 Score.09.2007 Rank.08.2007 Rank.09.2007 ...
Orange FSGBR070N3 0.16 ... 5 ...
Orange FSGBR070N3 0.05 ... 7 ...
Orange FSGBR070N3 0.11 6
Orange FS00008L4G 0.28 1
Orange FS00008VLD 0.27 2
Orange FS00008VLD 0.27 2
Orange FS00008VLD 0.27 2
Orange FS00009SQX -2.03 8
Orange FS00009SQX NA
Orange FSUSA0A1KW NA
Orange FSUSA0A1KW NA
Orange FSUSA0A1KX NA
Orange FSUSA0A1KY NA
Orange FS0000B389 NA
Banana FS000092GP 96.25 1
Banana FS000092GP 96.25 1
Banana FS000092GP 96.25 1
Banana FS000092GP 52.33 4
Banana FS0000ATLN 31.73 5
Banana FSUSA0AVMF 1.38 7
Banana FSGBR058O8 1.37 8
Banana FSGBR05845 2.24 6
排名是在每个 "Category" 内按 "Score" 降序确定的。我难以实现的附加要求是:当同一 ID 有多个相同的分数时,它们共享同一排名;而下一个不同的分数所得到的排名,等于前一个 ID 的排名加上共享该排名的分数个数(示例中的 Rank 输出列应该能说明这一点)。
NA 不应获得排名:
na.last = NA
我已经开始为排名创建一个矩阵,之后可能需要用到 sort(),但我很难针对时间序列和上述附加要求实现它……也没有找到如此具体的现有问题。感谢帮助!
# Asker's sketch of a starting point. Not runnable as written: the trailing
# `...` is a placeholder for the remaining period labels.
# Period labels; presumably the suffixes of the Score.<MM.YYYY> columns.
time_series <- c("08.2007","09.2007","10.2007",...)
# Zero-filled placeholder for the ranks, one column per period.
# NOTE(review): `ID` is not defined in this snippet; nrow() needs a data
# frame or matrix here (it returns NULL for a plain vector) -- confirm.
abs_ranks_mat <- as.data.frame(mat.or.vec(nrow(ID),length(time_series)))
您的数据:
# Sample data from the question: 14 "Orange" rows followed by 8 "Banana"
# rows, with one score column per period. NA marks a missing score.
df <- data.frame(
  Category = rep(c("Orange", "Banana"), times = c(14L, 8L)),
  ID = c(
    rep("FSGBR070N3", 3L), "FS00008L4G", rep("FS00008VLD", 3L),
    rep("FS00009SQX", 2L), rep("FSUSA0A1KW", 2L), "FSUSA0A1KX",
    "FSUSA0A1KY", "FS0000B389", rep("FS000092GP", 4L), "FS0000ATLN",
    "FSUSA0AVMF", "FSGBR058O8", "FSGBR05845"
  ),
  Score.08.2007 = c(
    0.16, 0.05, 0.11, 0.28, rep(0.27, 3L), -2.03, rep(NA, 6L),
    rep(96.25, 3L), 52.33, 31.73, 1.38, 1.37, 2.24
  ),
  Score.09.2007 = c(
    0.16, 0.05, 0.14, 0.22, 0.23, 0.27, 0.27, -2.03, NA, NA, 0.14, NA,
    0.56, NA, 96.25, 93.25, 96.25, 51.33, 31.73, 1.38, 1.37, 2.24
  ),
  stringsAsFactors = FALSE
)
循环分数并在每个类别中生成排名:
# For every score column, add a "rank<column>" column holding the rank of
# each score within its Category, highest score first.
#
# - Ties share the smallest rank of their run and the next distinct score
#   skips ahead by the size of the tie (1, 1, 1, 4, ...), which is the
#   numbering asked for in the question.
# - NA scores stay NA instead of receiving a rank.
# - ave() writes each group's ranks back to that group's own rows, so the
#   result is correct even when the rows of a Category are not contiguous.
for (i in grep("Score", names(df), value = TRUE)) {
  df[[paste0("rank", i)]] <- ave(
    df[[i]],
    df$Category,
    # Negate so that the largest score gets rank 1.
    FUN = function(x) rank(-x, na.last = "keep", ties.method = "min")
  )
}
df
一个使用 dplyr 的解决方案。`df` 是 @trosendal 回答中的示例数据,`df3` 是最终输出。
关键是使用 `min_rank` 函数创建排名。`mutate_at` 允许我们指定要(或不要)进行排名的列。之后,我们可以更改列名并与原始数据框合并。
library(dplyr)

# Give every row a stable key so the ranks can be joined back afterwards.
# row_number() is used instead of 1:n(), which misbehaves on zero rows.
df <- df %>% mutate(RowID = row_number())

# Rank the scores within each Category, highest first. min_rank() gives tied
# scores the same rank, skips ahead after a tie, and leaves NA scores as NA.
# Only the Score columns are ranked and kept, so unrelated columns are
# neither overwritten with ranks nor duplicated by the join below.
# A plain function replaces the deprecated funs() helper.
df2 <- df %>%
  group_by(Category) %>%
  mutate_at(vars(starts_with("Score")), function(x) min_rank(desc(x))) %>%
  ungroup() %>%
  select(RowID, starts_with("Score")) %>%
  setNames(., sub("^Score", "Rank", colnames(.)))

# Attach the Rank.* columns to the original rows and drop the helper key.
df3 <- df %>%
  left_join(df2, by = "RowID") %>%
  select(-RowID)