为什么函数没有把每个 cluster 分组中 score 列的最高值排在第一位?
why is the function not giving the highest value of score column for each group of cluster column the top rank?
我有一个 dataframe
dt 如下所示
kmeans sd1 sd2 score gene
B4GALNT1 1 1.138399 0.9302788 0.59238585 B4GALNT1
GATA2 1 1.31817 0.9869005 0.70160114 GATA2
KBTBD8 1 0.2799195 0.25295 2.56658313 KBTBD8
LYPD6 1 0.5885738 0.5277333 1.1797581 LYPD6
MSX1 1 0.2846179 0.5276349 1.31276755 MSX1
NAP1L2 1 0.5778767 0.5252137 1.29646305 NAP1L2
PLA2G4C 1 1.545634 0.3505845 1.02694161 PLA2G4C
SLC6A15 1 3.6862153 1.7656347 0.31940624 SLC6A15
SNORA9 1 49.5847239 23.059789 0.01679016 SNORA9
STX1A 1 4.753248 2.3649298 0.17053974 STX1A
TRNP1 1 54.1230886 19.7797807 0.01907904 TRNP1
AKAP6 2 2.7115279 0.1346139 1.12646609 AKAP6
C1QL3 2 3.1646016 0.3646613 0.78840387 C1QL3
CAMK2N1 2 48.4399203 3.628805 0.05655038 CAMK2N1
CDK5R1 2 3.3858407 0.2249831 0.6292364 CDK5R1
CLSTN2 2 1.0131585 0.162797 1.96050927 CLSTN2
CNTN1 2 3.7191809 0.253088 0.83650197 CNTN1
DGKG 2 0.4607949 0.2333855 1.70445926 DGKG
DPF1 2 1.6369965 0.1873143 1.07265653 DPF1
FAM131A 2 8.7092498 1.763698 0.11250896 FAM131A
我想生成下面的表格:在每个 kmeans 分组内,按 score 列对各行进行排名(score 越高,排名越靠前),并把组内排名保存为新的一列 rank。结果应该如下所示
期望的输出:
kmeans sd1 sd2 score gene rank
B4GALNT1 1 1.138399 0.9302788 0.59238585 B4GALNT1 7
GATA2 1 1.31817 0.9869005 0.70160114 GATA2 6
KBTBD8 1 0.2799195 0.25295 2.56658313 KBTBD8 1
LYPD6 1 0.5885738 0.5277333 1.1797581 LYPD6 4
MSX1 1 0.2846179 0.5276349 1.31276755 MSX1 2
NAP1L2 1 0.5778767 0.5252137 1.29646305 NAP1L2 3
PLA2G4C 1 1.545634 0.3505845 1.02694161 PLA2G4C 5
SLC6A15 1 3.6862153 1.7656347 0.31940624 SLC6A15 8
SNORA9 1 49.5847239 23.059789 0.01679016 SNORA9 11
STX1A 1 4.753248 2.3649298 0.17053974 STX1A 9
TRNP1 1 54.1230886 19.7797807 0.01907904 TRNP1 10
AKAP6 2 2.7115279 0.1346139 1.12646609 AKAP6 3
C1QL3 2 3.1646016 0.3646613 0.78840387 C1QL3 6
CAMK2N1 2 48.4399203 3.628805 0.05655038 CAMK2N1 9
CDK5R1 2 3.3858407 0.2249831 0.6292364 CDK5R1 7
CLSTN2 2 1.0131585 0.162797 1.96050927 CLSTN2 1
CNTN1 2 3.7191809 0.253088 0.83650197 CNTN1 5
DGKG 2 0.4607949 0.2333855 1.70445926 DGKG 2
DPF1 2 1.6369965 0.1873143 1.07265653 DPF1 4
FAM131A 2 8.7092498 1.763698 0.11250896 FAM131A 8
但这不是我在应用以下代码时得到的结果
# Attempted per-group ranking. tapply() returns one vector of ranks per kmeans
# group and unlist() concatenates them in group order. rank() ranks in
# ascending order, so the lowest score in each group receives rank 1 here.
dt$rank <- unlist(with(dt, tapply(score, kmeans, function(x) rank(x,ties.method= "first"))))
理想情况下,在每个 kmeans 分组中,score 值最高的行应该被赋予排名 1,但上述命令的结果并非如此。我哪里错了?
或者还有其他方法可以实现吗?
我们可以用 ave 代替 tapply 来做到这一点。ave 的优点是返回的结果与原始行顺序保持一致,可以直接赋值为新列。另外,rank() 默认按升序排名,因此这里对 -score 排名,才能让每组中 score 最高的行得到排名 1。
# Rank scores within each kmeans group, highest score first. Negating score
# makes the ascending rank() assign 1 to the largest value; ave() returns the
# per-group results aligned with the original row order.
dt$rank <- ave(
  -dt$score,
  dt$kmeans,
  FUN = function(v) rank(v, ties.method = "first")
)
dt$rank
#[1] 7 6 1 4 2 3 5 8 11 9 10 3 6 9 7 1 5 2 4 8
或使用dplyr
library(dplyr)
# Rank rows within each kmeans group, highest score first.
# row_number() breaks ties by order of appearance, which matches
# rank(ties.method = "first"); dense_rank() would give tied scores the same
# rank. ungroup() drops the grouping so later verbs are not applied per group.
dt %>%
  group_by(kmeans) %>%
  mutate(rank = row_number(desc(score))) %>%
  ungroup()
数据
# Reproducible example data: 20 rows with kmeans labels 1 and 2, two sd
# columns, a score column and the gene name. Row names equal the gene column.
dt <- data.frame(
  kmeans = rep(1:2, times = c(11L, 9L)),
  sd1 = c(
    1.138399, 1.31817, 0.2799195, 0.5885738, 0.2846179,
    0.5778767, 1.545634, 3.6862153, 49.5847239, 4.753248,
    54.1230886, 2.7115279, 3.1646016, 48.4399203, 3.3858407,
    1.0131585, 3.7191809, 0.4607949, 1.6369965, 8.7092498
  ),
  sd2 = c(
    0.9302788, 0.9869005, 0.25295, 0.5277333, 0.5276349,
    0.5252137, 0.3505845, 1.7656347, 23.059789, 2.3649298,
    19.7797807, 0.1346139, 0.3646613, 3.628805, 0.2249831,
    0.162797, 0.253088, 0.2333855, 0.1873143, 1.763698
  ),
  score = c(
    0.59238585, 0.70160114, 2.56658313, 1.1797581, 1.31276755,
    1.29646305, 1.02694161, 0.31940624, 0.01679016, 0.17053974,
    0.01907904, 1.12646609, 0.78840387, 0.05655038, 0.6292364,
    1.96050927, 0.83650197, 1.70445926, 1.07265653, 0.11250896
  ),
  gene = c(
    "B4GALNT1", "GATA2", "KBTBD8", "LYPD6", "MSX1",
    "NAP1L2", "PLA2G4C", "SLC6A15", "SNORA9", "STX1A",
    "TRNP1", "AKAP6", "C1QL3", "CAMK2N1", "CDK5R1",
    "CLSTN2", "CNTN1", "DGKG", "DPF1", "FAM131A"
  ),
  stringsAsFactors = FALSE
)
rownames(dt) <- dt$gene
我有一个 dataframe
dt 如下所示
kmeans sd1 sd2 score gene
B4GALNT1 1 1.138399 0.9302788 0.59238585 B4GALNT1
GATA2 1 1.31817 0.9869005 0.70160114 GATA2
KBTBD8 1 0.2799195 0.25295 2.56658313 KBTBD8
LYPD6 1 0.5885738 0.5277333 1.1797581 LYPD6
MSX1 1 0.2846179 0.5276349 1.31276755 MSX1
NAP1L2 1 0.5778767 0.5252137 1.29646305 NAP1L2
PLA2G4C 1 1.545634 0.3505845 1.02694161 PLA2G4C
SLC6A15 1 3.6862153 1.7656347 0.31940624 SLC6A15
SNORA9 1 49.5847239 23.059789 0.01679016 SNORA9
STX1A 1 4.753248 2.3649298 0.17053974 STX1A
TRNP1 1 54.1230886 19.7797807 0.01907904 TRNP1
AKAP6 2 2.7115279 0.1346139 1.12646609 AKAP6
C1QL3 2 3.1646016 0.3646613 0.78840387 C1QL3
CAMK2N1 2 48.4399203 3.628805 0.05655038 CAMK2N1
CDK5R1 2 3.3858407 0.2249831 0.6292364 CDK5R1
CLSTN2 2 1.0131585 0.162797 1.96050927 CLSTN2
CNTN1 2 3.7191809 0.253088 0.83650197 CNTN1
DGKG 2 0.4607949 0.2333855 1.70445926 DGKG
DPF1 2 1.6369965 0.1873143 1.07265653 DPF1
FAM131A 2 8.7092498 1.763698 0.11250896 FAM131A
我想生成下面的表格:在每个 kmeans 分组内,按 score 列对各行进行排名(score 越高,排名越靠前),并把组内排名保存为新的一列 rank。结果应该如下所示
期望的输出:
kmeans sd1 sd2 score gene rank
B4GALNT1 1 1.138399 0.9302788 0.59238585 B4GALNT1 7
GATA2 1 1.31817 0.9869005 0.70160114 GATA2 6
KBTBD8 1 0.2799195 0.25295 2.56658313 KBTBD8 1
LYPD6 1 0.5885738 0.5277333 1.1797581 LYPD6 4
MSX1 1 0.2846179 0.5276349 1.31276755 MSX1 2
NAP1L2 1 0.5778767 0.5252137 1.29646305 NAP1L2 3
PLA2G4C 1 1.545634 0.3505845 1.02694161 PLA2G4C 5
SLC6A15 1 3.6862153 1.7656347 0.31940624 SLC6A15 8
SNORA9 1 49.5847239 23.059789 0.01679016 SNORA9 11
STX1A 1 4.753248 2.3649298 0.17053974 STX1A 9
TRNP1 1 54.1230886 19.7797807 0.01907904 TRNP1 10
AKAP6 2 2.7115279 0.1346139 1.12646609 AKAP6 3
C1QL3 2 3.1646016 0.3646613 0.78840387 C1QL3 6
CAMK2N1 2 48.4399203 3.628805 0.05655038 CAMK2N1 9
CDK5R1 2 3.3858407 0.2249831 0.6292364 CDK5R1 7
CLSTN2 2 1.0131585 0.162797 1.96050927 CLSTN2 1
CNTN1 2 3.7191809 0.253088 0.83650197 CNTN1 5
DGKG 2 0.4607949 0.2333855 1.70445926 DGKG 2
DPF1 2 1.6369965 0.1873143 1.07265653 DPF1 4
FAM131A 2 8.7092498 1.763698 0.11250896 FAM131A 8
但这不是我在应用以下代码时得到的结果
# Attempted per-group ranking. tapply() returns one vector of ranks per kmeans
# group and unlist() concatenates them in group order. rank() ranks in
# ascending order, so the lowest score in each group receives rank 1 here.
dt$rank <- unlist(with(dt, tapply(score, kmeans, function(x) rank(x,ties.method= "first"))))
理想情况下,在每个 kmeans 分组中,score 值最高的行应该被赋予排名 1,但上述命令的结果并非如此。我哪里错了?
或者还有其他方法可以实现吗?
我们可以用 ave 代替 tapply 来做到这一点。ave 的优点是返回的结果与原始行顺序保持一致,可以直接赋值为新列。另外,rank() 默认按升序排名,因此这里对 -score 排名,才能让每组中 score 最高的行得到排名 1。
# Rank scores within each kmeans group, highest score first. Negating score
# makes the ascending rank() assign 1 to the largest value; ave() returns the
# per-group results aligned with the original row order.
dt$rank <- ave(
  -dt$score,
  dt$kmeans,
  FUN = function(v) rank(v, ties.method = "first")
)
dt$rank
#[1] 7 6 1 4 2 3 5 8 11 9 10 3 6 9 7 1 5 2 4 8
或使用dplyr
library(dplyr)
# Rank rows within each kmeans group, highest score first.
# row_number() breaks ties by order of appearance, which matches
# rank(ties.method = "first"); dense_rank() would give tied scores the same
# rank. ungroup() drops the grouping so later verbs are not applied per group.
dt %>%
  group_by(kmeans) %>%
  mutate(rank = row_number(desc(score))) %>%
  ungroup()
数据
# Reproducible example data: 20 rows with kmeans labels 1 and 2, two sd
# columns, a score column and the gene name. Row names equal the gene column.
dt <- data.frame(
  kmeans = rep(1:2, times = c(11L, 9L)),
  sd1 = c(
    1.138399, 1.31817, 0.2799195, 0.5885738, 0.2846179,
    0.5778767, 1.545634, 3.6862153, 49.5847239, 4.753248,
    54.1230886, 2.7115279, 3.1646016, 48.4399203, 3.3858407,
    1.0131585, 3.7191809, 0.4607949, 1.6369965, 8.7092498
  ),
  sd2 = c(
    0.9302788, 0.9869005, 0.25295, 0.5277333, 0.5276349,
    0.5252137, 0.3505845, 1.7656347, 23.059789, 2.3649298,
    19.7797807, 0.1346139, 0.3646613, 3.628805, 0.2249831,
    0.162797, 0.253088, 0.2333855, 0.1873143, 1.763698
  ),
  score = c(
    0.59238585, 0.70160114, 2.56658313, 1.1797581, 1.31276755,
    1.29646305, 1.02694161, 0.31940624, 0.01679016, 0.17053974,
    0.01907904, 1.12646609, 0.78840387, 0.05655038, 0.6292364,
    1.96050927, 0.83650197, 1.70445926, 1.07265653, 0.11250896
  ),
  gene = c(
    "B4GALNT1", "GATA2", "KBTBD8", "LYPD6", "MSX1",
    "NAP1L2", "PLA2G4C", "SLC6A15", "SNORA9", "STX1A",
    "TRNP1", "AKAP6", "C1QL3", "CAMK2N1", "CDK5R1",
    "CLSTN2", "CNTN1", "DGKG", "DPF1", "FAM131A"
  ),
  stringsAsFactors = FALSE
)
rownames(dt) <- dt$gene