如何对含有空白/缺失值的数据使用 unique 函数
How to use unique function with blank/missing values
当存在空白/缺失值时,如何依据第二列使下面数据框的行唯一(去除重复行)?
> head(interproscan)
V1 V14
1 sp0000001-mRNA-1
2 sp0000001-mRNA-1
3 sp0000001-mRNA-1
4 sp0000005-mRNA-1 GO:0003723
5 sp0000006-mRNA-1 GO:0016021
6 sp0000006-mRNA-1 GO:0016021
> head(unique(interproscan[ , 1:2] ))
V1 V14
1 sp0000001-mRNA-1
4 sp0000005-mRNA-1 GO:0003723
5 sp0000006-mRNA-1 GO:0016021
7 sp0000006-mRNA-2 GO:0016021
9 sp0000006-mRNA-3 GO:0016021
目标是:
V1 V14
1 sp0000001-mRNA-1
4 sp0000005-mRNA-1 GO:0003723
5 sp0000006-mRNA-1 GO:0016021
提前致谢
您需要修改 V1,使其按照您想要的方式分组。我使用 gsub 去掉末尾的 -number 后缀。
library(dplyr)

# Keep one row per (transcript base name, V14) combination.
# The base name is V1 with its trailing "-<number>" suffix removed, so that
# e.g. "sp0000006-mRNA-1", "-2" and "-3" fall into the same group.
ans <- df %>%
  # The backslash must be doubled inside an R string literal ("\d" alone is a
  # parse error); "+$" strips the whole numeric suffix, and only at the end.
  group_by(V1_base = gsub("-\\d+$", "", V1), V14) %>%
  arrange(V1) %>% # not needed for the toy data, but keeps the lowest V1 first
  slice(1) %>% # first row of each group
  ungroup() %>%
  select(-V1_base) # drop the helper grouping column by name, not by position
输出
# A tibble: 3 x 3
id V1 V14
<int> <chr> <chr>
1 1 sp0000001-mRNA-1
2 4 sp0000005-mRNA-1 GO:0003723
3 5 sp0000006-mRNA-1 GO:0016021
数据
# Toy input: eight rows with an integer id, a name column V1 and a column V14
# that is an empty string for the first three rows.
df <- data.frame(
  id = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L),
  V1 = c(
    "sp0000001-mRNA-1", "sp0000001-mRNA-1", "sp0000001-mRNA-1",
    "sp0000005-mRNA-1",
    "sp0000006-mRNA-1", "sp0000006-mRNA-1",
    "sp0000006-mRNA-2", "sp0000006-mRNA-3"
  ),
  V14 = c(
    "", "", "",
    "GO:0003723",
    "GO:0016021", "GO:0016021", "GO:0016021", "GO:0016021"
  ),
  stringsAsFactors = FALSE # keep V1 and V14 as character on any R version
)
id V1 V14
1 1 sp0000001-mRNA-1
2 2 sp0000001-mRNA-1
3 3 sp0000001-mRNA-1
4 4 sp0000005-mRNA-1 GO:0003723
5 5 sp0000006-mRNA-1 GO:0016021
6 6 sp0000006-mRNA-1 GO:0016021
7 7 sp0000006-mRNA-2 GO:0016021
8 9 sp0000006-mRNA-3 GO:0016021
对数据框(data.frame)或 data.table 试试这个:
# Coerce the input (e.g. a data.table returned by fread) to a plain data.frame.
interproscan <- data.frame(interproscan)
# unique() on a data.frame drops every row that duplicates an earlier row,
# comparing all columns; the result is auto-printed at top level.
unique(interproscan)
输出:
V1 V14
1 sp0000001-mRNA-1
4 sp0000005-mRNA-1 GO:0003723
5 sp0000006-mRNA-1 GO:0016021
示例数据:
# library() stops with an error if data.table is not installed; require()
# would only warn and return FALSE, deferring the failure to the fread() call.
library(data.table)

# Build the sample table from inline CSV text; the first line is the header
# and the first three data rows have an empty second field.
interproscan <- fread("V1, V14
sp0000001-mRNA-1,
sp0000001-mRNA-1,
sp0000001-mRNA-1,
sp0000005-mRNA-1, GO:0003723
sp0000006-mRNA-1, GO:0016021
sp0000006-mRNA-1, GO:0016021")
当存在空白/缺失值时,如何依据第二列使下面数据框的行唯一(去除重复行)?
> head(interproscan)
V1 V14
1 sp0000001-mRNA-1
2 sp0000001-mRNA-1
3 sp0000001-mRNA-1
4 sp0000005-mRNA-1 GO:0003723
5 sp0000006-mRNA-1 GO:0016021
6 sp0000006-mRNA-1 GO:0016021
> head(unique(interproscan[ , 1:2] ))
V1 V14
1 sp0000001-mRNA-1
4 sp0000005-mRNA-1 GO:0003723
5 sp0000006-mRNA-1 GO:0016021
7 sp0000006-mRNA-2 GO:0016021
9 sp0000006-mRNA-3 GO:0016021
目标是:
V1 V14
1 sp0000001-mRNA-1
4 sp0000005-mRNA-1 GO:0003723
5 sp0000006-mRNA-1 GO:0016021
提前致谢
您需要修改 V1,使其按照您想要的方式分组。我使用 gsub 去掉末尾的 -number 后缀。
library(dplyr)

# Keep one row per (transcript base name, V14) combination.
# The base name is V1 with its trailing "-<number>" suffix removed, so that
# e.g. "sp0000006-mRNA-1", "-2" and "-3" fall into the same group.
ans <- df %>%
  # The backslash must be doubled inside an R string literal ("\d" alone is a
  # parse error); "+$" strips the whole numeric suffix, and only at the end.
  group_by(V1_base = gsub("-\\d+$", "", V1), V14) %>%
  arrange(V1) %>% # not needed for the toy data, but keeps the lowest V1 first
  slice(1) %>% # first row of each group
  ungroup() %>%
  select(-V1_base) # drop the helper grouping column by name, not by position
输出
# A tibble: 3 x 3
id V1 V14
<int> <chr> <chr>
1 1 sp0000001-mRNA-1
2 4 sp0000005-mRNA-1 GO:0003723
3 5 sp0000006-mRNA-1 GO:0016021
数据
# Toy input: eight rows with an integer id, a name column V1 and a column V14
# that is an empty string for the first three rows.
df <- data.frame(
  id = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L),
  V1 = c(
    "sp0000001-mRNA-1", "sp0000001-mRNA-1", "sp0000001-mRNA-1",
    "sp0000005-mRNA-1",
    "sp0000006-mRNA-1", "sp0000006-mRNA-1",
    "sp0000006-mRNA-2", "sp0000006-mRNA-3"
  ),
  V14 = c(
    "", "", "",
    "GO:0003723",
    "GO:0016021", "GO:0016021", "GO:0016021", "GO:0016021"
  ),
  stringsAsFactors = FALSE # keep V1 and V14 as character on any R version
)
id V1 V14
1 1 sp0000001-mRNA-1
2 2 sp0000001-mRNA-1
3 3 sp0000001-mRNA-1
4 4 sp0000005-mRNA-1 GO:0003723
5 5 sp0000006-mRNA-1 GO:0016021
6 6 sp0000006-mRNA-1 GO:0016021
7 7 sp0000006-mRNA-2 GO:0016021
8 9 sp0000006-mRNA-3 GO:0016021
对数据框(data.frame)或 data.table 试试这个:
# Coerce the input (e.g. a data.table returned by fread) to a plain data.frame.
interproscan <- data.frame(interproscan)
# unique() on a data.frame drops every row that duplicates an earlier row,
# comparing all columns; the result is auto-printed at top level.
unique(interproscan)
输出:
V1 V14
1 sp0000001-mRNA-1
4 sp0000005-mRNA-1 GO:0003723
5 sp0000006-mRNA-1 GO:0016021
示例数据:
# library() stops with an error if data.table is not installed; require()
# would only warn and return FALSE, deferring the failure to the fread() call.
library(data.table)

# Build the sample table from inline CSV text; the first line is the header
# and the first three data rows have an empty second field.
interproscan <- fread("V1, V14
sp0000001-mRNA-1,
sp0000001-mRNA-1,
sp0000001-mRNA-1,
sp0000005-mRNA-1, GO:0003723
sp0000006-mRNA-1, GO:0016021
sp0000006-mRNA-1, GO:0016021")