根据公共列合并两个数据集
Merge two datasets based on a common column
我有一个问题。现在我有两个文件:sample attributes 和 gene count。我对样本属性文件做了过滤,它有一个名为 sampid 的列,gene count 也有一个名为 sampid 的列。我正在尝试通过共同的 sampid 合并这两个文件。这是我写的代码:
# Load the tab-delimited GTEx sample annotation table.
GTEx_Analysis_v8_Annotations_SampleAttributesDS <- read_delim(
  "/new_gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Keep only the sample ID and the SMTS / SMTSD / SMAFRZE annotation columns.
sample_attributes <- GTEx_Analysis_v8_Annotations_SampleAttributesDS %>%
  select(SAMPID, SMTS, SMTSD, SMAFRZE)

# Rows with SMTS == "Brain" and SMAFRZE == "RNASEQ", as a plain data.frame.
sample_attributes_braindata <- sample_attributes %>%
  filter(SMTS == "Brain" & SMAFRZE == "RNASEQ") %>%
  data.frame()

# Read the gene read-count table and convert it to a plain data.frame.
# NOTE: data.frame() defaults to check.names = TRUE, so column names that are
# not syntactic R names are rewritten (e.g. "-" becomes ".").
GTEx_Analysis_gene_reads <- data.frame(
  read_table2("/new_gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct")
)

# Drop the first two columns and transpose: the remaining column names become
# the row names of gene_count, and the Name column supplies its column names.
gene_count <- data.frame(t(GTEx_Analysis_gene_reads[, -(1:2)]))
names(gene_count) <- GTEx_Analysis_gene_reads$Name
这是我的 sample_attributes_braindata 的样子:
这就是我的 gene_count 数据的样子:
我尝试使用此命令将 gene_count 第一列重命名为 GTEX ID:
# Sets every column name of gene_count from the Name column; this does not
# create a sample-ID column.
gene_count <- setNames(gene_count, GTEx_Analysis_gene_reads$Name)
但这并没有发生。
我也试过这个命令用 sampid 重命名第一列:
# Relabels the first existing column of gene_count as "SAMPID"; the values in
# that column are left unchanged.
names(gene_count)[1L] <- "SAMPID"
我想做的是通过公共列 SAMPID(或 GTEX ID)合并这两个数据集:
# `by` expects column names as character strings; an unquoted SAMPID would be
# evaluated as a variable and fail with "object 'SAMPID' not found".
# For rows to match, gene_count must hold the sample IDs in a SAMPID column
# using the same format as sample_attributes_braindata.
genecount2 <- merge(sample_attributes_braindata, gene_count, by = "SAMPID")
dput(gene_count[1:5, 1:4])
structure(list(ENSG00000223972.5 = c(0, 0, 0, 0, 0), ENSG00000227232.5 = c(187,
109, 143, 251, 113), ENSG00000278267.1 = c(0, 0, 1, 0, 0), ENSG00000243485.5 = c(1,
0, 0, 1, 0)), row.names = c("GTEX.1117F.0226.SM.5GZZ7", "GTEX.1117F.0426.SM.5EGHI",
"GTEX.1117F.0526.SM.5EGHJ", "GTEX.1117F.0626.SM.5N9CS", "GTEX.1117F.0726.SM.5GIEN"
), class = "data.frame")
dput((sample_attributes_braindata[1:5, 1:4]))
structure(list(SAMPID = c("GTEX-1117F-3226-SM-5N9CT", "GTEX-111FC-3126-SM-5GZZ2",
"GTEX-111FC-3326-SM-5GZYV", "GTEX-1128S-2726-SM-5H12C", "GTEX-1128S-2826-SM-5N9DI"
), SMTS = c("Brain", "Brain", "Brain", "Brain", "Brain"), SMTSD = c("Brain - Cortex",
"Brain - Cortex", "Brain - Cerebellum", "Brain - Cortex", "Brain - Cerebellum"
), SMAFRZE = c("RNASEQ", "RNASEQ", "RNASEQ", "RNASEQ", "RNASEQ"
)), row.names = c(NA, 5L), class = "data.frame")
查看您的 gene_count 数据,它没有 SAMPID 列——样本 ID 是作为行名导入的。我们先把行名转换为真正的列,再将 "." 替换为 "-",使其与 braindata 的格式一致,然后就可以进行连接了。您的示例数据没有任何共同元素,因此我使用了 full_join,但您可能更倾向于左连接、右连接或内连接——我不太确定您的具体用例。
library(dplyr)
# rownames_to_column() is exported by tibble, not dplyr, so tibble has to be
# attached as well or the pipeline fails with "could not find function".
library(tibble)

# Move the sample IDs from the row names into a SAMPID column, replace "."
# with "-" so they use the same format as sample_attributes_braindata, then
# join the two tables on SAMPID.
gene_count %>%
  rownames_to_column(var = "SAMPID") %>%
  mutate(
    SAMPID = gsub(pattern = ".", replacement = "-", x = SAMPID, fixed = TRUE)
  ) %>%
  full_join(sample_attributes_braindata, by = "SAMPID")
我有一个问题。现在我有两个文件:sample attributes 和 gene count。我对样本属性文件做了过滤,它有一个名为 sampid 的列,gene count 也有一个名为 sampid 的列。我正在尝试通过共同的 sampid 合并这两个文件。这是我写的代码:
# Load the tab-delimited GTEx sample annotation table.
GTEx_Analysis_v8_Annotations_SampleAttributesDS <- read_delim(
  "/new_gtex/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt",
  delim = "\t",
  escape_double = FALSE,
  trim_ws = TRUE
)

# Keep only the sample ID and the SMTS / SMTSD / SMAFRZE annotation columns.
sample_attributes <- GTEx_Analysis_v8_Annotations_SampleAttributesDS %>%
  select(SAMPID, SMTS, SMTSD, SMAFRZE)

# Rows with SMTS == "Brain" and SMAFRZE == "RNASEQ", as a plain data.frame.
sample_attributes_braindata <- sample_attributes %>%
  filter(SMTS == "Brain" & SMAFRZE == "RNASEQ") %>%
  data.frame()

# Read the gene read-count table and convert it to a plain data.frame.
# NOTE: data.frame() defaults to check.names = TRUE, so column names that are
# not syntactic R names are rewritten (e.g. "-" becomes ".").
GTEx_Analysis_gene_reads <- data.frame(
  read_table2("/new_gtex/GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct")
)

# Drop the first two columns and transpose: the remaining column names become
# the row names of gene_count, and the Name column supplies its column names.
gene_count <- data.frame(t(GTEx_Analysis_gene_reads[, -(1:2)]))
names(gene_count) <- GTEx_Analysis_gene_reads$Name
这是我的 sample_attributes_braindata 的样子:
我尝试使用此命令将 gene_count 第一列重命名为 GTEX ID:
# Sets every column name of gene_count from the Name column; this does not
# create a sample-ID column.
gene_count <- setNames(gene_count, GTEx_Analysis_gene_reads$Name)
但这并没有发生。
我也试过这个命令用 sampid 重命名第一列:
# Relabels the first existing column of gene_count as "SAMPID"; the values in
# that column are left unchanged.
names(gene_count)[1L] <- "SAMPID"
# `by` expects column names as character strings; an unquoted SAMPID would be
# evaluated as a variable and fail with "object 'SAMPID' not found".
# For rows to match, gene_count must hold the sample IDs in a SAMPID column
# using the same format as sample_attributes_braindata.
genecount2 <- merge(sample_attributes_braindata, gene_count, by = "SAMPID")
dput(gene_count[1:5, 1:4])
structure(list(ENSG00000223972.5 = c(0, 0, 0, 0, 0), ENSG00000227232.5 = c(187,
109, 143, 251, 113), ENSG00000278267.1 = c(0, 0, 1, 0, 0), ENSG00000243485.5 = c(1,
0, 0, 1, 0)), row.names = c("GTEX.1117F.0226.SM.5GZZ7", "GTEX.1117F.0426.SM.5EGHI",
"GTEX.1117F.0526.SM.5EGHJ", "GTEX.1117F.0626.SM.5N9CS", "GTEX.1117F.0726.SM.5GIEN"
), class = "data.frame")
dput((sample_attributes_braindata[1:5, 1:4]))
structure(list(SAMPID = c("GTEX-1117F-3226-SM-5N9CT", "GTEX-111FC-3126-SM-5GZZ2",
"GTEX-111FC-3326-SM-5GZYV", "GTEX-1128S-2726-SM-5H12C", "GTEX-1128S-2826-SM-5N9DI"
), SMTS = c("Brain", "Brain", "Brain", "Brain", "Brain"), SMTSD = c("Brain - Cortex",
"Brain - Cortex", "Brain - Cerebellum", "Brain - Cortex", "Brain - Cerebellum"
), SMAFRZE = c("RNASEQ", "RNASEQ", "RNASEQ", "RNASEQ", "RNASEQ"
)), row.names = c(NA, 5L), class = "data.frame")
查看您的 gene_count 数据,它没有 SAMPID 列——样本 ID 是作为行名导入的。我们先把行名转换为真正的列,再将 "." 替换为 "-",使其与 braindata 的格式一致,然后就可以进行连接了。您的示例数据没有任何共同元素,因此我使用了 full_join,但您可能更倾向于左连接、右连接或内连接——我不太确定您的具体用例。
library(dplyr)
# rownames_to_column() is exported by tibble, not dplyr, so tibble has to be
# attached as well or the pipeline fails with "could not find function".
library(tibble)

# Move the sample IDs from the row names into a SAMPID column, replace "."
# with "-" so they use the same format as sample_attributes_braindata, then
# join the two tables on SAMPID.
gene_count %>%
  rownames_to_column(var = "SAMPID") %>%
  mutate(
    SAMPID = gsub(pattern = ".", replacement = "-", x = SAMPID, fixed = TRUE)
  ) %>%
  full_join(sample_attributes_braindata, by = "SAMPID")