如何通过识别单个列中的字符串从数据框中提取行?
How do I extract rows from a data frame by identifying a character string in a single column?
我有一个来自 TCGAbiolinks 的数据框,需要将其缩小到仅包含未标准化(unnormalized)的数据。我尝试编写一个 for 循环,用来返回 subset.gbmexp 中 tags 列包含“unnormalized”的行,但似乎无法把代码写对。大致如下:
# Asker's original attempt, kept verbatim; the comments explain why it does
# not return the wanted rows.
# A data frame is a list of columns, so `x` is bound to each COLUMN in turn,
# never to a row.
for (x in subset.gbmexp){
# grep() returns the integer positions of the matching elements (integer(0)
# when nothing matches), not a single TRUE/FALSE; `if` raises an error when
# it is given a zero-length condition.
if (grep("unnormalized", x)){
# Builds a data frame from the current column only; the result is neither
# assigned nor printed, so it is discarded.
data.frame(x)
}
}
这里是subset.gbmexp:
> dput(head(subset.gbmexp))
structure(list(file_state = c("submitted", "submitted", "submitted",
"submitted", "submitted", "submitted"), updated_datetime = c("2017-03-04T20:47:52.066809-06:00",
"2017-03-05T09:28:11.866514-06:00", "2017-03-05T18:29:39.863030-06:00",
"2017-03-05T09:28:11.866514-06:00", "2017-03-05T12:32:21.948139-06:00",
"2017-03-05T11:18:26.999142-06:00"), state = c("live", "live",
"live", "live", "live", "live"), data_category = c("Gene expression",
"Gene expression", "Gene expression", "Gene expression", "Gene expression",
"Gene expression"), version = c("1", "1", "1", "1", "1", "1"),
file_size = c(1513300L, 1518638L, 1518861L, 436273L, 436814L,
1500084L), data_release = c("0.0 - 25.0", "0.0 - 25.0", "0.0 - 25.0",
"0.0 - 25.0", "0.0 - 25.0", "0.0 - 25.0"), submitter_id = c(NA,
NA, NA, NA, NA, NA), access = c("open", "open", "open", "open",
"open", "open"), data_format = c("TXT", "TXT", "TXT", "TXT",
"TXT", "TXT"), id = c("c76037b3-200b-42ea-a935-7e27b94609be",
"81491634-6923-4e5e-adb3-0b05170dafaf", "8ac630d3-3b35-44f9-9076-ab2edfdb33c1",
"954c898a-b5ae-4fec-880b-890d4a9e037b", "5cd0729b-3a3a-4de0-8014-038966f5616b",
"99077392-c937-4814-9a39-02787ad04ed9"), data_type = c("Gene expression quantification",
"Gene expression quantification", "Gene expression quantification",
"Gene expression quantification", "Gene expression quantification",
"Gene expression quantification"), type = c("file", "file",
"file", "file", "file", "file"), cases = c("TCGA-06-0184-01A-01R-1849-01",
"TCGA-06-0649-01B-01R-1849-01", "TCGA-02-2485-01A-01R-1849-01",
"TCGA-28-1753-01A-01R-1850-01", "TCGA-06-0680-11A-32R-A36H-07",
"TCGA-26-5136-01B-01R-1850-01"), file_id = c("c76037b3-200b-42ea-a935-7e27b94609be",
"81491634-6923-4e5e-adb3-0b05170dafaf", "8ac630d3-3b35-44f9-9076-ab2edfdb33c1",
"954c898a-b5ae-4fec-880b-890d4a9e037b", "5cd0729b-3a3a-4de0-8014-038966f5616b",
"99077392-c937-4814-9a39-02787ad04ed9"), experimental_strategy = c("RNA-Seq",
"RNA-Seq", "RNA-Seq", "RNA-Seq", "RNA-Seq", "RNA-Seq"), md5sum = c("446a53abb4957c031d98c5c5d8b0d389",
"7856260846fba1d83842bf6c28856eaf", "8e8f5d50fa60195f3c5d1c4b6986e232",
"a1379ab262859850649051b2df076fec", "f3cd6c2c8616ac3d89e07d5281eddc49",
"5c8a99cb4bbbd83b3bb24f49b5f5cb23"), tags = list(c("unnormalized",
"gene", "v2"), c("unnormalized", "gene", "v2"), c("unnormalized",
"gene", "v2"), c("normalized", "gene", "v2"), c("gene", "normalized",
"v2"), c("unnormalized", "gene", "v2")), platform = c("Illumina HiSeq",
"Illumina HiSeq", "Illumina HiSeq", "Illumina HiSeq", "Illumina HiSeq",
"Illumina HiSeq"), state_comment = c(NA, NA, NA, NA, NA,
NA), file_name = c("unc.edu.7522ddf3-0d35-4085-9f94-1ca2e38aa804.1541218.rsem.genes.results",
"unc.edu.acb1160e-036a-4108-a9ce-d5f954191593.1538764.rsem.genes.results",
"unc.edu.102a0737-7d27-46b8-a433-4f1bb5300858.1545049.rsem.genes.results",
"unc.edu.23b23702-8e0f-4b4c-ad92-ce7ea44939e6.1544065.rsem.genes.normalized_results",
"unc.edu.94f66829-3cef-4af2-9f97-2352ac85efee.2403684.rsem.genes.normalized_results",
"unc.edu.39b5a7b5-e2ec-442d-94c4-ba938ee79b97.1542432.rsem.genes.results"
), project = c("TCGA-GBM", "TCGA-GBM", "TCGA-GBM", "TCGA-GBM",
"TCGA-GBM", "TCGA-GBM"), center_id = c("ee7a85b3-8177-5d60-a10c-51180eb9009c",
"ee7a85b3-8177-5d60-a10c-51180eb9009c", "ee7a85b3-8177-5d60-a10c-51180eb9009c",
"ee7a85b3-8177-5d60-a10c-51180eb9009c", "ee7a85b3-8177-5d60-a10c-51180eb9009c",
"ee7a85b3-8177-5d60-a10c-51180eb9009c"), center_center_type = c("CGCC",
"CGCC", "CGCC", "CGCC", "CGCC", "CGCC"), center_code = c("07",
"07", "07", "07", "07", "07"), center_name = c("University of North Carolina",
"University of North Carolina", "University of North Carolina",
"University of North Carolina", "University of North Carolina",
"University of North Carolina"), center_namespace = c("unc.edu",
"unc.edu", "unc.edu", "unc.edu", "unc.edu", "unc.edu"), center_short_name = c("UNC",
"UNC", "UNC", "UNC", "UNC", "UNC"), sample_type = c("Primary Tumor",
"Primary Tumor", "Primary Tumor", "Primary Tumor", "Solid Tissue Normal",
"Primary Tumor"), is_ffpe = c(FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE), cases.submitter_id = c("TCGA-06-0184", "TCGA-06-0649",
"TCGA-02-2485", "TCGA-28-1753", "TCGA-06-0680", "TCGA-26-5136"
), sample.submitter_id = c("TCGA-06-0184-01A", "TCGA-06-0649-01B",
"TCGA-02-2485-01A", "TCGA-28-1753-01A", "TCGA-06-0680-11A",
"TCGA-26-5136-01B")), row.names = c(NA, 6L), class = "data.frame")
但是,这只会返回每个选定行第一列中的值,而我想构建一个 data.frame,其中包含 tags 含有“unnormalized”的每一个完整的行。谁能解释一下我需要如何重写这段代码?谢谢!
假设您想要选择第 1、2、3 和 6 行,即 tags 列中包含字符串“unnormalized”的行,您可以这样做:
# Keep all columns of the rows whose `tags` entry mentions "unnormalized".
subset.gbmexp[grepl("unnormalized", subset.gbmexp[["tags"]]), ]
我有一个来自 TCGAbiolinks 的数据框,需要将其缩小到仅包含未标准化(unnormalized)的数据。我尝试编写一个 for 循环,用来返回 subset.gbmexp 中 tags 列包含“unnormalized”的行,但似乎无法把代码写对。大致如下:
# Asker's original attempt, kept verbatim; the comments explain why it does
# not return the wanted rows.
# A data frame is a list of columns, so `x` is bound to each COLUMN in turn,
# never to a row.
for (x in subset.gbmexp){
# grep() returns the integer positions of the matching elements (integer(0)
# when nothing matches), not a single TRUE/FALSE; `if` raises an error when
# it is given a zero-length condition.
if (grep("unnormalized", x)){
# Builds a data frame from the current column only; the result is neither
# assigned nor printed, so it is discarded.
data.frame(x)
}
}
这里是subset.gbmexp:
> dput(head(subset.gbmexp))
structure(list(file_state = c("submitted", "submitted", "submitted",
"submitted", "submitted", "submitted"), updated_datetime = c("2017-03-04T20:47:52.066809-06:00",
"2017-03-05T09:28:11.866514-06:00", "2017-03-05T18:29:39.863030-06:00",
"2017-03-05T09:28:11.866514-06:00", "2017-03-05T12:32:21.948139-06:00",
"2017-03-05T11:18:26.999142-06:00"), state = c("live", "live",
"live", "live", "live", "live"), data_category = c("Gene expression",
"Gene expression", "Gene expression", "Gene expression", "Gene expression",
"Gene expression"), version = c("1", "1", "1", "1", "1", "1"),
file_size = c(1513300L, 1518638L, 1518861L, 436273L, 436814L,
1500084L), data_release = c("0.0 - 25.0", "0.0 - 25.0", "0.0 - 25.0",
"0.0 - 25.0", "0.0 - 25.0", "0.0 - 25.0"), submitter_id = c(NA,
NA, NA, NA, NA, NA), access = c("open", "open", "open", "open",
"open", "open"), data_format = c("TXT", "TXT", "TXT", "TXT",
"TXT", "TXT"), id = c("c76037b3-200b-42ea-a935-7e27b94609be",
"81491634-6923-4e5e-adb3-0b05170dafaf", "8ac630d3-3b35-44f9-9076-ab2edfdb33c1",
"954c898a-b5ae-4fec-880b-890d4a9e037b", "5cd0729b-3a3a-4de0-8014-038966f5616b",
"99077392-c937-4814-9a39-02787ad04ed9"), data_type = c("Gene expression quantification",
"Gene expression quantification", "Gene expression quantification",
"Gene expression quantification", "Gene expression quantification",
"Gene expression quantification"), type = c("file", "file",
"file", "file", "file", "file"), cases = c("TCGA-06-0184-01A-01R-1849-01",
"TCGA-06-0649-01B-01R-1849-01", "TCGA-02-2485-01A-01R-1849-01",
"TCGA-28-1753-01A-01R-1850-01", "TCGA-06-0680-11A-32R-A36H-07",
"TCGA-26-5136-01B-01R-1850-01"), file_id = c("c76037b3-200b-42ea-a935-7e27b94609be",
"81491634-6923-4e5e-adb3-0b05170dafaf", "8ac630d3-3b35-44f9-9076-ab2edfdb33c1",
"954c898a-b5ae-4fec-880b-890d4a9e037b", "5cd0729b-3a3a-4de0-8014-038966f5616b",
"99077392-c937-4814-9a39-02787ad04ed9"), experimental_strategy = c("RNA-Seq",
"RNA-Seq", "RNA-Seq", "RNA-Seq", "RNA-Seq", "RNA-Seq"), md5sum = c("446a53abb4957c031d98c5c5d8b0d389",
"7856260846fba1d83842bf6c28856eaf", "8e8f5d50fa60195f3c5d1c4b6986e232",
"a1379ab262859850649051b2df076fec", "f3cd6c2c8616ac3d89e07d5281eddc49",
"5c8a99cb4bbbd83b3bb24f49b5f5cb23"), tags = list(c("unnormalized",
"gene", "v2"), c("unnormalized", "gene", "v2"), c("unnormalized",
"gene", "v2"), c("normalized", "gene", "v2"), c("gene", "normalized",
"v2"), c("unnormalized", "gene", "v2")), platform = c("Illumina HiSeq",
"Illumina HiSeq", "Illumina HiSeq", "Illumina HiSeq", "Illumina HiSeq",
"Illumina HiSeq"), state_comment = c(NA, NA, NA, NA, NA,
NA), file_name = c("unc.edu.7522ddf3-0d35-4085-9f94-1ca2e38aa804.1541218.rsem.genes.results",
"unc.edu.acb1160e-036a-4108-a9ce-d5f954191593.1538764.rsem.genes.results",
"unc.edu.102a0737-7d27-46b8-a433-4f1bb5300858.1545049.rsem.genes.results",
"unc.edu.23b23702-8e0f-4b4c-ad92-ce7ea44939e6.1544065.rsem.genes.normalized_results",
"unc.edu.94f66829-3cef-4af2-9f97-2352ac85efee.2403684.rsem.genes.normalized_results",
"unc.edu.39b5a7b5-e2ec-442d-94c4-ba938ee79b97.1542432.rsem.genes.results"
), project = c("TCGA-GBM", "TCGA-GBM", "TCGA-GBM", "TCGA-GBM",
"TCGA-GBM", "TCGA-GBM"), center_id = c("ee7a85b3-8177-5d60-a10c-51180eb9009c",
"ee7a85b3-8177-5d60-a10c-51180eb9009c", "ee7a85b3-8177-5d60-a10c-51180eb9009c",
"ee7a85b3-8177-5d60-a10c-51180eb9009c", "ee7a85b3-8177-5d60-a10c-51180eb9009c",
"ee7a85b3-8177-5d60-a10c-51180eb9009c"), center_center_type = c("CGCC",
"CGCC", "CGCC", "CGCC", "CGCC", "CGCC"), center_code = c("07",
"07", "07", "07", "07", "07"), center_name = c("University of North Carolina",
"University of North Carolina", "University of North Carolina",
"University of North Carolina", "University of North Carolina",
"University of North Carolina"), center_namespace = c("unc.edu",
"unc.edu", "unc.edu", "unc.edu", "unc.edu", "unc.edu"), center_short_name = c("UNC",
"UNC", "UNC", "UNC", "UNC", "UNC"), sample_type = c("Primary Tumor",
"Primary Tumor", "Primary Tumor", "Primary Tumor", "Solid Tissue Normal",
"Primary Tumor"), is_ffpe = c(FALSE, FALSE, FALSE, FALSE,
FALSE, FALSE), cases.submitter_id = c("TCGA-06-0184", "TCGA-06-0649",
"TCGA-02-2485", "TCGA-28-1753", "TCGA-06-0680", "TCGA-26-5136"
), sample.submitter_id = c("TCGA-06-0184-01A", "TCGA-06-0649-01B",
"TCGA-02-2485-01A", "TCGA-28-1753-01A", "TCGA-06-0680-11A",
"TCGA-26-5136-01B")), row.names = c(NA, 6L), class = "data.frame")
但是,这只会返回每个选定行第一列中的值,而我想构建一个 data.frame,其中包含 tags 含有“unnormalized”的每一个完整的行。谁能解释一下我需要如何重写这段代码?谢谢!
假设您想要选择第 1、2、3 和 6 行,即 tags 列中包含字符串“unnormalized”的行,您可以这样做:
# Keep all columns of the rows whose `tags` entry mentions "unnormalized".
subset.gbmexp[grepl("unnormalized", subset.gbmexp[["tags"]]), ]