如何通过识别单个列中的字符串从数据框中提取行?

How do I extract rows from a data frame by identifying a character string in a single column?

我有一个来自 TCGAbiolinks 的数据框,需要将其缩小到仅包含未标准化(unnormalized)的数据。我尝试编写一个 for 循环,用来返回 subset.gbmexp 中 tags 变量包含“unnormalized”的行,但似乎一直无法把代码写对。大致如下:

# Attempted approach from the question (does not work as intended).
# Looping over a data frame with `for` visits its columns one at a time, so
# `x` is an entire column here, not a row.
for (x in subset.gbmexp){
   # grep() returns the integer positions of the matches rather than a single
   # TRUE/FALSE, so it is not a valid `if` condition when there are zero or
   # several matches.
   if (grep("unnormalized", x)){
    # The data frame built here is neither assigned nor printed inside the
    # loop, so the result is discarded.
    data.frame(x)
  }
}

这里是 subset.gbmexp:

> dput(head(subset.gbmexp))
structure(list(file_state = c("submitted", "submitted", "submitted", 
"submitted", "submitted", "submitted"), updated_datetime = c("2017-03-04T20:47:52.066809-06:00", 
"2017-03-05T09:28:11.866514-06:00", "2017-03-05T18:29:39.863030-06:00", 
"2017-03-05T09:28:11.866514-06:00", "2017-03-05T12:32:21.948139-06:00", 
"2017-03-05T11:18:26.999142-06:00"), state = c("live", "live", 
"live", "live", "live", "live"), data_category = c("Gene expression", 
"Gene expression", "Gene expression", "Gene expression", "Gene expression", 
"Gene expression"), version = c("1", "1", "1", "1", "1", "1"), 
    file_size = c(1513300L, 1518638L, 1518861L, 436273L, 436814L, 
    1500084L), data_release = c("0.0 - 25.0", "0.0 - 25.0", "0.0 - 25.0", 
    "0.0 - 25.0", "0.0 - 25.0", "0.0 - 25.0"), submitter_id = c(NA, 
    NA, NA, NA, NA, NA), access = c("open", "open", "open", "open", 
    "open", "open"), data_format = c("TXT", "TXT", "TXT", "TXT", 
    "TXT", "TXT"), id = c("c76037b3-200b-42ea-a935-7e27b94609be", 
    "81491634-6923-4e5e-adb3-0b05170dafaf", "8ac630d3-3b35-44f9-9076-ab2edfdb33c1", 
    "954c898a-b5ae-4fec-880b-890d4a9e037b", "5cd0729b-3a3a-4de0-8014-038966f5616b", 
    "99077392-c937-4814-9a39-02787ad04ed9"), data_type = c("Gene expression quantification", 
    "Gene expression quantification", "Gene expression quantification", 
    "Gene expression quantification", "Gene expression quantification", 
    "Gene expression quantification"), type = c("file", "file", 
    "file", "file", "file", "file"), cases = c("TCGA-06-0184-01A-01R-1849-01", 
    "TCGA-06-0649-01B-01R-1849-01", "TCGA-02-2485-01A-01R-1849-01", 
    "TCGA-28-1753-01A-01R-1850-01", "TCGA-06-0680-11A-32R-A36H-07", 
    "TCGA-26-5136-01B-01R-1850-01"), file_id = c("c76037b3-200b-42ea-a935-7e27b94609be", 
    "81491634-6923-4e5e-adb3-0b05170dafaf", "8ac630d3-3b35-44f9-9076-ab2edfdb33c1", 
    "954c898a-b5ae-4fec-880b-890d4a9e037b", "5cd0729b-3a3a-4de0-8014-038966f5616b", 
    "99077392-c937-4814-9a39-02787ad04ed9"), experimental_strategy = c("RNA-Seq", 
    "RNA-Seq", "RNA-Seq", "RNA-Seq", "RNA-Seq", "RNA-Seq"), md5sum = c("446a53abb4957c031d98c5c5d8b0d389", 
    "7856260846fba1d83842bf6c28856eaf", "8e8f5d50fa60195f3c5d1c4b6986e232", 
    "a1379ab262859850649051b2df076fec", "f3cd6c2c8616ac3d89e07d5281eddc49", 
    "5c8a99cb4bbbd83b3bb24f49b5f5cb23"), tags = list(c("unnormalized", 
    "gene", "v2"), c("unnormalized", "gene", "v2"), c("unnormalized", 
    "gene", "v2"), c("normalized", "gene", "v2"), c("gene", "normalized", 
    "v2"), c("unnormalized", "gene", "v2")), platform = c("Illumina HiSeq", 
    "Illumina HiSeq", "Illumina HiSeq", "Illumina HiSeq", "Illumina HiSeq", 
    "Illumina HiSeq"), state_comment = c(NA, NA, NA, NA, NA, 
    NA), file_name = c("unc.edu.7522ddf3-0d35-4085-9f94-1ca2e38aa804.1541218.rsem.genes.results", 
    "unc.edu.acb1160e-036a-4108-a9ce-d5f954191593.1538764.rsem.genes.results", 
    "unc.edu.102a0737-7d27-46b8-a433-4f1bb5300858.1545049.rsem.genes.results", 
    "unc.edu.23b23702-8e0f-4b4c-ad92-ce7ea44939e6.1544065.rsem.genes.normalized_results", 
    "unc.edu.94f66829-3cef-4af2-9f97-2352ac85efee.2403684.rsem.genes.normalized_results", 
    "unc.edu.39b5a7b5-e2ec-442d-94c4-ba938ee79b97.1542432.rsem.genes.results"
    ), project = c("TCGA-GBM", "TCGA-GBM", "TCGA-GBM", "TCGA-GBM", 
    "TCGA-GBM", "TCGA-GBM"), center_id = c("ee7a85b3-8177-5d60-a10c-51180eb9009c", 
    "ee7a85b3-8177-5d60-a10c-51180eb9009c", "ee7a85b3-8177-5d60-a10c-51180eb9009c", 
    "ee7a85b3-8177-5d60-a10c-51180eb9009c", "ee7a85b3-8177-5d60-a10c-51180eb9009c", 
    "ee7a85b3-8177-5d60-a10c-51180eb9009c"), center_center_type = c("CGCC", 
    "CGCC", "CGCC", "CGCC", "CGCC", "CGCC"), center_code = c("07", 
    "07", "07", "07", "07", "07"), center_name = c("University of North Carolina", 
    "University of North Carolina", "University of North Carolina", 
    "University of North Carolina", "University of North Carolina", 
    "University of North Carolina"), center_namespace = c("unc.edu", 
    "unc.edu", "unc.edu", "unc.edu", "unc.edu", "unc.edu"), center_short_name = c("UNC", 
    "UNC", "UNC", "UNC", "UNC", "UNC"), sample_type = c("Primary Tumor", 
    "Primary Tumor", "Primary Tumor", "Primary Tumor", "Solid Tissue Normal", 
    "Primary Tumor"), is_ffpe = c(FALSE, FALSE, FALSE, FALSE, 
    FALSE, FALSE), cases.submitter_id = c("TCGA-06-0184", "TCGA-06-0649", 
    "TCGA-02-2485", "TCGA-28-1753", "TCGA-06-0680", "TCGA-26-5136"
    ), sample.submitter_id = c("TCGA-06-0184-01A", "TCGA-06-0649-01B", 
    "TCGA-02-2485-01A", "TCGA-28-1753-01A", "TCGA-06-0680-11A", 
    "TCGA-26-5136-01B")), row.names = c(NA, 6L), class = "data.frame")

但是,这只会返回每个选定行第一列中的值,而我想构建一个 data.frame,其中包含每个 tags 含有“unnormalized”的完整行。谁能解释一下我需要如何重写这段代码?谢谢!

假设您想要选取第 1、2、3 和 6 行(即 tags 列中包含字符串“unnormalized”的行),您可以这样做:

# `tags` is a list column: each row holds a character vector of tags.
# Test every row's tags directly instead of relying on grep() implicitly
# coercing the whole list to deparsed strings via as.character().
has_unnormalized <- vapply(
  subset.gbmexp$tags,
  function(row_tags) any(grepl("unnormalized", row_tags, fixed = TRUE)),
  logical(1)
)

# Logical row mask: keeps every column of the matching rows and yields a
# zero-row data frame when nothing matches.
subset.gbmexp[has_unnormalized, ]