通过将函数应用于数据框中的另一列来创建新列
New column by applying function to another column in data frame
这是我正在使用的数据的示例数据框。对于那些熟悉遗传数据格式的人来说,它基本上是一个修改过的 VCF 文件。如果不是,基本上每一行都包含基因组中可能存在变异的位置的信息。
samp <- structure(list(Chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr12", class = "factor"),
Pos = c(8613204L, 8613412L, 8614238L, 8614506L, 8614652L,
8614669L, 8614768L, 8614951L, 8614986L, 8615225L, 8615809L,
8616149L, 8616392L), Ref = structure(c(1L, 1L, 4L, 3L, 3L,
3L, 2L, 3L, 2L, 4L, 2L, 4L, 3L), .Label = c("A", "C", "G",
"T"), class = "factor"), Alt = structure(c(3L, 2L, 2L, 1L,
1L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 4L), .Label = c("A", "C",
"G", "T"), class = "factor"), Info = c("AC=3913;AF=0.78135;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8357;AFR_AF=0.5779;EUR_AF=0.7366;SAS_AF=0.8466;AA=G|||;CSQ=G|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4051;AF=0.808906;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8444;AFR_AF=0.6725;EUR_AF=0.7366;SAS_AF=0.8538;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4021;AF=0.802915;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8415;AFR_AF=0.6558;EUR_AF=0.7376;SAS_AF=0.8466;AA=T|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.7997",
"AC=3990;AF=0.796725;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8386;AFR_AF=0.6339;EUR_AF=0.7376;SAS_AF=0.8466;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4069;AF=0.8125;AN=5008;NS=2504;DP=17188;EAS_AF=0.9921;AMR_AF=0.8487;AFR_AF=0.6528;EUR_AF=0.7714;SAS_AF=0.8599;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4044;AF=0.807508;AN=5008;NS=2504;DP=-128;EAS_AF=0.9911;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7714;SAS_AF=0.8599;AA=G|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
NA, NA, "AC=3795;AF=0.757788;AN=5008;NS=2504;DP=-128;EAS_AF=0.9653;AMR_AF=0.7954;AFR_AF=0.5651;EUR_AF=0.7167;SAS_AF=0.82;AA=c|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
NA, "AC=4053;AF=0.809305;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4076;AF=0.813898;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6528;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4052;AF=0.809105;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6346;EUR_AF=0.7724;SAS_AF=0.8671;AA=T|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029"
), TG_rs = c("rs10770739", "rs10770740", "rs4883148", "rs4883149",
"rs4883150", "rs4883151", NA, NA, "rs7303948", NA, "rs4242889",
"rs4883154", "rs4242890")), row.names = c(NA, -13L), .Names = c("Chrom",
"Pos", "Ref", "Alt", "Info", "TG_rs"), class = "data.frame")
我想做的是从 "Info" 列中提取值。但是,此列中包含的信息在每一行中都不相同,并且并不总是以相同的顺序出现。因此,我想使用模式匹配来获取我感兴趣的值。
我写了一个小函数来提取信息列中包含的各种 "super populations"(例如 AMR、AFR、EUR、SAS、EAS)的 "allele frequencies" (AF)。
extractAF <- function(pop, vec) {
info <- unlist((strsplit(vec, ";", fixed=TRUE)))
AF <- as.numeric(unlist(strsplit((info[grep(pop, (unlist((strsplit(vec, ";", fixed=TRUE)))))]), "=", fixed=TRUE))[2])
return(AF)
}
此函数需要两个参数:'pop' 是一个字符串,用于指定要提取的超级种群;'vec' 旨在获取我的数据框的信息列。
传递单个向量时函数按预期工作:
extractAF("AFR_AF", samp[1,'Info'])
#[1] 0.5779
extractAF("AFR_AF", samp[5,'Info'])
#[1] 0.6528
但是,我希望对数据框的每一行都这样做,并创建一个包含数据的新列。当我使用 dplyr 的 mutate 函数时,我得到一个具有相同值的列:
library("dplyr")
mutate(samp, AFR_AF = extractAF("AFR_AF", Info))
我读到 post(我现在似乎找不到,否则我会引用它)说 mutate 是一次传递所有行,而不是我需要的逐行传递.
所以我根据这个post尝试了下面的其他几种方法:
apply(samp[,'Info'], 1, function(x) extractAF("AFR_AF", x))
apply(samp[ "Info"], 1, function(x) extractAF("AMR_AF", x)) 错误:
dim(X) 的长度必须为正数
samp[, extractAF("AMR_AF", Info), by = .I]
[.data.frame
(samp, extractAF("AMR_AF", Info), by = .I) 错误:
未使用的参数 (by = .I)
samp[, extractAF("AMR_AF", Info), by = 1:nrow(samp)]
Error in `[.data.frame`(samp, , extractAF("AMR_AF", Info), by = 1:nrow(samp)) :
unused argument (by = 1:nrow(samp))
#
更新
在下面的 INFO 列中包含 NA 和 AF=0 的其他示例数据集:
structure(list(CHROM = c("chr1", "chr1", "chr1", "chr1", "chr1",
"chr1"), POS = c(16090898L, 16091074L, 16091583L, 16092212L,
16093560L, 16093639L), ID = c("rs6429774", "rs6429776", NA,
"rs74528955", "rs904912", NA), REF = c("G", "A", "T", "C", "T", "C"),
ALT = c("A", "G", "A", "T", "A", "T"), QUAL = c(NA, NA, NA, NA, NA,
NA), FILTER = c(NA, NA, NA, NA, NA, NA), INFO =
c("AC=1606;AF=0.320687;AN=5008;NS=2504;DP=21565;EAS_AF=0.1419;AMR_AF=0.2983;AFR_AF=0.525;EUR_AF=0.3509;SAS_AF=0.2137;AA=G|||;CSQ=A|ENSG00000162458|ENST00000441801|Transcript|upstream_gene_variant|||||||96|1||||||;ERB=A||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", "AC=1690;AF=0.33746;AN=5008;NS=2504;DP=20247;EAS_AF=0.1498;AMR_AF=0.3012;AFR_AF=0.5681;EUR_AF=0.3549;SAS_AF=0.227;AA=G|||;CSQ=G|ENSG00000162458|ENST00000441801|Transcript|5_prime_UTR_variant|81|||||||1||||||;ERB=G||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", NA,
"AC=8;AF=0.00159744;AN=5008;NS=2504;DP=19197;EAS_AF=0.0079;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=C|||;CSQ=T|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;ERB=T||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", "AC=3282;AF=0.655351;AN=5008;NS=2504;DP=14721;EAS_AF=0.8343;AMR_AF=0.6916;AFR_AF=0.4259;EUR_AF=0.6531;SAS_AF=0.7577;AA=A|||;CSQ=A|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;FUNSEQ=0.1483",
"AC=5;AF=0.000998403;AN=5008;NS=2504;DP=14736;EAS_AF=0.003;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0.002;AA=C|||;CSQ=T|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;FUNSEQ=0.1483"
)), row.names = 14:19, class = "data.frame", .Names = c("CHROM",
"POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"))
您可能不需要这些公式,因为 sub
已矢量化。首先创建一个包含所有可能代码的变量,例如(AFR、AMR、EUR 等)。使用该向量创建搜索模式以遍历 Info
列和 return 包含所有匹配项的新数据框:
all_pop <- c("AMR_AF", "AFR_AF", "EUR_AF", "SAS_AF", "EAS_AF")
pat <- paste0(".*\b", all_pop, "=(\d+(\.\d+)?)\b.*")
out <- sapply(pat, sub, "\1", samp$Info)
newdf <- setNames(as.data.frame(out), all_pop)
# AMR_AF AFR_AF EUR_AF SAS_AF EAS_AF
# 1 0.8357 0.5779 0.7366 0.8466 0.9921
# 2 0.8444 0.6725 0.7366 0.8538 0.9921
# 3 0.8415 0.6558 0.7376 0.8466 0.9921
# 4 0.8386 0.6339 0.7376 0.8466 0.9921
# 5 0.8487 0.6528 0.7714 0.8599 0.9921
# 6 0.8458 0.6362 0.7714 0.8599 0.9911
# 7 <NA> <NA> <NA> <NA> <NA>
# 8 <NA> <NA> <NA> <NA> <NA>
# 9 0.7954 0.5651 0.7167 0.82 0.9653
# 10 <NA> <NA> <NA> <NA> <NA>
# 11 0.8458 0.6362 0.7724 0.8671 0.9921
# 12 0.8473 0.6528 0.7724 0.8671 0.9921
# 13 0.8473 0.6346 0.7724 0.8671 0.9921
这是我正在使用的数据的示例数据框。对于那些熟悉遗传数据格式的人来说,它基本上是一个修改过的 VCF 文件。如果不是,基本上每一行都包含基因组中可能存在变异的位置的信息。
samp <- structure(list(Chrom = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "chr12", class = "factor"),
Pos = c(8613204L, 8613412L, 8614238L, 8614506L, 8614652L,
8614669L, 8614768L, 8614951L, 8614986L, 8615225L, 8615809L,
8616149L, 8616392L), Ref = structure(c(1L, 1L, 4L, 3L, 3L,
3L, 2L, 3L, 2L, 4L, 2L, 4L, 3L), .Label = c("A", "C", "G",
"T"), class = "factor"), Alt = structure(c(3L, 2L, 2L, 1L,
1L, 1L, 3L, 1L, 1L, 3L, 4L, 2L, 4L), .Label = c("A", "C",
"G", "T"), class = "factor"), Info = c("AC=3913;AF=0.78135;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8357;AFR_AF=0.5779;EUR_AF=0.7366;SAS_AF=0.8466;AA=G|||;CSQ=G|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4051;AF=0.808906;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8444;AFR_AF=0.6725;EUR_AF=0.7366;SAS_AF=0.8538;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4021;AF=0.802915;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8415;AFR_AF=0.6558;EUR_AF=0.7376;SAS_AF=0.8466;AA=T|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.7997",
"AC=3990;AF=0.796725;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8386;AFR_AF=0.6339;EUR_AF=0.7376;SAS_AF=0.8466;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.1881",
"AC=4069;AF=0.8125;AN=5008;NS=2504;DP=17188;EAS_AF=0.9921;AMR_AF=0.8487;AFR_AF=0.6528;EUR_AF=0.7714;SAS_AF=0.8599;AA=A|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4044;AF=0.807508;AN=5008;NS=2504;DP=-128;EAS_AF=0.9911;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7714;SAS_AF=0.8599;AA=G|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
NA, NA, "AC=3795;AF=0.757788;AN=5008;NS=2504;DP=-128;EAS_AF=0.9653;AMR_AF=0.7954;AFR_AF=0.5651;EUR_AF=0.7167;SAS_AF=0.82;AA=c|||;CSQ=A|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
NA, "AC=4053;AF=0.809305;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8458;AFR_AF=0.6362;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4076;AF=0.813898;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6528;EUR_AF=0.7724;SAS_AF=0.8671;AA=C|||;CSQ=C|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029",
"AC=4052;AF=0.809105;AN=5008;NS=2504;DP=-128;EAS_AF=0.9921;AMR_AF=0.8473;AFR_AF=0.6346;EUR_AF=0.7724;SAS_AF=0.8671;AA=T|||;CSQ=T|ENSG00000205846|ENST00000382073|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000382073;FUNSEQ=0.0029"
), TG_rs = c("rs10770739", "rs10770740", "rs4883148", "rs4883149",
"rs4883150", "rs4883151", NA, NA, "rs7303948", NA, "rs4242889",
"rs4883154", "rs4242890")), row.names = c(NA, -13L), .Names = c("Chrom",
"Pos", "Ref", "Alt", "Info", "TG_rs"), class = "data.frame")
我想做的是从 "Info" 列中提取值。但是,此列中包含的信息在每一行中都不相同,并且并不总是以相同的顺序出现。因此,我想使用模式匹配来获取我感兴趣的值。
我写了一个小函数来提取信息列中包含的各种 "super populations"(例如 AMR、AFR、EUR、SAS、EAS)的 "allele frequencies" (AF)。
extractAF <- function(pop, vec) {
info <- unlist((strsplit(vec, ";", fixed=TRUE)))
AF <- as.numeric(unlist(strsplit((info[grep(pop, (unlist((strsplit(vec, ";", fixed=TRUE)))))]), "=", fixed=TRUE))[2])
return(AF)
}
此函数需要两个参数:'pop' 是一个字符串,用于指定要提取的超级种群;'vec' 旨在获取我的数据框的信息列。
传递单个向量时函数按预期工作:
extractAF("AFR_AF", samp[1,'Info'])
#[1] 0.5779
extractAF("AFR_AF", samp[5,'Info'])
#[1] 0.6528
但是,我希望对数据框的每一行都这样做,并创建一个包含数据的新列。当我使用 dplyr 的 mutate 函数时,我得到一个具有相同值的列:
library("dplyr")
mutate(samp, AFR_AF = extractAF("AFR_AF", Info))
我读到 post(我现在似乎找不到,否则我会引用它)说 mutate 是一次传递所有行,而不是我需要的逐行传递.
所以我根据这个post尝试了下面的其他几种方法:
apply(samp[,'Info'], 1, function(x) extractAF("AFR_AF", x))
apply(samp[ "Info"], 1, function(x) extractAF("AMR_AF", x)) 错误: dim(X) 的长度必须为正数
samp[, extractAF("AMR_AF", Info), by = .I]
[.data.frame
(samp, extractAF("AMR_AF", Info), by = .I) 错误:
未使用的参数 (by = .I)
samp[, extractAF("AMR_AF", Info), by = 1:nrow(samp)]
Error in `[.data.frame`(samp, , extractAF("AMR_AF", Info), by = 1:nrow(samp)) :
unused argument (by = 1:nrow(samp))
#
更新
在下面的 INFO 列中包含 NA 和 AF=0 的其他示例数据集:
structure(list(CHROM = c("chr1", "chr1", "chr1", "chr1", "chr1", "chr1"), POS = c(16090898L, 16091074L, 16091583L, 16092212L, 16093560L, 16093639L), ID = c("rs6429774", "rs6429776", NA, "rs74528955", "rs904912", NA), REF = c("G", "A", "T", "C", "T", "C"), ALT = c("A", "G", "A", "T", "A", "T"), QUAL = c(NA, NA, NA, NA, NA, NA), FILTER = c(NA, NA, NA, NA, NA, NA), INFO = c("AC=1606;AF=0.320687;AN=5008;NS=2504;DP=21565;EAS_AF=0.1419;AMR_AF=0.2983;AFR_AF=0.525;EUR_AF=0.3509;SAS_AF=0.2137;AA=G|||;CSQ=A|ENSG00000162458|ENST00000441801|Transcript|upstream_gene_variant|||||||96|1||||||;ERB=A||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", "AC=1690;AF=0.33746;AN=5008;NS=2504;DP=20247;EAS_AF=0.1498;AMR_AF=0.3012;AFR_AF=0.5681;EUR_AF=0.3549;SAS_AF=0.227;AA=G|||;CSQ=G|ENSG00000162458|ENST00000441801|Transcript|5_prime_UTR_variant|81|||||||1||||||;ERB=G||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", NA, "AC=8;AF=0.00159744;AN=5008;NS=2504;DP=19197;EAS_AF=0.0079;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=C|||;CSQ=T|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;ERB=T||proximal_1216|Regulatory_Feature|proximal_enhancer;FUNSEQ=0.3335", "AC=3282;AF=0.655351;AN=5008;NS=2504;DP=14721;EAS_AF=0.8343;AMR_AF=0.6916;AFR_AF=0.4259;EUR_AF=0.6531;SAS_AF=0.7577;AA=A|||;CSQ=A|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;FUNSEQ=0.1483", "AC=5;AF=0.000998403;AN=5008;NS=2504;DP=14736;EAS_AF=0.003;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0.002;AA=C|||;CSQ=T|ENSG00000162458|ENST00000441801|Transcript|intron_variant||||||||1||||||;GENCODE=ENST00000441801;FUNSEQ=0.1483" )), row.names = 14:19, class = "data.frame", .Names = c("CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"))
您可能不需要这些公式,因为 sub
已矢量化。首先创建一个包含所有可能代码的变量,例如(AFR、AMR、EUR 等)。使用该向量创建搜索模式以遍历 Info
列和 return 包含所有匹配项的新数据框:
all_pop <- c("AMR_AF", "AFR_AF", "EUR_AF", "SAS_AF", "EAS_AF")
pat <- paste0(".*\b", all_pop, "=(\d+(\.\d+)?)\b.*")
out <- sapply(pat, sub, "\1", samp$Info)
newdf <- setNames(as.data.frame(out), all_pop)
# AMR_AF AFR_AF EUR_AF SAS_AF EAS_AF
# 1 0.8357 0.5779 0.7366 0.8466 0.9921
# 2 0.8444 0.6725 0.7366 0.8538 0.9921
# 3 0.8415 0.6558 0.7376 0.8466 0.9921
# 4 0.8386 0.6339 0.7376 0.8466 0.9921
# 5 0.8487 0.6528 0.7714 0.8599 0.9921
# 6 0.8458 0.6362 0.7714 0.8599 0.9911
# 7 <NA> <NA> <NA> <NA> <NA>
# 8 <NA> <NA> <NA> <NA> <NA>
# 9 0.7954 0.5651 0.7167 0.82 0.9653
# 10 <NA> <NA> <NA> <NA> <NA>
# 11 0.8458 0.6362 0.7724 0.8671 0.9921
# 12 0.8473 0.6528 0.7724 0.8671 0.9921
# 13 0.8473 0.6346 0.7724 0.8671 0.9921