R - 提取与字符关联的数字
R - Extracting a number associated with a character
我正在尝试从化学式中提取碳、氢和氧的数量。我之前找到了一些代码并一直在尝试使用。问题是,这段代码只有在化学式中该元素的原子数大于 1 时才有效。
V <- DATA # example: CH4O, H2O, C10H18O2
# V is a data.frame
# NOTE: sub() coerces its input with as.character(), so pass the formula
# column (e.g. V[, 3]) rather than the whole data.frame.
#
# Each pattern captures the digits that directly follow the element symbol
# (an optional ":" and whitespace may sit in between). When the symbol has no
# explicit count, or is absent, nothing matches: sub() returns the input
# unchanged and as.integer() turns it into NA.
# Backslashes must be doubled inside R string literals ("\\s", "\\d", "\\1").
C1 <- as.integer(sub("(?i).*?C:?\\s*(\\d+).*", "\\1", V))
# NA NA 10
H1 <- as.integer(sub("(?i).*?H:?\\s*(\\d+).*", "\\1", V))
# 4 2 18
O1 <- as.integer(sub("(?i).*?O:?\\s*(\\d+).*", "\\1", V))
# NA NA 2
我目前正在使用
# Replace missing counts with 1 (`is.na(C1) <- 1` would instead set C1[1] to NA).
C1[is.na(C1)] <- 1L
将 NA 更改为 1,然后手动更改 0 值。是否有更高效的代码可以得到化学式中各元素的正确计数(特别是在计数为 0 或 1 而得到 NA 的情况下)?如果您需要更多信息,或者我应该调整某些格式,请告诉我。
编辑:
所需的值是在没有 NA 的情况下获得所有正确的计数,并在可能的情况下手动将值更改为 0。
C1
# 1 0 10
H1
# 4 2 18
O1
# 1 1 2
编辑2:
这是我正在导入的实际数据的示例
Meas. m/z # Ion Formula Score m/z err [mDa] err [ppm] mSigma rdb e¯ Conf Adduct
84.080700 1 C5H10N n.a. 84.080776 0.1 0.9 n.a. 2.0 even
89.060100 1 C4H9O2 n.a. 89.059706 -0.4 -4.4 n.a. 1.0 even
131.987800 1 C2H4N3P2 n.a. 131.987498 -0.3 -2.3 n.a. 6.0 even
135.081100 1 C9H11O n.a. 135.080441 -0.7 -4.9 n.a. 5.0 even
135.117500 1 C10H15 n.a. 135.116827 -0.7 -5.0 n.a. 4.0 even
136.061700 1 C5H6N5 n.a. 136.061772 0.1 0.5 n.a. 6.0 even
在最初的问题中,我只是将 V
列为由化学式(formulas)组成的向量,但我实际拥有的是一个还包含其他信息的 data.frame,我在执行时使用 V[,3]
对感兴趣的列进行计算。
也许这不是我写过的最优雅的代码,但对于给定的化学式,它会返回一个带名称的向量,其中包含每种元素的计数。中间步骤 counts.per.equation
返回每个化学式中每种元素的个数。
library(stringr)
# Split every formula into one token per element: the symbol followed by its
# optional count, e.g. "C10H18O2" -> "C10" "H18" "O2". Matching on
# "[A-Z][a-z]*" keeps two-letter symbols such as "Cl" together.
extract <- str_extract_all(c("CH4O", "H2O", "C10H18O2"), "[A-Z][a-z]*\\d*")

# For each formula, build a numeric vector of counts named by element symbol.
counts.per.equation <- lapply(extract, function(x) {
  elements <- str_extract(x, "\\D+")
  counts <- as.numeric(str_extract(x, "\\d+"))
  # str_extract() yields NA for a token without digits, i.e. a single atom.
  counts[is.na(counts)] <- 1
  names(counts) <- elements
  counts
})

# Add the per-formula vectors together element by element. Indexing with a
# name that is absent from a vector gives NA, which na.rm = TRUE discards.
total.counts <- Reduce(function(x, y) {
  all.names <- union(names(x), names(y))
  vapply(all.names, function(z) sum(x[z], y[z], na.rm = TRUE), numeric(1))
}, counts.per.equation)
C H O
11 24 4
这是一个替代方案:
vec <- c("CH4O", "H2O", "C10H18O2", "C2H4N3P2")

# Split each formula into element tokens such as "C10", "H", "O2". No "\\b"
# anchor is used: a formula is a single word, so a word boundary only exists
# at its start and must not be required before every element.
molecules <- regmatches(vec, gregexpr("[A-Z][a-z]*[0-9]*", vec))

# A symbol without a trailing number occurs once: append an explicit "1".
molecules <- lapply(molecules, function(a) {
  paste0(a, ifelse(grepl("[^0-9]$", a), "1", ""))
})

# Per formula: integer counts named by element symbol.
atomcounts <- lapply(molecules, function(mol) {
  setNames(as.integer(gsub("[^0-9]", "", mol)), gsub("[0-9]", "", mol))
})

# One row per formula, one column per element (in order of first appearance).
# The matrix is pre-filled with 0 so absent elements need no special casing,
# and it stays a matrix even when `vec` holds a single formula.
elements <- unique(unlist(lapply(atomcounts, names)))
atoms <- matrix(0L, nrow = length(vec), ncol = length(elements),
                dimnames = list(vec, elements))
for (i in seq_along(atomcounts)) {
  if (length(atomcounts[[i]]) > 0) {
    # Sum repeated symbols (e.g. "CH3OH" has H twice) before storing them.
    per.element <- tapply(atomcounts[[i]], names(atomcounts[[i]]), sum)
    atoms[i, names(per.element)] <- per.element
  }
}
atoms
# C H O N P
# CH4O 1 4 1 0 0
# H2O 0 2 1 0 0
# C10H18O2 10 18 2 0 0
# C2H4N3P2 2 4 0 3 2
我们可以使用 base R
方法与 strsplit
和 xtabs
# Insert an explicit "1" after every element symbol that has no count, then
# split at the letter/digit boundaries so symbols and counts alternate:
# "C10H18O2" -> "C" "10" "H" "18" "O" "2".
parts <- strsplit(gsub("(?<=[A-Z])(?![0-9])", "1", vec, perl = TRUE),
  "(?<=[A-Z])(?=[0-9])|(?<=[0-9])(?=[A-Z])", perl = TRUE)

# Odd positions are symbols, even positions are counts; tag each row with the
# formula it came from and stack everything into one long data.frame.
out <- do.call(rbind, Map(cbind,
  lapply(parts, function(x) data.frame(key = x[c(TRUE, FALSE)],
    value = as.numeric(x[c(FALSE, TRUE)]), stringsAsFactors = FALSE)),
  grp = vec))

# Pin the level order to order of first appearance so the table layout does
# not depend on the R version's stringsAsFactors default (xtabs() would
# otherwise sort character columns alphabetically).
out$key <- factor(out$key, levels = unique(out$key))
out$grp <- factor(out$grp, levels = unique(vec))
xtabs(value ~ grp + key, out)
# key
#grp C H O N P
# CH4O 1 4 1 0 0
# H2O 0 2 1 0 0
# C10H18O2 10 18 2 0 0
# C2H4N3P2 2 4 0 3 2
或 tidyverse
library(stringr)
library(dplyr)
library(tidyr)
# Same idea as the strsplit() approach: add an explicit "1" after symbols
# without a count, split into alternating symbol/count rows, pair them up per
# formula and spread the elements into columns.
tibble(col1 = vec,
       col2 = str_replace_all(col1, "(?<=[A-Z])(?![0-9])", "1")) %>%
  separate_rows(col2, sep = "(?<=[A-Z])(?=[0-9])|(?<=[0-9])(?=[A-Z])") %>%
  group_by(col1) %>%
  # Odd rows hold symbols, even rows hold counts; convert the counts to
  # integer so the result is numeric and the fill value below has the same
  # type as the column it fills.
  summarise(key = list(col2[c(TRUE, FALSE)]),
            val = list(as.integer(col2[c(FALSE, TRUE)]))) %>%
  unnest(c(key, val)) %>%
  pivot_wider(names_from = key, values_from = val,
              values_fill = list(val = 0L))
# A tibble: 4 x 6
# col1 C H O N P
# <chr> <int> <int> <int> <int> <int>
#1 C10H18O2 10 18 2 0 0
#2 C2H4N3P2 2 4 0 3 2
#3 CH4O 1 4 1 0 0
#4 H2O 0 2 1 0 0
数据
# Example chemical formulas used as input by the snippets above.
vec <- c("CH4O", "H2O", "C10H18O2", "C2H4N3P2")
我正在尝试从化学式中提取碳、氢和氧的数量。我之前找到了一些代码并一直在尝试使用。问题是,这段代码只有在化学式中该元素的原子数大于 1 时才有效。
V <- DATA # example: CH4O, H2O, C10H18O2
# V is a data.frame
# NOTE: sub() coerces its input with as.character(), so pass the formula
# column (e.g. V[, 3]) rather than the whole data.frame.
#
# Each pattern captures the digits that directly follow the element symbol
# (an optional ":" and whitespace may sit in between). When the symbol has no
# explicit count, or is absent, nothing matches: sub() returns the input
# unchanged and as.integer() turns it into NA.
# Backslashes must be doubled inside R string literals ("\\s", "\\d", "\\1").
C1 <- as.integer(sub("(?i).*?C:?\\s*(\\d+).*", "\\1", V))
# NA NA 10
H1 <- as.integer(sub("(?i).*?H:?\\s*(\\d+).*", "\\1", V))
# 4 2 18
O1 <- as.integer(sub("(?i).*?O:?\\s*(\\d+).*", "\\1", V))
# NA NA 2
我目前正在使用
# Replace missing counts with 1 (`is.na(C1) <- 1` would instead set C1[1] to NA).
C1[is.na(C1)] <- 1L
将 NA 更改为 1,然后手动更改 0 值。是否有更高效的代码可以得到化学式中各元素的正确计数(特别是在计数为 0 或 1 而得到 NA 的情况下)?如果您需要更多信息,或者我应该调整某些格式,请告诉我。
编辑: 所需的值是在没有 NA 的情况下获得所有正确的计数,并在可能的情况下手动将值更改为 0。
C1
# 1 0 10
H1
# 4 2 18
O1
# 1 1 2
编辑2: 这是我正在导入的实际数据的示例
Meas. m/z # Ion Formula Score m/z err [mDa] err [ppm] mSigma rdb e¯ Conf Adduct
84.080700 1 C5H10N n.a. 84.080776 0.1 0.9 n.a. 2.0 even
89.060100 1 C4H9O2 n.a. 89.059706 -0.4 -4.4 n.a. 1.0 even
131.987800 1 C2H4N3P2 n.a. 131.987498 -0.3 -2.3 n.a. 6.0 even
135.081100 1 C9H11O n.a. 135.080441 -0.7 -4.9 n.a. 5.0 even
135.117500 1 C10H15 n.a. 135.116827 -0.7 -5.0 n.a. 4.0 even
136.061700 1 C5H6N5 n.a. 136.061772 0.1 0.5 n.a. 6.0 even
在最初的问题中,我只是将 V
列为由化学式(formulas)组成的向量,但我实际拥有的是一个还包含其他信息的 data.frame,我在执行时使用 V[,3]
对感兴趣的列进行计算。
也许这不是我写过的最优雅的代码,但对于给定的化学式,它会返回一个带名称的向量,其中包含每种元素的计数。中间步骤 counts.per.equation
返回每个化学式中每种元素的个数。
library(stringr)
# Split every formula into one token per element: the symbol followed by its
# optional count, e.g. "C10H18O2" -> "C10" "H18" "O2". Matching on
# "[A-Z][a-z]*" keeps two-letter symbols such as "Cl" together.
extract <- str_extract_all(c("CH4O", "H2O", "C10H18O2"), "[A-Z][a-z]*\\d*")

# For each formula, build a numeric vector of counts named by element symbol.
counts.per.equation <- lapply(extract, function(x) {
  elements <- str_extract(x, "\\D+")
  counts <- as.numeric(str_extract(x, "\\d+"))
  # str_extract() yields NA for a token without digits, i.e. a single atom.
  counts[is.na(counts)] <- 1
  names(counts) <- elements
  counts
})

# Add the per-formula vectors together element by element. Indexing with a
# name that is absent from a vector gives NA, which na.rm = TRUE discards.
total.counts <- Reduce(function(x, y) {
  all.names <- union(names(x), names(y))
  vapply(all.names, function(z) sum(x[z], y[z], na.rm = TRUE), numeric(1))
}, counts.per.equation)
C H O
11 24 4
这是一个替代方案:
vec <- c("CH4O", "H2O", "C10H18O2", "C2H4N3P2")

# Split each formula into element tokens such as "C10", "H", "O2". No "\\b"
# anchor is used: a formula is a single word, so a word boundary only exists
# at its start and must not be required before every element.
molecules <- regmatches(vec, gregexpr("[A-Z][a-z]*[0-9]*", vec))

# A symbol without a trailing number occurs once: append an explicit "1".
molecules <- lapply(molecules, function(a) {
  paste0(a, ifelse(grepl("[^0-9]$", a), "1", ""))
})

# Per formula: integer counts named by element symbol.
atomcounts <- lapply(molecules, function(mol) {
  setNames(as.integer(gsub("[^0-9]", "", mol)), gsub("[0-9]", "", mol))
})

# One row per formula, one column per element (in order of first appearance).
# The matrix is pre-filled with 0 so absent elements need no special casing,
# and it stays a matrix even when `vec` holds a single formula.
elements <- unique(unlist(lapply(atomcounts, names)))
atoms <- matrix(0L, nrow = length(vec), ncol = length(elements),
                dimnames = list(vec, elements))
for (i in seq_along(atomcounts)) {
  if (length(atomcounts[[i]]) > 0) {
    # Sum repeated symbols (e.g. "CH3OH" has H twice) before storing them.
    per.element <- tapply(atomcounts[[i]], names(atomcounts[[i]]), sum)
    atoms[i, names(per.element)] <- per.element
  }
}
atoms
# C H O N P
# CH4O 1 4 1 0 0
# H2O 0 2 1 0 0
# C10H18O2 10 18 2 0 0
# C2H4N3P2 2 4 0 3 2
我们可以使用 base R
方法与 strsplit
和 xtabs
# Insert an explicit "1" after every element symbol that has no count, then
# split at the letter/digit boundaries so symbols and counts alternate:
# "C10H18O2" -> "C" "10" "H" "18" "O" "2".
parts <- strsplit(gsub("(?<=[A-Z])(?![0-9])", "1", vec, perl = TRUE),
  "(?<=[A-Z])(?=[0-9])|(?<=[0-9])(?=[A-Z])", perl = TRUE)

# Odd positions are symbols, even positions are counts; tag each row with the
# formula it came from and stack everything into one long data.frame.
out <- do.call(rbind, Map(cbind,
  lapply(parts, function(x) data.frame(key = x[c(TRUE, FALSE)],
    value = as.numeric(x[c(FALSE, TRUE)]), stringsAsFactors = FALSE)),
  grp = vec))

# Pin the level order to order of first appearance so the table layout does
# not depend on the R version's stringsAsFactors default (xtabs() would
# otherwise sort character columns alphabetically).
out$key <- factor(out$key, levels = unique(out$key))
out$grp <- factor(out$grp, levels = unique(vec))
xtabs(value ~ grp + key, out)
# key
#grp C H O N P
# CH4O 1 4 1 0 0
# H2O 0 2 1 0 0
# C10H18O2 10 18 2 0 0
# C2H4N3P2 2 4 0 3 2
或 tidyverse
library(stringr)
library(dplyr)
library(tidyr)
# Same idea as the strsplit() approach: add an explicit "1" after symbols
# without a count, split into alternating symbol/count rows, pair them up per
# formula and spread the elements into columns.
tibble(col1 = vec,
       col2 = str_replace_all(col1, "(?<=[A-Z])(?![0-9])", "1")) %>%
  separate_rows(col2, sep = "(?<=[A-Z])(?=[0-9])|(?<=[0-9])(?=[A-Z])") %>%
  group_by(col1) %>%
  # Odd rows hold symbols, even rows hold counts; convert the counts to
  # integer so the result is numeric and the fill value below has the same
  # type as the column it fills.
  summarise(key = list(col2[c(TRUE, FALSE)]),
            val = list(as.integer(col2[c(FALSE, TRUE)]))) %>%
  unnest(c(key, val)) %>%
  pivot_wider(names_from = key, values_from = val,
              values_fill = list(val = 0L))
# A tibble: 4 x 6
# col1 C H O N P
# <chr> <int> <int> <int> <int> <int>
#1 C10H18O2 10 18 2 0 0
#2 C2H4N3P2 2 4 0 3 2
#3 CH4O 1 4 1 0 0
#4 H2O 0 2 1 0 0
数据
# Example chemical formulas used as input by the snippets above.
vec <- c("CH4O", "H2O", "C10H18O2", "C2H4N3P2")