统计一串数据中的某些化学反应
Count certain chemical reaction in string of data
在一串分子式中统计甲基化(即 CH2 差异)的最有效方法是什么?假设在一串 100 个不同的分子式中,有 C6H14O3、C5H12O3 和 C4H10O3 这样的公式。第一个和第二个相差一个 CH2,第二个和第三个也相差一个 CH2,因此这串分子式中有 2 个甲基化。当分子式的数量非常大时,情况会变得复杂得多。所以我想计算其中有多少对分子式仅相差一个 CH2。
考虑一下我有:
# Example data: a one-column data frame of molecular formulas, listed here
# one carbon count per line (C6 through C11).
DT <- data.frame(
  formula = c(
    "C6H12O7S1", "C6H10O8S1",
    "C7H4O2N4",
    "C8H12O5S1", "C8H16O5S1", "C8H12O3N2", "C8H14O4S1",
    "C9H7O3N1S1", "C9H11O6N1S1", "C9H9O6N1S1", "C9H12O5S1", "C9H18O5S1",
    "C9H14O5", "C9H20O5S1", "C9H9O4N1S1", "C9H9O5N1S1", "C9H14O6",
    "C10H11O5N1S1", "C10H14O6", "C10H16O6S1", "C10H17O5N1", "C10H20O7S1",
    "C10H14O4", "C10H12O7N2", "C10H16O6", "C10H14O6N2", "C10H7O4N1S1",
    "C10H18O6S1", "C10H16O5", "C10H13O6N1S1", "C10H18O7S1",
    "C11H18O6S1", "C11H15O6N1", "C11H22O7S1", "C11H16O6S1", "C11H16O6",
    "C11H18O6"
  )
)
我想数一数这个字符串中有多少个如上所述的 CH2 差异。
有人知道一种简单的方法可以做到这一点吗?尤其是在字符串非常大的情况下。
非常感谢。
试试这个。可能不是最有效的,但应该有效:
# Extract the carbon count of every formula as an integer. The count is the
# run of digits that follows the leading "C", so two-digit counts such as the
# 10 in "C10H14O6" are read in full. The formula column itself is left
# untouched; as.character() makes this work for a factor column as well.
carbons <- as.integer(sub("^C([0-9]+).*$", "\\1", as.character(DT$formula)))

# Number of unordered pairs of formulas whose carbon counts differ by exactly 1.
# outer() builds the full matrix of pairwise differences, in which every
# qualifying pair appears twice, as (i, j) and as (j, i), hence the division
# by 2. Formulas whose carbon count could not be parsed (NA) are ignored.
# NOTE: only carbon counts are compared; hydrogen and the other elements are
# not checked, so this is an upper bound on the number of true CH2 differences.
counter <- sum(abs(outer(carbons, carbons, "-")) == 1L, na.rm = TRUE) / 2
此解决方案使用 thelatemail 的建议作为拆分字符串的起点:
重现原始数据集(将 stringsAsFactors 设置为 F)
# Packages used by this solution: dplyr supplies %>%, arrange() and the
# grouping/join verbs; tidyr supplies spread().
library(dplyr)
library(tidyr)

# Rebuild the example data with the formulas stored as character strings (not
# factors) and the rows sorted by formula. FALSE is spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
DT <- data.frame(
  formula = c(
    "C6H12O7S1", "C6H10O8S1", "C7H4O2N4", "C8H12O5S1", "C8H16O5S1",
    "C8H12O3N2", "C8H14O4S1", "C9H7O3N1S1", "C9H11O6N1S1", "C9H9O6N1S1",
    "C9H12O5S1", "C9H18O5S1", "C9H14O5", "C9H20O5S1", "C9H9O4N1S1",
    "C9H9O5N1S1", "C9H14O6", "C10H11O5N1S1", "C10H14O6", "C10H16O6S1",
    "C10H17O5N1", "C10H20O7S1", "C10H14O4", "C10H12O7N2", "C10H16O6",
    "C10H14O6N2", "C10H7O4N1S1", "C10H18O6S1", "C10H16O5", "C10H13O6N1S1",
    "C10H18O7S1", "C11H18O6S1", "C11H15O6N1", "C11H22O7S1", "C11H16O6S1",
    "C11H16O6", "C11H18O6"
  ),
  stringsAsFactors = FALSE
) %>%
  arrange(formula)
> head(DT)
formula
1 C10H11O5N1S1
2 C10H12O7N2
3 C10H13O6N1S1
4 C10H14O4
5 C10H14O6
6 C10H14O6N2
扩展列,使每个元素都有一列来计算(缺少的列计为 0)
# Build one row per formula with one count column per element.
#  1. strsplit() cuts each formula at every boundary between a digit and an
#     uppercase letter, giving alternating tokens: symbol, count, symbol, ...
#  2. Each token vector becomes a long data frame (formula, element, count).
#  3. rbindlist() stacks those frames and spread() turns the element column
#     into one column per element, filling absent elements with 0.
DT2 <- spread(
  data.table::rbindlist(
    lapply(
      strsplit(DT$formula, "(?<=[0-9])(?=[A-Z])|(?<=[A-Z])(?=[0-9])", perl = TRUE),
      function(tokens) {
        n_tokens <- length(tokens)
        # Odd positions hold element symbols, even positions hold their counts.
        symbol_at <- seq(from = 1, to = n_tokens, by = 2)
        count_at <- seq(from = 2, to = n_tokens, by = 2)
        structure(
          list(
            formula = rep(paste0(tokens, collapse = ""), n_tokens / 2),
            element = tokens[symbol_at],
            count = as.integer(tokens[count_at])
          ),
          row.names = c(NA, -n_tokens / 2),
          class = "data.frame"
        )
      }
    )
  ),
  element, count,
  fill = 0
)
>head(DT2)
formula C H N O S
1: C10H11O5N1S1 10 11 1 5 1
2: C10H12O7N2 10 12 2 7 0
3: C10H13O6N1S1 10 13 1 6 1
4: C10H14O4 10 14 0 4 0
5: C10H14O6 10 14 0 6 0
6: C10H14O6N2 10 14 2 6 0
获取所有可能成对组合的列表并扩展数据集
# Every unordered pair of row indices of DT2: a 2-row matrix with one column
# per pair.
pairwise.combos <- combn(nrow(DT2), m = 2)
# Stack the first members of all pairs on top of the second members, then tag
# both halves with the same running pair id so the two rows of a pair can be
# grouped together later.
DT3 <- rbind(
  DT2[pairwise.combos[1, ], ],
  DT2[pairwise.combos[2, ], ]
)
DT3$pair <- rep(seq_len(ncol(pairwise.combos)), times = 2)
> head(DT3)
formula C H N O S pair
1: C10H11O5N1S1 10 11 1 5 1 1
2: C10H11O5N1S1 10 11 1 5 1 2
3: C10H11O5N1S1 10 11 1 5 1 3
4: C10H11O5N1S1 10 11 1 5 1 4
5: C10H11O5N1S1 10 11 1 5 1 5
6: C10H11O5N1S1 10 11 1 5 1 6
最后一步的编辑 根据 OP 的说明,一对公式应该仅在 CH2 上有所不同,其他元素的计数应该相同。
检查每一对公式是否恰好相差一个 CH2(可以修改此步骤以检查其他差异)
# For every pair, put its two formulas in ascending order of carbon, then
# hydrogen count, so that diff() is always "second row minus first row" with a
# non-negative carbon difference. A pair is kept when it differs by exactly
# one CH2 (C + 1, H + 2) and has identical N, O and S counts.
# Each pair has exactly two rows, so every diff() is a single value and the
# scalar operator && is used for the whole condition.
DT4 <- DT3 %>%
  group_by(pair) %>%
  arrange(C, H) %>%
  summarise(CH2.diff = (diff(C) == 1) && (diff(H) == 2) &&
              (diff(N) == 0) && (diff(O) == 0) && (diff(S) == 0)) %>%
  ungroup() %>%
  filter(CH2.diff) %>%
  select(pair)
# Bring back both formulas of every matching pair. The join key is spelled out
# so dplyr neither has to guess it nor prints a message about the guess.
DT4 <- right_join(DT3, DT4, by = "pair")
# total count of CH2 in pairwise comparisons
> length(unique(DT4$pair))
[1] 9
# check which pairs differ by CH2
> head(DT4)
formula C H N O S pair
1 C10H11O5N1S1 10 11 1 5 1 35
2 C9H9O5N1S1 9 9 1 5 1 35
3 C10H13O6N1S1 10 13 1 6 1 96
4 C9H11O6N1S1 9 11 1 6 1 96
5 C10H14O6 10 14 0 6 0 149
6 C11H16O6 11 16 0 6 0 149
在一串分子式中统计甲基化(即 CH2 差异)的最有效方法是什么?假设在一串 100 个不同的分子式中,有 C6H14O3、C5H12O3 和 C4H10O3 这样的公式。第一个和第二个相差一个 CH2,第二个和第三个也相差一个 CH2,因此这串分子式中有 2 个甲基化。当分子式的数量非常大时,情况会变得复杂得多。所以我想计算其中有多少对分子式仅相差一个 CH2。
考虑一下我有:
# Example data: a one-column data frame of molecular formulas, listed here
# one carbon count per line (C6 through C11).
DT <- data.frame(
  formula = c(
    "C6H12O7S1", "C6H10O8S1",
    "C7H4O2N4",
    "C8H12O5S1", "C8H16O5S1", "C8H12O3N2", "C8H14O4S1",
    "C9H7O3N1S1", "C9H11O6N1S1", "C9H9O6N1S1", "C9H12O5S1", "C9H18O5S1",
    "C9H14O5", "C9H20O5S1", "C9H9O4N1S1", "C9H9O5N1S1", "C9H14O6",
    "C10H11O5N1S1", "C10H14O6", "C10H16O6S1", "C10H17O5N1", "C10H20O7S1",
    "C10H14O4", "C10H12O7N2", "C10H16O6", "C10H14O6N2", "C10H7O4N1S1",
    "C10H18O6S1", "C10H16O5", "C10H13O6N1S1", "C10H18O7S1",
    "C11H18O6S1", "C11H15O6N1", "C11H22O7S1", "C11H16O6S1", "C11H16O6",
    "C11H18O6"
  )
)
我想数一数这个字符串中有多少个如上所述的 CH2 差异。
有人知道一种简单的方法可以做到这一点吗?尤其是在字符串非常大的情况下。
非常感谢。
试试这个。可能不是最有效的,但应该有效:
# Extract the carbon count of every formula as an integer. The count is the
# run of digits that follows the leading "C", so two-digit counts such as the
# 10 in "C10H14O6" are read in full. The formula column itself is left
# untouched; as.character() makes this work for a factor column as well.
carbons <- as.integer(sub("^C([0-9]+).*$", "\\1", as.character(DT$formula)))

# Number of unordered pairs of formulas whose carbon counts differ by exactly 1.
# outer() builds the full matrix of pairwise differences, in which every
# qualifying pair appears twice, as (i, j) and as (j, i), hence the division
# by 2. Formulas whose carbon count could not be parsed (NA) are ignored.
# NOTE: only carbon counts are compared; hydrogen and the other elements are
# not checked, so this is an upper bound on the number of true CH2 differences.
counter <- sum(abs(outer(carbons, carbons, "-")) == 1L, na.rm = TRUE) / 2
此解决方案使用 thelatemail 的建议作为拆分字符串的起点:
重现原始数据集(将 stringsAsFactors 设置为 F)
# Packages used by this solution: dplyr supplies %>%, arrange() and the
# grouping/join verbs; tidyr supplies spread().
library(dplyr)
library(tidyr)

# Rebuild the example data with the formulas stored as character strings (not
# factors) and the rows sorted by formula. FALSE is spelled out because the
# shorthand F is an ordinary variable that can be reassigned.
DT <- data.frame(
  formula = c(
    "C6H12O7S1", "C6H10O8S1", "C7H4O2N4", "C8H12O5S1", "C8H16O5S1",
    "C8H12O3N2", "C8H14O4S1", "C9H7O3N1S1", "C9H11O6N1S1", "C9H9O6N1S1",
    "C9H12O5S1", "C9H18O5S1", "C9H14O5", "C9H20O5S1", "C9H9O4N1S1",
    "C9H9O5N1S1", "C9H14O6", "C10H11O5N1S1", "C10H14O6", "C10H16O6S1",
    "C10H17O5N1", "C10H20O7S1", "C10H14O4", "C10H12O7N2", "C10H16O6",
    "C10H14O6N2", "C10H7O4N1S1", "C10H18O6S1", "C10H16O5", "C10H13O6N1S1",
    "C10H18O7S1", "C11H18O6S1", "C11H15O6N1", "C11H22O7S1", "C11H16O6S1",
    "C11H16O6", "C11H18O6"
  ),
  stringsAsFactors = FALSE
) %>%
  arrange(formula)
> head(DT)
formula
1 C10H11O5N1S1
2 C10H12O7N2
3 C10H13O6N1S1
4 C10H14O4
5 C10H14O6
6 C10H14O6N2
扩展列,使每个元素都有一列来计算(缺少的列计为 0)
# Build one row per formula with one count column per element.
#  1. strsplit() cuts each formula at every boundary between a digit and an
#     uppercase letter, giving alternating tokens: symbol, count, symbol, ...
#  2. Each token vector becomes a long data frame (formula, element, count).
#  3. rbindlist() stacks those frames and spread() turns the element column
#     into one column per element, filling absent elements with 0.
DT2 <- spread(
  data.table::rbindlist(
    lapply(
      strsplit(DT$formula, "(?<=[0-9])(?=[A-Z])|(?<=[A-Z])(?=[0-9])", perl = TRUE),
      function(tokens) {
        n_tokens <- length(tokens)
        # Odd positions hold element symbols, even positions hold their counts.
        symbol_at <- seq(from = 1, to = n_tokens, by = 2)
        count_at <- seq(from = 2, to = n_tokens, by = 2)
        structure(
          list(
            formula = rep(paste0(tokens, collapse = ""), n_tokens / 2),
            element = tokens[symbol_at],
            count = as.integer(tokens[count_at])
          ),
          row.names = c(NA, -n_tokens / 2),
          class = "data.frame"
        )
      }
    )
  ),
  element, count,
  fill = 0
)
>head(DT2)
formula C H N O S
1: C10H11O5N1S1 10 11 1 5 1
2: C10H12O7N2 10 12 2 7 0
3: C10H13O6N1S1 10 13 1 6 1
4: C10H14O4 10 14 0 4 0
5: C10H14O6 10 14 0 6 0
6: C10H14O6N2 10 14 2 6 0
获取所有可能成对组合的列表并扩展数据集
# Every unordered pair of row indices of DT2: a 2-row matrix with one column
# per pair.
pairwise.combos <- combn(nrow(DT2), m = 2)
# Stack the first members of all pairs on top of the second members, then tag
# both halves with the same running pair id so the two rows of a pair can be
# grouped together later.
DT3 <- rbind(
  DT2[pairwise.combos[1, ], ],
  DT2[pairwise.combos[2, ], ]
)
DT3$pair <- rep(seq_len(ncol(pairwise.combos)), times = 2)
> head(DT3)
formula C H N O S pair
1: C10H11O5N1S1 10 11 1 5 1 1
2: C10H11O5N1S1 10 11 1 5 1 2
3: C10H11O5N1S1 10 11 1 5 1 3
4: C10H11O5N1S1 10 11 1 5 1 4
5: C10H11O5N1S1 10 11 1 5 1 5
6: C10H11O5N1S1 10 11 1 5 1 6
最后一步的编辑 根据 OP 的说明,一对公式应该仅在 CH2 上有所不同,其他元素的计数应该相同。
检查每一对公式是否恰好相差一个 CH2(可以修改此步骤以检查其他差异)
# For every pair, put its two formulas in ascending order of carbon, then
# hydrogen count, so that diff() is always "second row minus first row" with a
# non-negative carbon difference. A pair is kept when it differs by exactly
# one CH2 (C + 1, H + 2) and has identical N, O and S counts.
# Each pair has exactly two rows, so every diff() is a single value and the
# scalar operator && is used for the whole condition.
DT4 <- DT3 %>%
  group_by(pair) %>%
  arrange(C, H) %>%
  summarise(CH2.diff = (diff(C) == 1) && (diff(H) == 2) &&
              (diff(N) == 0) && (diff(O) == 0) && (diff(S) == 0)) %>%
  ungroup() %>%
  filter(CH2.diff) %>%
  select(pair)
# Bring back both formulas of every matching pair. The join key is spelled out
# so dplyr neither has to guess it nor prints a message about the guess.
DT4 <- right_join(DT3, DT4, by = "pair")
# total count of CH2 in pairwise comparisons
> length(unique(DT4$pair))
[1] 9
# check which pairs differ by CH2
> head(DT4)
formula C H N O S pair
1 C10H11O5N1S1 10 11 1 5 1 35
2 C9H9O5N1S1 9 9 1 5 1 35
3 C10H13O6N1S1 10 13 1 6 1 96
4 C9H11O6N1S1 9 11 1 6 1 96
5 C10H14O6 10 14 0 6 0 149
6 C11H16O6 11 16 0 6 0 149