如何在 R 中使用三个 data.frames 创建双种子 "if" 循环?

How to create a double-seeded "if" loop using three data.frames in R?

我有两个数据框 df1 和 df2,如下所示:

# Example input data, parsed from inline whitespace-delimited text.
# df1: one row per SNP with alleles A1/A2, a zscore and a LOC key;
# CEUmaf holds "." on rows where no value is given.
df1 <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
  SNP    CHR            BP A1      A2       zscore       P CEUmaf    LOC
rs58043752      1       3344877  A       G       0.289   0.7726  .  1:3344877
rs2483242       1       3345145  A       T       0.393   0.6946  .  1:3345145
rs1572039       1       3345216  T       C       0.443   0.658   .  1:3345216
rs1537407       1       3345705  T       C       -0.289  0.7726  .  1:3345705
rs2493277       1       3346348  C       G       -1.552  0.1207  0.09167  1:3346348
rs11583353      1       3346403  C       T       -0.414  0.6786  0.875  1:3346403")
# df2: the same SNPs (ID/LOC) with alleles AA/DA and the SDS value per row.
df2 <- read.table(header = TRUE, stringsAsFactors = FALSE, text = "
  CHR         POS         ID     AA      DA          DAF               SDS              LOC
1       3344877 rs58043752      G       A       0.1095  0.80517243505521        1:3344877
1       3345145 rs2483242       T       A       0.5746  0.741513997303754       1:3345145
1       3345216 rs1572039       T       C       0.0784  0.130228249846394       1:3345216
1       3345705 rs1537407       C       T       0.798   0.275710355505832       1:3345705
1       3346348 rs2493277       G       C       0.5737  0.283452115383779       1:3346348
1       3346403 rs11583353      C       T       0.2238  -0.0246952604330743     1:3346403")

我有第三个数据框 (df3),例如:

Input_SNP  SDS
1:3344877   NA 
1:3345145   NA   
1:3345216   NA  
1:3345705   NA
1:3346348   NA   
1:3346403   NA 

我想将 df1 的 A1 和 A2 与 df2 的 AA 和 DA 进行比较,然后输出到第三个 df3。我的逻辑是这样的:

  1. 如果 df1$zscore 为正: 检查是否 df1$A1 == df2$DA,如果是,则将 df2$SDS 放入 df3$SDS;如果 df1$A1 == df2$AA,则将 df2$SDS 的相反数放入 df3$SDS。
  2. 如果 df1$zscore 为负: 检查是否 df1$A2 == df2$DA,如果是,则将 df2$SDS 放入 df3$SDS;如果 df1$A2 == df2$AA,则将 df2$SDS 的相反数放入 df3$SDS。

最终输出将如下所示:

    Input_SNP      SDS
    1:3344877   0.805
    1:3345145   0.742   
    1:3345216   -0.130  
    1:3345705   -0.276
    1:3346348   -0.283   
    1:3346403   -0.025

我敢肯定有更快的解决方案,但简单循环的优点是逻辑清晰,而且很容易调整。

# Self-contained toy example. Rows of df1 and df2 are paired by position
# (the same index i is used for both), so the two frames must list the same
# records in the same order.
df1 <- data.frame(
  BP = 1:6,
  A1 = c("A", "A", "T", "T", "C", "C"),
  A2 = c("G", "T", "C", "C", "G", "T"),
  zscore = runif(6, min = -1, max = 1),
  stringsAsFactors = FALSE
)

df2 <- data.frame(
  CHR = 1:6,
  AA = c("G", "T", "T", "C", "G", "C"),
  DA = c("A", "A", "C", "T", "C", "T"),
  SDS = runif(6),
  stringsAsFactors = FALSE
)

# Positional pairing only makes sense when both frames have the same length.
stopifnot(nrow(df1) == nrow(df2))

# Preallocate the result as numeric NA; a row keeps NA when the tested allele
# matches neither DA nor AA.
df3 <- data.frame(SDS = rep(NA_real_, nrow(df1)))

# seq_len() gives an empty sequence for a zero-row frame, whereas
# 1:nrow(df1) would iterate over c(1, 0).
for (i in seq_len(nrow(df1))) {
  # Non-negative zscore: test A1. Negative zscore: test A2.
  allele <- if (df1$zscore[i] >= 0) df1$A1[i] else df1$A2[i]

  if (allele == df2$DA[i]) {
    # Tested allele equals DA: keep the sign of SDS.
    df3$SDS[i] <- df2$SDS[i]
  } else if (allele == df2$AA[i]) {
    # Tested allele equals AA: flip the sign of SDS.
    df3$SDS[i] <- -df2$SDS[i]
  }
}

df3

这是另一种方法。首先确定 zscore 在哪些行为负,据此选择 df1 中要与 df2 比较的列(A1 或 A2)。接下来确定它与 df2 的哪一列(DA 或 AA)匹配。`is.na(matches) <- ...` 这一行是针对两个数据框之间没有匹配的情况的保护措施。最后根据匹配结果返回带正号或负号的 SDS。

# Vectorised sign assignment; rows of df1 and df2 are paired by position.
#   coll    - per row, which df1 column to test: 1 (A1) unless zscore < 0,
#             in which case 2 (A2).
#   indx1   - the chosen allele per row, picked by (row, column) indexing.
#   xx      - logical matrix: does that allele equal DA (col 1) / AA (col 2)?
#   matches - column of the match. ties.method = "first" makes the result
#             deterministic (the default breaks ties at random) and prefers
#             DA when both columns match.
#   Rows where neither column matches are set to NA, so their SDS is NA.
coll <- (df1$zscore < 0) + 1L
indx1 <- df1[c("A1", "A2")][cbind(seq_len(nrow(df1)), coll)]
matches <- max.col((xx <- indx1 == df2[c("DA", "AA")]), ties.method = "first")
is.na(matches) <- rowSums(xx) == 0L
df3$SDS <- df2$SDS * ifelse(matches == 1, 1, -1)
df3
#   Input_SNP         SDS
# 1 1:3344877  0.80517244
# 2 1:3345145  0.74151400
# 3 1:3345216 -0.13022825
# 4 1:3345705 -0.27571036
# 5 1:3346348 -0.28345212
# 6 1:3346403 -0.02469526

我们可以先合并两个数据框,然后根据条件翻转 SDS 的符号:

# merge: join df1 and df2 on chromosome, position and SNP id. Columns are
# selected by name rather than by position, so a change in column order
# cannot silently pull in the wrong columns.
res <- merge(
  df1[, c("SNP", "CHR", "BP", "A1", "A2", "zscore")],
  df2[, c("CHR", "POS", "ID", "AA", "DA", "SDS")],
  by.x = c("CHR", "BP", "SNP"),
  by.y = c("CHR", "POS", "ID")
)

# make Input_SNP id column:
res$Input_SNP <- paste(res$CHR, res$BP, sep = ":")

# then flip effect based on sign and allele match
res$SDS <- with(res, {
  # Allele to test: A1 for a positive zscore, A2 for a negative one.
  # A zscore of exactly 0 selects neither and yields NA.
  allele <- ifelse(zscore > 0, A1, ifelse(zscore < 0, A2, NA_character_))
  # +1 keeps SDS when the allele equals DA, -1 flips it when it equals AA;
  # no match gives NA.
  flip <- ifelse(allele == DA, 1, ifelse(allele == AA, -1, NA_real_))
  SDS * flip
})

# subset required columns
res <- res[, c("Input_SNP", "SDS")]
res
#   Input_SNP         SDS
# 1 1:3344877  0.80517244
# 2 1:3345145  0.74151400
# 3 1:3345216 -0.13022825
# 4 1:3345705 -0.27571036
# 5 1:3346348 -0.28345212
# 6 1:3346403 -0.02469526