如何在 R 中使用自动过程(即 for 循环等)查找向量之间特定值序列的匹配行
How to find the match rows for a particular sequence of values between vectors using an automatic process (i.e. for loop, etc) in R
我有向量 A
和 B
。矢量的长度 A
是 12。矢量的长度 B
是 23。
A <- c(125,195,322,421,65,102,85,98,88,176,300)
B <- c(62,138,124,78,117,84,148,91,71,112,137,102,65,102,85,98,88,176,150,78,72,68,102)
我需要在自动化过程中做几件事(如果可能的话):
首先,我需要在 A
中找到满足此要求的最长值序列:它们是连续的,并且介于 60 和 180 之间。在本例中,该序列将是:
A.selected <- c(65,102,85,98,88,176)
其次,我必须在 B
中找到与 [=24= 重合度最高的值序列(与 A.selected
长度相同)的第一行].我想这样做的是:
# First, I create different vectors of `B` of the same length (`5` in this example) than `A.selected` starting from the first row in `B`:
B_1 <- c(B[1],B[2],B[3],B[4],B[5],B[6])
B_2 <- c(B[2],B[3],B[4],B[5],B[6],B[7])
B_3 <- c(B[3],B[4],B[5],B[6],B[7],B[8])
. . . .
. . . .
. . . .
B_13 <- c(B[13],B[14],B[15],B[16],B[17],B[18])
. . . .
. . . .
# Second, I estimate the strength of the correlation between `A.selected` and the different combinations of `B` (`B_1`,`B_2`,...,`B_13`,`B_14`, etc) using the Pearson's correlation coefficient (`R²`). I also calculated the `P-value` of this correlation to be sure is significant.
mod1 <- cor.test(A.selected,B_1)
mod2 <- cor.test(A.selected,B_2)
mod3 <- cor.test(A.selected,B_3)
. . . .
. . . .
. . . .
mod13 <- cor.test(A.selected,B_13)
coef.mod1 <- c(as.numeric(mod1[4]),as.numeric(mod1[3])) # R² and P-value of the 1st correlation
coef.mod2 <- c(as.numeric(mod2[4]),as.numeric(mod2[3])) # R² and P-value of the 2nd correlation
coef.mod3 <- c(as.numeric(mod3[4]),as.numeric(mod3[3])) # R² and P-value of the first correlation
. . . .
. . . .
. . . .
coef.mod13 <- c(as.numeric(mod4[4]),as.numeric(mod4[3])) # R² and P-value of the first correlation
# I find the model with the highest R², but considering also that the `P-value` has to be lower than `0.05`.
Models.dataframe <- data.frame(R2 = c(coef.mod1[1],coef.mod2[1],coef.mod3[1],0.38,0.65,0.13,0.26,-0.34,0.76,0.48,0.32,0.42,coef.mod13[1]),
P.value = c(coef.mod1[2], coef.mod2[2], coef.mod3[2], 0.38, 0.65, 0.13, 0.26, 0.84, 0.26, 0.48, 0.32, 0.42, coef.mod13[2]))
rownames(Models.dataframe[which.max(Models.dataframe$R2) & Models.dataframe$P.value <= 0.05,])
"13" # In row 13 in `B` starts the sequence of numbers that have the highest overlap with the sequence `A.selected`
在现实世界中,A
和B
有几十万的长度,所以我需要一个代码来自动完成所有这些。
有谁知道如何创建代码来自动执行此操作?
实现步骤 1 的方法有些麻烦:
编辑 根据 OP 的评论:
library(tidyverse)
get_consecutive_grps <- function(x) {
runlengths <- rle(x) %>% .$lengths
map2(runlengths, 1:length(runlengths), ~ rep(..2, ..1)) %>% unlist()
}
tmp <-
enframe(A) %>%
mutate(
sel = between(value, 60L, 180L),
idx = get_consecutive_grps(sel)
) %>%
group_by(idx) %>%
mutate(
idx_cnt = row_number()
)
longestrun <- filter(tmp, sel) %>% pull(idx_cnt) %>% max()
longestidx <- filter(tmp, sel, idx_cnt == longestrun) %>% pull(idx)
# It's possible that there are several selected sequences of the same length;
# use the first one
A.selected <- filter(tmp, idx == longestidx[1]) %>% pull(value)
编辑:
我在第二步中添加了一个同样麻烦的方法:
get_Bs <- function(start_idx, length, vec) {
vec[start_idx:(start_idx + length - 1)]
}
offset <- 1:(length(B) - length(A.selected))
Bs <-
map_dfc(offset, get_Bs, length = length(A.selected), vec = B) %>%
setNames(str_c("Bidx_", offset)) %>%
mutate(relpos = row_number()) %>%
select(relpos, everything())
# Rearrange data and calculate correlations with `A.selected`
B_corr <-
Bs %>%
pivot_longer(
cols = -relpos,
names_to = "Bidx",
names_prefix = "Bidx_"
) %>%
pivot_wider(
id_cols = Bidx,
values_from = value,
names_from = relpos,
names_prefix = "relpos_"
) %>%
nest(B_snippits = starts_with("relpos")) %>%
mutate(
corr = map(B_snippits, ~ cor.test(A.selected, as.numeric(..1))),
corr_tidy = map(corr, broom::tidy)
) %>%
unnest(corr_tidy)
# Get B-index for highest correlation
B_corr %>%
filter(estimate == max(B_corr$estimate), p.value <= 0.05) %>%
pull(Bidx)
# ==> "13"
我相信还有更直接的方法可以做到这一切...
我有向量 A
和 B
。矢量的长度 A
是 12。矢量的长度 B
是 23。
A <- c(125,195,322,421,65,102,85,98,88,176,300)
B <- c(62,138,124,78,117,84,148,91,71,112,137,102,65,102,85,98,88,176,150,78,72,68,102)
我需要在自动化过程中做几件事(如果可能的话):
首先,我需要在 A
中找到满足此要求的最长值序列:它们是连续的,并且介于 60 和 180 之间。在本例中,该序列将是:
A.selected <- c(65,102,85,98,88,176)
其次,我必须在 B
中找到与 [=24= 重合度最高的值序列(与 A.selected
长度相同)的第一行].我想这样做的是:
# First, I create different vectors of `B` of the same length (`5` in this example) than `A.selected` starting from the first row in `B`:
B_1 <- c(B[1],B[2],B[3],B[4],B[5],B[6])
B_2 <- c(B[2],B[3],B[4],B[5],B[6],B[7])
B_3 <- c(B[3],B[4],B[5],B[6],B[7],B[8])
. . . .
. . . .
. . . .
B_13 <- c(B[13],B[14],B[15],B[16],B[17],B[18])
. . . .
. . . .
# Second, I estimate the strength of the correlation between `A.selected` and the different combinations of `B` (`B_1`,`B_2`,...,`B_13`,`B_14`, etc) using the Pearson's correlation coefficient (`R²`). I also calculated the `P-value` of this correlation to be sure is significant.
mod1 <- cor.test(A.selected,B_1)
mod2 <- cor.test(A.selected,B_2)
mod3 <- cor.test(A.selected,B_3)
. . . .
. . . .
. . . .
mod13 <- cor.test(A.selected,B_13)
coef.mod1 <- c(as.numeric(mod1[4]),as.numeric(mod1[3])) # R² and P-value of the 1st correlation
coef.mod2 <- c(as.numeric(mod2[4]),as.numeric(mod2[3])) # R² and P-value of the 2nd correlation
coef.mod3 <- c(as.numeric(mod3[4]),as.numeric(mod3[3])) # R² and P-value of the first correlation
. . . .
. . . .
. . . .
coef.mod13 <- c(as.numeric(mod4[4]),as.numeric(mod4[3])) # R² and P-value of the first correlation
# I find the model with the highest R², but considering also that the `P-value` has to be lower than `0.05`.
Models.dataframe <- data.frame(R2 = c(coef.mod1[1],coef.mod2[1],coef.mod3[1],0.38,0.65,0.13,0.26,-0.34,0.76,0.48,0.32,0.42,coef.mod13[1]),
P.value = c(coef.mod1[2], coef.mod2[2], coef.mod3[2], 0.38, 0.65, 0.13, 0.26, 0.84, 0.26, 0.48, 0.32, 0.42, coef.mod13[2]))
rownames(Models.dataframe[which.max(Models.dataframe$R2) & Models.dataframe$P.value <= 0.05,])
"13" # In row 13 in `B` starts the sequence of numbers that have the highest overlap with the sequence `A.selected`
在现实世界中,A
和B
有几十万的长度,所以我需要一个代码来自动完成所有这些。
有谁知道如何创建代码来自动执行此操作?
实现步骤 1 的方法有些麻烦:
编辑 根据 OP 的评论:
library(tidyverse)
get_consecutive_grps <- function(x) {
runlengths <- rle(x) %>% .$lengths
map2(runlengths, 1:length(runlengths), ~ rep(..2, ..1)) %>% unlist()
}
tmp <-
enframe(A) %>%
mutate(
sel = between(value, 60L, 180L),
idx = get_consecutive_grps(sel)
) %>%
group_by(idx) %>%
mutate(
idx_cnt = row_number()
)
longestrun <- filter(tmp, sel) %>% pull(idx_cnt) %>% max()
longestidx <- filter(tmp, sel, idx_cnt == longestrun) %>% pull(idx)
# It's possible that there are several selected sequences of the same length;
# use the first one
A.selected <- filter(tmp, idx == longestidx[1]) %>% pull(value)
编辑: 我在第二步中添加了一个同样麻烦的方法:
get_Bs <- function(start_idx, length, vec) {
vec[start_idx:(start_idx + length - 1)]
}
offset <- 1:(length(B) - length(A.selected))
Bs <-
map_dfc(offset, get_Bs, length = length(A.selected), vec = B) %>%
setNames(str_c("Bidx_", offset)) %>%
mutate(relpos = row_number()) %>%
select(relpos, everything())
# Rearrange data and calculate correlations with `A.selected`
B_corr <-
Bs %>%
pivot_longer(
cols = -relpos,
names_to = "Bidx",
names_prefix = "Bidx_"
) %>%
pivot_wider(
id_cols = Bidx,
values_from = value,
names_from = relpos,
names_prefix = "relpos_"
) %>%
nest(B_snippits = starts_with("relpos")) %>%
mutate(
corr = map(B_snippits, ~ cor.test(A.selected, as.numeric(..1))),
corr_tidy = map(corr, broom::tidy)
) %>%
unnest(corr_tidy)
# Get B-index for highest correlation
B_corr %>%
filter(estimate == max(B_corr$estimate), p.value <= 0.05) %>%
pull(Bidx)
# ==> "13"
我相信还有更直接的方法可以做到这一切...