根据带有列表的公共值从 data.frame 中提取行
Extract rows from a data.frame based on common values with a list
我正在寻找一种基于数字序列列表从 data.frame 中过滤行的简单方法。
举个例子:
我的初始数据框:
data <- data.frame(x=c(0,1,2,0,1,2,3,4,5,12,2,0,10,11,12,13),y="other_data")
我的清单:
list1 <- list(1:5,10:13)
我的目标是仅保留 "data" 中的行,其中 "list1" 的数字序列与 "data" 的 "x" 列中的数字序列完全相同。
所以输出 data.frame 应该是:
finaldata <- data.frame(x=c(1:5,10:13),y="other_data")
有什么想法吗?
为什么不使用 zoo
中的 rollapply
:
library(zoo)
ind = lapply(list1, function(x) {
n = length(x)
which(rollapply(data$x, n, function(y) all(y==x))) + 0:(n-1)
})
data[unlist(ind),]
#x y
#5 1 other_data
#6 2 other_data
#7 3 other_data
#8 4 other_data
#9 5 other_data
#13 10 other_data
#14 11 other_data
#15 12 other_data
#16 13 other_data
我从一个自定义函数开始获取一个序列的子集,然后使用 lapply 很容易扩展。
#function that takes sequence and a vector
#and returns indices of vector that have complete sequence
get_row_indices<- function(sequence,v){
#get run lengths of whether vector is in sequence
rle_d <- rle(v %in% sequence)
#test if it's complete, so both v in sequence and length of
#matches is length of sequence
select <- rep(length(sequence)==rle_d$lengths &rle_d$values,rle_d$lengths)
return(select)
}
#add row ID to data to show selection
data$row_id <- 1:nrow(data)
res <- do.call(rbind,lapply(list1,function(x){
return(data[get_row_indices(sequence=x,v=data$x),])
}))
res
> res
x y row_id
5 1 other_data 5
6 2 other_data 6
7 3 other_data 7
8 4 other_data 8
9 5 other_data 9
13 10 other_data 13
14 11 other_data 14
15 12 other_data 15
16 13 other_data 16
函数 match2
遍历每个 x
值并根据长度为 n 的向量检查它和接下来的 n 个值。然后使用 Reduce
创建索引序列。
match2 <- function(vec) {
start <- which(sapply(1:nrow(data), function(i) all(data$x[i:(i+length(vec)-1)] == vec)))
Reduce(':', c(start,start+length(vec)-1))
}
有了这个,我们可以使用一个应用函数为每个 list1
重复这个过程。
s <- sapply(list1, match2)
data[unlist(s),]
# x y
# 5 1 other_data
# 6 2 other_data
# 7 3 other_data
# 8 4 other_data
# 9 5 other_data
# 13 10 other_data
# 14 11 other_data
# 15 12 other_data
# 16 13 other_data
extract_fun <- function(x, dat){
# Index where the sequences start
ind <- which(dat == x[1])
# Indexes (within dat) where the sequence should be
ind_seq <- lapply(ind, seq, length.out = length(x))
# Extract the values from dat at the position
dat_val <- mapply(`[`, list(dat), ind_seq)
# Check if values within dat == those in list1
i <- which(as.logical(apply(dat_val, 2, all.equal, x))) # which one is equal?
# Return the correct indices
ind_seq[[i]]
}
获取 list1
中每个项目的索引,并将它们合并到所需的索引中
all_ind <- do.call(c, lapply(list1, extract_fun, data$x))
data[all_ind,]
结果:
x y
5 1 other_data
6 2 other_data
7 3 other_data
8 4 other_data
9 5 other_data
13 10 other_data
14 11 other_data
15 12 other_data
16 13 other_data
我正在寻找一种基于数字序列列表从 data.frame 中过滤行的简单方法。
举个例子:
我的初始数据框:
data <- data.frame(x=c(0,1,2,0,1,2,3,4,5,12,2,0,10,11,12,13),y="other_data")
我的清单:
list1 <- list(1:5,10:13)
我的目标是仅保留 "data" 中的行,其中 "list1" 的数字序列与 "data" 的 "x" 列中的数字序列完全相同。 所以输出 data.frame 应该是:
finaldata <- data.frame(x=c(1:5,10:13),y="other_data")
有什么想法吗?
为什么不使用 zoo
中的 rollapply
:
library(zoo)
ind = lapply(list1, function(x) {
n = length(x)
which(rollapply(data$x, n, function(y) all(y==x))) + 0:(n-1)
})
data[unlist(ind),]
#x y
#5 1 other_data
#6 2 other_data
#7 3 other_data
#8 4 other_data
#9 5 other_data
#13 10 other_data
#14 11 other_data
#15 12 other_data
#16 13 other_data
我从一个自定义函数开始获取一个序列的子集,然后使用 lapply 很容易扩展。
#function that takes sequence and a vector
#and returns indices of vector that have complete sequence
get_row_indices<- function(sequence,v){
#get run lengths of whether vector is in sequence
rle_d <- rle(v %in% sequence)
#test if it's complete, so both v in sequence and length of
#matches is length of sequence
select <- rep(length(sequence)==rle_d$lengths &rle_d$values,rle_d$lengths)
return(select)
}
#add row ID to data to show selection
data$row_id <- 1:nrow(data)
res <- do.call(rbind,lapply(list1,function(x){
return(data[get_row_indices(sequence=x,v=data$x),])
}))
res
> res
x y row_id
5 1 other_data 5
6 2 other_data 6
7 3 other_data 7
8 4 other_data 8
9 5 other_data 9
13 10 other_data 13
14 11 other_data 14
15 12 other_data 15
16 13 other_data 16
函数 match2
遍历每个 x
值并根据长度为 n 的向量检查它和接下来的 n 个值。然后使用 Reduce
创建索引序列。
match2 <- function(vec) {
start <- which(sapply(1:nrow(data), function(i) all(data$x[i:(i+length(vec)-1)] == vec)))
Reduce(':', c(start,start+length(vec)-1))
}
有了这个,我们可以使用一个应用函数为每个 list1
重复这个过程。
s <- sapply(list1, match2)
data[unlist(s),]
# x y
# 5 1 other_data
# 6 2 other_data
# 7 3 other_data
# 8 4 other_data
# 9 5 other_data
# 13 10 other_data
# 14 11 other_data
# 15 12 other_data
# 16 13 other_data
extract_fun <- function(x, dat){
# Index where the sequences start
ind <- which(dat == x[1])
# Indexes (within dat) where the sequence should be
ind_seq <- lapply(ind, seq, length.out = length(x))
# Extract the values from dat at the position
dat_val <- mapply(`[`, list(dat), ind_seq)
# Check if values within dat == those in list1
i <- which(as.logical(apply(dat_val, 2, all.equal, x))) # which one is equal?
# Return the correct indices
ind_seq[[i]]
}
获取 list1
中每个项目的索引,并将它们合并到所需的索引中
all_ind <- do.call(c, lapply(list1, extract_fun, data$x))
data[all_ind,]
结果:
x y
5 1 other_data
6 2 other_data
7 3 other_data
8 4 other_data
9 5 other_data
13 10 other_data
14 11 other_data
15 12 other_data
16 13 other_data