在 R 中查找与两个不同子 ID 关联的所有父 ID
Find all parent ids that are associated with two distinct child ids in r
我试图在 R 中找到一种可以快速处理大型数据集的过滤方法。我的数据有 Parent_ID
。每个 Parent_ID
可以有多行,每行有一个 Child_ID
.
我有一些数据,我想找到与 Child_ID = 1
和 Child_ID = 2
相关的所有 Parent_IDs
,例如:
parent_id child_id
[1,] 1 1
[2,] 1 1
[3,] 1 2
[4,] 2 1
[5,] 2 2
[6,] 3 3
[7,] 3 1
[8,] 3 4
[9,] 4 4
[10,] 4 2
在上面的示例中,它应该返回 parent_ids 1,2
因为这两个 Parent_IDs
同时具有 child_id = 1
和 child_id = 2
.
parent_id = 3
未被返回,因为它有 child_id=1
但没有 child_id=2
。 parent_id=4
没有被返回,因为它有 child_id=2
但没有 child_id=1
.
我目前的方法是遍历每个 parent_id
,然后搜索 child_ids
,但我正在寻找更快的方法。
# Define the child_ids that every matching parent must have.
wanted <- c(1,2)
# Collect the distinct parent ids once, up front.
uniq_parents <- unique(df$parent_id)
# For each parent, test whether all wanted child_ids occur among its rows.
# seq_along() avoids the 1:length() trap on empty input (1:0 iterates over
# c(1, 0)), and vapply() avoids growing the result with c() inside a loop.
has_all_wanted <- vapply(
  seq_along(uniq_parents),
  function(i) {
    child_ids <- df$child_id[df$parent_id == uniq_parents[i]]
    all(wanted %in% child_ids)
  },
  logical(1)
)
# uniq_parents is already duplicate-free, so no further unique() is needed.
match <- uniq_parents[has_all_wanted]
> match
[1] 1 2
# Parents that have child 1, intersected with parents that have child 2.
with(df, intersect(parent_id[child_id == 1], parent_id[child_id == 2]))
您想要的是同时拥有子 ID “1” 和 “2” 的 parent_ids,也就是拥有子 ID “1” 的父 ID 与拥有子 ID “2” 的父 ID 的交集。
这是将其推广到任意数量 children:
的另一种实现
#' Find parent ids that are linked to every requested child id
#'
#' @param df A data frame with columns `parent_id` and `child_id`; any other
#'   columns are ignored.
#' @param child_ids Vector of child ids that a parent must all have.
#' @return The parent ids, in ascending order, that have at least one row for
#'   each distinct value in `child_ids`.
find_parent_ids_for_children <- function(df, child_ids) {
  # Repeated ids in the request would inflate the target count below.
  child_ids <- unique(child_ids)
  # De-duplicate on the two key columns only, so repeated (parent, child)
  # pairs that differ in some other column are still counted once.
  pairs <- unique(df[, c("parent_id", "child_id"), drop = FALSE])
  pairs <- pairs[pairs$child_id %in% child_ids, , drop = FALSE]
  # Every remaining row is a distinct wanted child of its parent, so a parent
  # qualifies when its run length equals the number of wanted children.
  pids <- rle(sort(pairs$parent_id))
  pids$values[pids$lengths == length(child_ids)]
}
# Example data: ten (parent, child) rows spread over four parents.
df0 <- data.frame(
  parent_id = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4),
  child_id = c(1, 1, 2, 1, 2, 3, 1, 4, 4, 2)
)
# Expected console output is shown in the comment under each call.
find_parent_ids_for_children(df0, 1:2)
# [1] 1 2
find_parent_ids_for_children(df0, 1:3)
# numeric(0)
find_parent_ids_for_children(df0, 2:3)
# numeric(0)
find_parent_ids_for_children(df0, c(1, 3))
# [1] 3
我也会在 dplyr 中添加一个答案,这对过滤等非常有用
# dplyr supplies %>%, group_by(), filter() and ungroup() used below.
library(dplyr)
# create data frame
dat <- data.frame(
  parent_id = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4),
  child_id = c(1, 1, 2, 1, 2, 3, 1, 4, 4, 2)
)
# Keep every row of each parent that has both child 1 and child 2.
# Each any() yields a single TRUE/FALSE per group, which filter() applies to
# all rows of that group.
result_parents <- dat %>%
  group_by(parent_id) %>%
  filter(any(child_id == 1) & any(child_id == 2)) %>%
  ungroup()  # drop the grouping so later verbs do not run per parent
# if you really only want the parent_id which match your criteria
unique(result_parents$parent_id)
这里有 data.table
和 dplyr
的解决方案。我很想看看它们在更大数据上的表现如何。
library(dplyr)
# Keep the parents whose rows cover every id in `wanted`, collapse to one row
# per parent (summarize() with no expressions returns just the group keys),
# then extract the ids as a plain vector.
df %>%
  group_by(parent_id) %>%
  filter(all(wanted %in% child_id)) %>%
  summarize() %>%
  pull(parent_id)
library(data.table)
dt <- as.data.table(df)
setkey(dt, parent_id)
# Step 1: per parent, keep all of its rows only if it has every wanted child.
# Step 2: reduce to one row per surviving parent and read off the ids.
dt[
  , .SD[all(wanted %in% child_id), ], by = parent_id
][
  , .I[1], by = parent_id
]$parent_id
使用此示例数据:
# Sample data from the question. The header line has one field fewer than the
# data lines, so read.table() uses the first field ("[1,]", ...) as row names.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
df <- read.table(text = ' parent_id child_id
[1,] 1 1
[2,] 1 1
[3,] 1 2
[4,] 2 1
[5,] 2 2
[6,] 3 3
[7,] 3 1
[8,] 3 4
[9,] 4 4
[10,] 4 2', header = TRUE)
# The child ids every returned parent must have.
wanted <- c(1,2)
基准测试
我运行了一个基准测试,结果让我很惊讶。如果您在 wanted
中有 2 个子 ID,intersect
方法是迄今为止最快、内存效率最高且代码最简单的方法。
# A tibble: 5 × 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <dbl> <dbl> <dbl> <bch:byt> <dbl> <int> <dbl> <dbl>
1 intersect 0.333 0.373 1418. 1.54MB 27.7 716 14 505.
2 dplyr_em 12.3 12.8 74.2 3.1MB 11.7 38 6 512.
3 dplyr_gregor 12.7 13.2 65.3 5.05MB 9.89 33 5 505.
4 rle 46.1 54.4 18.6 9.87MB 13.0 10 7 538.
5 data.table 213. 215. 4.64 83.26MB 10.8 3 7 647.
# … with 4 more variables: result <list>, memory <list>, time <list>, gc <list>
基准代码:
# Reproducible benchmark data: n rows, parent ids drawn from 1..n/20 and
# child ids from 1..n/500. parent_id is sampled before child_id so the random
# stream is consumed in the same order as before.
set.seed(47)
n <- 1e5
df <- data.frame(
  parent_id = sample(1:(n / 20), size = n, replace = TRUE),
  child_id = sample(1:(n / 500), size = n, replace = TRUE)
)
df <- arrange(df, parent_id, child_id)
wanted <- c(1, 2)
# Keyed data.table copy of the same data for the data.table candidate.
dt <- as.data.table(df)
setkey(dt, parent_id)
# Run the data.table query once before benchmarking
# (presumably a sanity check; its value is not stored).
dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id
#' Find parent ids that are linked to every requested child id
#'
#' @param df A data frame with columns `parent_id` and `child_id`; any other
#'   columns are ignored.
#' @param child_ids Vector of child ids that a parent must all have.
#' @return The parent ids, in ascending order, that have at least one row for
#'   each distinct value in `child_ids`.
find_parent_ids_for_children <- function(df, child_ids) {
  # Repeated ids in the request would inflate the target count below.
  child_ids <- unique(child_ids)
  # De-duplicate on the two key columns only, so repeated (parent, child)
  # pairs that differ in some other column are still counted once.
  pairs <- unique(df[, c("parent_id", "child_id"), drop = FALSE])
  pairs <- pairs[pairs$child_id %in% child_ids, , drop = FALSE]
  # Every remaining row is a distinct wanted child of its parent, so a parent
  # qualifies when its run length equals the number of wanted children.
  pids <- rle(sort(pairs$parent_id))
  pids$values[pids$lengths == length(child_ids)]
}
# Time five ways of finding the parents that have every id in `wanted`.
# Timings are reported in milliseconds and the result rows are sorted by
# median time.
bench::mark(
# base R: parents of wanted[1] intersected with parents of wanted[2]
intersect = intersect(df[df$child_id == wanted[1], 'parent_id'], df[df$child_id == wanted[2], 'parent_id']),
# base R: the rle()-based helper defined above
rle = find_parent_ids_for_children(df, wanted),
# dplyr: grouped filter using all(wanted %in% child_id)
dplyr_gregor = {
df %>%
group_by(parent_id) %>%
filter(all(wanted %in% child_id)) %>%
pull(parent_id) %>%
unique()
},
# data.table: per-parent .SD subset, then one row per remaining parent
data.table = dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id,
# dplyr: grouped filter with one any() test per wanted id (two ids only)
dplyr_em = {
df %>%
group_by(parent_id) %>%
filter(any(child_id == wanted[1]) & any(child_id == wanted[2])) %>%
pull(parent_id) %>%
unique()
},
time_unit = "ms"
) %>% arrange(median)
我试图在 R 中找到一种可以快速处理大型数据集的过滤方法。我的数据有 Parent_ID
。每个 Parent_ID
可以有多行,每行有一个 Child_ID
.
我有一些数据,我想找到与 Child_ID = 1
和 Child_ID = 2
相关的所有 Parent_IDs
,例如:
parent_id child_id
[1,] 1 1
[2,] 1 1
[3,] 1 2
[4,] 2 1
[5,] 2 2
[6,] 3 3
[7,] 3 1
[8,] 3 4
[9,] 4 4
[10,] 4 2
在上面的示例中,它应该返回 parent_ids 1,2
因为这两个 Parent_IDs
同时具有 child_id = 1
和 child_id = 2
.
parent_id = 3
未被返回,因为它有 child_id=1
但没有 child_id=2
。 parent_id=4
没有被返回,因为它有 child_id=2
但没有 child_id=1
.
我目前的方法是遍历每个 parent_id
,然后搜索 child_ids
,但我正在寻找更快的方法。
# Define the child_ids that every matching parent must have.
wanted <- c(1,2)
# Collect the distinct parent ids once, up front.
uniq_parents <- unique(df$parent_id)
# For each parent, test whether all wanted child_ids occur among its rows.
# seq_along() avoids the 1:length() trap on empty input (1:0 iterates over
# c(1, 0)), and vapply() avoids growing the result with c() inside a loop.
has_all_wanted <- vapply(
  seq_along(uniq_parents),
  function(i) {
    child_ids <- df$child_id[df$parent_id == uniq_parents[i]]
    all(wanted %in% child_ids)
  },
  logical(1)
)
# uniq_parents is already duplicate-free, so no further unique() is needed.
match <- uniq_parents[has_all_wanted]
> match
[1] 1 2
# Parents that have child 1, intersected with parents that have child 2.
with(df, intersect(parent_id[child_id == 1], parent_id[child_id == 2]))
您想要的是同时拥有子 ID “1” 和 “2” 的 parent_ids,也就是拥有子 ID “1” 的父 ID 与拥有子 ID “2” 的父 ID 的交集。
这是将其推广到任意数量 children:
的另一种实现
#' Find parent ids that are linked to every requested child id
#'
#' @param df A data frame with columns `parent_id` and `child_id`; any other
#'   columns are ignored.
#' @param child_ids Vector of child ids that a parent must all have.
#' @return The parent ids, in ascending order, that have at least one row for
#'   each distinct value in `child_ids`.
find_parent_ids_for_children <- function(df, child_ids) {
  # Repeated ids in the request would inflate the target count below.
  child_ids <- unique(child_ids)
  # De-duplicate on the two key columns only, so repeated (parent, child)
  # pairs that differ in some other column are still counted once.
  pairs <- unique(df[, c("parent_id", "child_id"), drop = FALSE])
  pairs <- pairs[pairs$child_id %in% child_ids, , drop = FALSE]
  # Every remaining row is a distinct wanted child of its parent, so a parent
  # qualifies when its run length equals the number of wanted children.
  pids <- rle(sort(pairs$parent_id))
  pids$values[pids$lengths == length(child_ids)]
}
# Example data: ten (parent, child) rows spread over four parents.
df0 <- data.frame(
  parent_id = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4),
  child_id = c(1, 1, 2, 1, 2, 3, 1, 4, 4, 2)
)
# Expected console output is shown in the comment under each call.
find_parent_ids_for_children(df0, 1:2)
# [1] 1 2
find_parent_ids_for_children(df0, 1:3)
# numeric(0)
find_parent_ids_for_children(df0, 2:3)
# numeric(0)
find_parent_ids_for_children(df0, c(1, 3))
# [1] 3
我也会在 dplyr 中添加一个答案,这对过滤等非常有用
# dplyr supplies %>%, group_by(), filter() and ungroup() used below.
library(dplyr)
# create data frame
dat <- data.frame(
  parent_id = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4),
  child_id = c(1, 1, 2, 1, 2, 3, 1, 4, 4, 2)
)
# Keep every row of each parent that has both child 1 and child 2.
# Each any() yields a single TRUE/FALSE per group, which filter() applies to
# all rows of that group.
result_parents <- dat %>%
  group_by(parent_id) %>%
  filter(any(child_id == 1) & any(child_id == 2)) %>%
  ungroup()  # drop the grouping so later verbs do not run per parent
# if you really only want the parent_id which match your criteria
unique(result_parents$parent_id)
这里有 data.table
和 dplyr
的解决方案。我很想看看它们在更大数据上的表现如何。
library(dplyr)
# Keep the parents whose rows cover every id in `wanted`, collapse to one row
# per parent (summarize() with no expressions returns just the group keys),
# then extract the ids as a plain vector.
df %>%
  group_by(parent_id) %>%
  filter(all(wanted %in% child_id)) %>%
  summarize() %>%
  pull(parent_id)
library(data.table)
dt <- as.data.table(df)
setkey(dt, parent_id)
# Step 1: per parent, keep all of its rows only if it has every wanted child.
# Step 2: reduce to one row per surviving parent and read off the ids.
dt[
  , .SD[all(wanted %in% child_id), ], by = parent_id
][
  , .I[1], by = parent_id
]$parent_id
使用此示例数据:
# Sample data from the question. The header line has one field fewer than the
# data lines, so read.table() uses the first field ("[1,]", ...) as row names.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
df <- read.table(text = ' parent_id child_id
[1,] 1 1
[2,] 1 1
[3,] 1 2
[4,] 2 1
[5,] 2 2
[6,] 3 3
[7,] 3 1
[8,] 3 4
[9,] 4 4
[10,] 4 2', header = TRUE)
# The child ids every returned parent must have.
wanted <- c(1,2)
基准测试
我运行了一个基准测试,结果让我很惊讶。如果您在 wanted
中有 2 个子 ID,intersect
方法是迄今为止最快、内存效率最高且代码最简单的方法。
# A tibble: 5 × 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
<bch:expr> <dbl> <dbl> <dbl> <bch:byt> <dbl> <int> <dbl> <dbl>
1 intersect 0.333 0.373 1418. 1.54MB 27.7 716 14 505.
2 dplyr_em 12.3 12.8 74.2 3.1MB 11.7 38 6 512.
3 dplyr_gregor 12.7 13.2 65.3 5.05MB 9.89 33 5 505.
4 rle 46.1 54.4 18.6 9.87MB 13.0 10 7 538.
5 data.table 213. 215. 4.64 83.26MB 10.8 3 7 647.
# … with 4 more variables: result <list>, memory <list>, time <list>, gc <list>
基准代码:
# Reproducible benchmark data: n rows, parent ids drawn from 1..n/20 and
# child ids from 1..n/500. parent_id is sampled before child_id so the random
# stream is consumed in the same order as before.
set.seed(47)
n <- 1e5
df <- data.frame(
  parent_id = sample(1:(n / 20), size = n, replace = TRUE),
  child_id = sample(1:(n / 500), size = n, replace = TRUE)
)
df <- arrange(df, parent_id, child_id)
wanted <- c(1, 2)
# Keyed data.table copy of the same data for the data.table candidate.
dt <- as.data.table(df)
setkey(dt, parent_id)
# Run the data.table query once before benchmarking
# (presumably a sanity check; its value is not stored).
dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id
#' Find parent ids that are linked to every requested child id
#'
#' @param df A data frame with columns `parent_id` and `child_id`; any other
#'   columns are ignored.
#' @param child_ids Vector of child ids that a parent must all have.
#' @return The parent ids, in ascending order, that have at least one row for
#'   each distinct value in `child_ids`.
find_parent_ids_for_children <- function(df, child_ids) {
  # Repeated ids in the request would inflate the target count below.
  child_ids <- unique(child_ids)
  # De-duplicate on the two key columns only, so repeated (parent, child)
  # pairs that differ in some other column are still counted once.
  pairs <- unique(df[, c("parent_id", "child_id"), drop = FALSE])
  pairs <- pairs[pairs$child_id %in% child_ids, , drop = FALSE]
  # Every remaining row is a distinct wanted child of its parent, so a parent
  # qualifies when its run length equals the number of wanted children.
  pids <- rle(sort(pairs$parent_id))
  pids$values[pids$lengths == length(child_ids)]
}
# Time five ways of finding the parents that have every id in `wanted`.
# Timings are reported in milliseconds and the result rows are sorted by
# median time.
bench::mark(
# base R: parents of wanted[1] intersected with parents of wanted[2]
intersect = intersect(df[df$child_id == wanted[1], 'parent_id'], df[df$child_id == wanted[2], 'parent_id']),
# base R: the rle()-based helper defined above
rle = find_parent_ids_for_children(df, wanted),
# dplyr: grouped filter using all(wanted %in% child_id)
dplyr_gregor = {
df %>%
group_by(parent_id) %>%
filter(all(wanted %in% child_id)) %>%
pull(parent_id) %>%
unique()
},
# data.table: per-parent .SD subset, then one row per remaining parent
data.table = dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id,
# dplyr: grouped filter with one any() test per wanted id (two ids only)
dplyr_em = {
df %>%
group_by(parent_id) %>%
filter(any(child_id == wanted[1]) & any(child_id == wanted[2])) %>%
pull(parent_id) %>%
unique()
},
time_unit = "ms"
) %>% arrange(median)