在 r 中查找与两个不同子 ID 关联的所有父 ID

Find all parent ids that are associated with two distinct child ids in r

我试图在 R 中找到一种可以快速处理大型数据集的过滤方法。我的数据有 Parent_ID。每个 Parent_ID 可以有多行,每行有一个 Child_ID.

我有一些数据,我想找到同时与 Child_ID = 1 和 Child_ID = 2 相关的所有 Parent_ID,例如:

      parent_id child_id
 [1,]         1        1
 [2,]         1        1
 [3,]         1        2
 [4,]         2        1
 [5,]         2        2
 [6,]         3        3
 [7,]         3        1
 [8,]         3        4
 [9,]         4        4
[10,]         4        2

在上面的示例中,它应该返回 parent_id 1 和 2,因为这两个 Parent_ID 同时具有 child_id = 1 和 child_id = 2。

parent_id = 3 不会被返回,因为它有 child_id = 1 但没有 child_id = 2;parent_id = 4 也不会被返回,因为它有 child_id = 2 但没有 child_id = 1。

我目前的方法是遍历每个 parent_id,然后搜索 child_ids,但我正在寻找更快的方法。

# Child ids that every returned parent must be linked to.
wanted <- c(1, 2)
# Candidate parents, in order of first appearance in df.
uniq_parents <- unique(df$parent_id)
# For each parent, check whether its child ids cover every value in `wanted`.
# vapply() replaces the original index loop: it does not grow the result with
# c() on every hit, and it avoids 1:length(x), which yields c(1, 0) when x is
# empty.
has_all_wanted <- vapply(
  uniq_parents,
  function(pid) all(wanted %in% df$child_id[df$parent_id == pid]),
  logical(1)
)
# Keep only the parents that passed the check.
# NOTE(review): the name `match` masks base::match() for value lookups; it is
# kept because the console output that follows prints `match`.
match <- uniq_parents[has_all_wanted]

> match
[1] 1 2
# Parents linked to child 1, intersected with parents linked to child 2.
intersect(df$parent_id[df$child_id == 1], df$parent_id[df$child_id == 2])

您想要的是同时拥有子 ID “1” 和 “2” 的 parent_id,也就是“拥有子 ID 1 的父 ID”与“拥有子 ID 2 的父 ID”的交集。

下面是另一种实现,可以将其推广到任意数量的子 ID:

# Find the parent ids that are linked to every one of the given child ids.
#
# df:        data frame with `parent_id` and `child_id` columns; any other
#            columns are ignored.
# child_ids: vector of child ids that a parent must have; repeated values
#            are treated as a single requirement.
#
# Returns the matching parent ids in sorted order (empty if none match).
find_parent_ids_for_children <- function(df, child_ids) {
  # Repeated ids would inflate length(child_ids) and make the count check
  # below impossible to satisfy.
  child_ids <- unique(child_ids)
  # De-duplicate on the id columns only, so rows that differ in some other
  # column do not count the same (parent, child) pair twice.
  pairs <- unique(df[, c("parent_id", "child_id"), drop = FALSE])
  pairs <- pairs[pairs$child_id %in% child_ids, , drop = FALSE]
  # Each remaining row is a distinct wanted child of its parent, so the run
  # length of a parent id equals how many wanted children it has.
  pids <- rle(sort(pairs$parent_id))

  pids$values[pids$lengths == length(child_ids)]
}

# Example data: ten (parent_id, child_id) rows, including one repeated pair.
df0 <- data.frame(
  parent_id = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4),
  child_id = c(1, 1, 2, 1, 2, 3, 1, 4, 4, 2)
)

# Parents 1 and 2 each have both child 1 and child 2.
find_parent_ids_for_children(df0, 1:2)
# [1] 1 2

# No parent has all of children 1, 2 and 3.
find_parent_ids_for_children(df0, 1:3)
# numeric(0)

# No parent has both child 2 and child 3.
find_parent_ids_for_children(df0, 2:3)
# numeric(0)
# Only parent 3 has both child 1 and child 3.
find_parent_ids_for_children(df0, c(1, 3))
# [1] 3

我也会在 dplyr 中添加一个答案,这对过滤等非常有用

# `%>%`, group_by(), filter() and ungroup() below come from dplyr.
library(dplyr)

# Example data: one row per (parent_id, child_id) link.
dat <- data.frame(
  parent_id = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4),
  child_id = c(1, 1, 2, 1, 2, 3, 1, 4, 4, 2)
)

# Keep every row of the parents that have both child 1 and child 2.
# Each any() collapses a per-group comparison to a single TRUE/FALSE, so the
# filter keeps or drops whole groups. ungroup() is added so later operations
# on result_parents are not silently applied per parent.
result_parents <- dat %>%
  group_by(parent_id) %>%
  filter(any(child_id == 1) & any(child_id == 2)) %>%
  ungroup()

# If only the matching parent ids are needed, drop the repeated values.
unique(result_parents$parent_id)

这里有 data.table 和 dplyr 的解决方案。我很想看看它们在更大数据上的表现如何。

library(dplyr)
# Group rows by parent, keep only the groups whose child ids include every
# value in `wanted`, then extract the parent_id column as a plain vector.
df %>%
  group_by(parent_id) %>%
  # all(wanted %in% child_id) is a single TRUE/FALSE per group, so whole
  # groups are kept or dropped.
  filter(all(wanted %in% child_id)) %>%
  # NOTE(review): presumably collapses each surviving group to one row holding
  # its grouping key -- confirm against the installed dplyr version.
  summarize(cur_group()) %>%
  pull(parent_id)
library(data.table)
# Convert df to a data.table keyed on parent_id.
dt <- as.data.table(df)
setkey(dt, parent_id)
# Step 1 keeps the rows of a parent only when its child ids cover `wanted`.
# Step 2 takes one row index per remaining parent, and `$parent_id` extracts
# the surviving parent ids.
dt[
  , .SD[all(wanted %in% child_id), ],
  by = parent_id
][
  , .I[1],
  by = parent_id
]$parent_id

使用此示例数据:

# Example data pasted from the printed matrix in the question. The header row
# has one field fewer than the data rows, so read.table() takes the first
# field of each data row ("[1,]", "[2,]", ...) as the row name.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
df <- read.table(text = "     parent_id child_id
 [1,]         1        1
 [2,]         1        1
 [3,]         1        2
 [4,]         2        1
 [5,]         2        2
 [6,]         3        3
 [7,]         3        1
 [8,]         3        4
 [9,]         4        4
[10,]         4        2", header = TRUE)

# Child ids that every returned parent must be linked to.
wanted <- c(1, 2)

基准测试

我运行了一个基准测试,结果让我很惊讶。如果 wanted 中只有 2 个子 ID,intersect 方法是迄今为止最快、内存效率最高且代码最简单的方法。

# A tibble: 5 × 13
  expression       min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
  <bch:expr>     <dbl>   <dbl>     <dbl> <bch:byt>    <dbl> <int> <dbl>      <dbl>
1 intersect      0.333   0.373   1418.      1.54MB    27.7    716    14       505.
2 dplyr_em      12.3    12.8       74.2      3.1MB    11.7     38     6       512.
3 dplyr_gregor  12.7    13.2       65.3     5.05MB     9.89    33     5       505.
4 rle           46.1    54.4       18.6     9.87MB    13.0     10     7       538.
5 data.table   213.    215.         4.64   83.26MB    10.8      3     7       647.
# … with 4 more variables: result <list>, memory <list>, time <list>, gc <list>

基准代码:

# Benchmark data: n random (parent_id, child_id) rows with a fixed seed so the
# run is reproducible. parent_id is sampled before child_id, as in the
# original, so the generated data is unchanged.
set.seed(47)
n <- 1e5
df <- data.frame(
  parent_id = sample(1:(n / 20), size = n, replace = TRUE),
  child_id = sample(1:(n / 500), size = n, replace = TRUE)
)
# Sort rows by parent, then by child within each parent.
df <- arrange(df, parent_id, child_id)

# Child ids that every returned parent must be linked to.
wanted <- c(1, 2)

# data.table copy of the same data, keyed on parent_id, for the data.table
# variant of the benchmark.
dt <- as.data.table(df)
setkey(dt, parent_id)
# One-off run of the data.table expression before it is benchmarked.
dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id

# Find the parent ids that are linked to every one of the given child ids.
#
# df:        data frame with `parent_id` and `child_id` columns; any other
#            columns are ignored.
# child_ids: vector of child ids that a parent must have; repeated values
#            are treated as a single requirement.
#
# Returns the matching parent ids in sorted order (empty if none match).
find_parent_ids_for_children <- function(df, child_ids) {
  # Repeated ids would inflate length(child_ids) and make the count check
  # below impossible to satisfy.
  child_ids <- unique(child_ids)
  # De-duplicate on the id columns only, so rows that differ in some other
  # column do not count the same (parent, child) pair twice.
  pairs <- unique(df[, c("parent_id", "child_id"), drop = FALSE])
  pairs <- pairs[pairs$child_id %in% child_ids, , drop = FALSE]
  # Each remaining row is a distinct wanted child of its parent, so the run
  # length of a parent id equals how many wanted children it has.
  pids <- rle(sort(pairs$parent_id))

  pids$values[pids$lengths == length(child_ids)]
}


# Benchmark the five approaches on the same `df` / `dt` / `wanted`, then sort
# the result rows by the `median` column.
bench::mark(
  # Base R: parents of the first wanted child intersected with parents of the
  # second wanted child (handles exactly two wanted ids).
  intersect = intersect(df[df$child_id == wanted[1], 'parent_id'], df[df$child_id == wanted[2], 'parent_id']),
  # Base R: run-length count of wanted children per parent.
  rle = find_parent_ids_for_children(df, wanted),
  # dplyr: keep groups whose child ids cover all of `wanted`.
  dplyr_gregor = {
    df %>%
      group_by(parent_id) %>%
      filter(all(wanted %in% child_id)) %>%
      pull(parent_id) %>%
      unique()
  },
  # data.table: per-parent subset of .SD, then one row index per parent.
  data.table = dt[, .SD[all(wanted %in% child_id), ], by = parent_id][, .I[1], by = parent_id]$parent_id,
  # dplyr: keep groups that contain each of the two wanted ids.
  dplyr_em = {
    df %>%
      group_by(parent_id) %>%
      filter(any(child_id == wanted[1]) & any(child_id == wanted[2])) %>%
      pull(parent_id) %>%
      unique()
  },
  time_unit = "ms"
) %>% arrange(median)