使用过滤器查找每组的唯一性

Finding Unique per group with filter

我的数据是这样的:

   date schedule_id food_truck_id building_id   truck_status last_confirmed_date dsle
 2018-04-26         422            58          30 accepted_event                   0   31
 2018-04-26         422            59          30 accepted_event          2018-02-27   11
 2018-04-26         422            65          30 accepted_event          2018-03-15   12
 2018-04-26         422            88          30 accepted_event          2018-02-20    7
 2018-04-26         422            89          30 accepted_event          2018-03-22   13
 2018-04-26         422           101          30 accepted_event          2018-02-06   16
 2018-04-26         422           120          30 accepted_event          2018-03-06   14
 2018-04-26         422           135          30 accepted_event          2018-03-13   21
 2018-04-26         399            42          33 accepted_event          2018-03-15    8
 2018-04-26         399            58          33 accepted_event                   0   31
 2018-04-26         399            59          33 accepted_event          2018-03-01   11
 2018-04-26         399            65          33 accepted_event          2018-02-27   12
 2018-04-26         399            88          33 accepted_event                   

可以使用以下方式复制:

structure(list(date = structure(c(17647, 17647, 17647, 17647, 
17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647, 
17647, 17647, 17647, 17647), class = "Date"), schedule_id = c(422L, 
422L, 422L, 422L, 422L, 422L, 422L, 422L, 399L, 399L, 399L, 399L, 
399L, 399L, 399L, 399L, 399L), food_truck_id = c(58L, 59L, 65L, 
88L, 89L, 101L, 120L, 135L, 42L, 58L, 59L, 65L, 88L, 89L, 101L, 
120L, 135L), building_id = c(30L, 30L, 30L, 30L, 30L, 30L, 30L, 
30L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L), truck_status = c("accepted_event", 
"accepted_event", "accepted_event", "accepted_event", "accepted_event", 
"accepted_event", "accepted_event", "accepted_event", "accepted_event", 
"accepted_event", "accepted_event", "accepted_event", "accepted_event", 
"accepted_event", "accepted_event", "accepted_event", "accepted_event"
), last_confirmed_date = c("0", "2018-02-27", "2018-03-15", "2018-02-20", 
"2018-03-22", "2018-02-06", "2018-03-06", "2018-03-13", "2018-03-15", 
"0", "2018-03-01", "2018-02-27", "0", "2018-03-06", "2018-03-13", 
"0", "2018-02-22"), dsle = c(31, 11, 12, 7, 13, 16, 14, 21, 8, 
31, 11, 12, 7, 13, 16, 14, 21)), .Names = c("date", "schedule_id", 
"food_truck_id", "building_id", "truck_status", "last_confirmed_date", 
"dsle"), row.names = c(142L, 223L, 379L, 455L, 495L, 589L, 806L, 
877L, 63L, 155L, 215L, 287L, 452L, 483L, 667L, 809L, 894L), class = "data.frame")

我的目标是仅 select 基于 max(dsle) 的 food_truck_id 但每个日期应该是唯一的。例如,对于schedule_id 422,food_truck_id with max(dsle) 是58,对于schedule_id 399也是58。

我想要的是,假设422是58,但是399应该是下一个max(dsle)而不是58。

我已经尝试了以下方法,但它没有提供我想要的。

testxx %>% 
  group_by(schedule_id) %>% 
  distinct(food_truck_id, date, dsle) %>% 
  filter(dsle == max(dsle))

我要的结果如下

date         schedule_id        food_truck_id    
2018-04-26     422                   58
2018-04-26     399                   135

因为58旁边的135有max(dsle)

p<-df %>% arrange(desc(schedule_id), desc(dsle)) %>% slice(1) %>% select(date,dsle,schedule_id,food_truck_id) 

df %>% subset(!(schedule_id%in%c(p))) %>% subset(!(dsle%in%c(p))) %>% select(date,dsle,schedule_id,food_truck_id) %>% arrange(desc(dsle)) %>% slice(1) %>% 
  rbind(p,.) %>% select(-dsle)

输出

# A tibble: 2 x 3
  date       schedule_id food_truck_id
  <date>           <int>         <int>
1 2018-04-26         422            58
2 2018-04-26         399           135

更新说明日期

这可能是循环是 best/easiest 解决方案的情况之一。

但是,它在循环中做了一个join操作,所以可以做一些优化

我们的想法是遍历每个 schedule_id,并跟踪哪个 food_trucks 已在哪个日期使用。

如果我们在循环之前处理一些 pre-arranging 数据,事情会变得更容易

df <- df %>%
    arrange(schedule_id, -dsle)

## pre-allocate a result data.frame
ids <- unique(df$schedule_id)
df_res <- data.frame(schedule_id = ids,
                food_truck_id = NA)

usedTrucks <- data.frame(date = as.Date(NA), 
                schedule_id = ids, 
                food_truck_id = NA_integer_)
counter <- 1

for(i in ids) {

    possibleTrucks <- df[df$schedule_id %in% i, c("date", "food_truck_id")]
    ## possible Trucks will be in order, as we have pre-arranged the data

    ## use the first one that hasn't already been used
    ## on the given date
    possibleTrucks <- anti_join(possibleTrucks, usedTrucks, by = c("date", "food_truck_id"))
    thisTruck <- possibleTrucks[1, c("food_truck_id", "date")]

    df_res[counter, 'food_truck_id'] <- thisTruck$food_truck_id

    usedTrucks[counter, "food_truck_id"] <- thisTruck$food_truck_id
    usedTrucks[counter, "date"] <- thisTruck$date

    counter <- counter + 1
}

df_res
#   schedule_id food_truck_id
# 1         399            58
# 2         422           135

如果在较大的数据集上速度是一个问题,这可以在 Rcpp 中 re-written 以使其更快。