通过对数据集中的最后几个月进行分组来计算唯一的泥瓦匠

Calculating unique masons by grouping consecutive trailing months in a dataset

我有一个包含列的数据集:

# Reproducible sample data: 12 registrations spread over August-October 2020.
set.seed(123)
df <- data.frame(
  # Masons are drawn with replacement, so one mason can register several times.
  Mason_Id = sample(paste0("Mason", 1:6), 12, replace = TRUE),
  # Dates are stored as "dd-mm-yyyy" strings; parse them before any date maths.
  Registration_Date = c(
    "01-08-2020", "01-08-2020", "05-08-2020", "07-08-2020",
    "02-09-2020", "02-09-2020", "02-09-2020",
    "03-09-2020", "04-09-2020", "01-10-2020", "02-10-2020",
    "06-10-2020"
  ),
  Token_Count = runif(12, min = 10, max = 100),
  # Spelled out as FALSE: the T/F shorthands are ordinary, reassignable variables.
  stringsAsFactors = FALSE
)

# Calculate the last day of every month.
library(lubridate)
# Registration_Date is a "dd-mm-yyyy" string, so it must be parsed into a Date
# first: format() with a date pattern has no effect on plain character input,
# and days_in_month() needs a real date to pick the right month.
# The result keeps the month-year-day layout of the original expression,
# e.g. "08-2020-31".
df$month_end_date <- local({
  reg_date <- dmy(df$Registration_Date)
  paste0(format(reg_date, "%m-%Y"), "-", days_in_month(reg_date))
})

我需要从 10 月开始逐月向前回溯,统计每个月及其之前两个月(共 3 个月)内的唯一泥瓦匠数量,格式如下:

Registration_Date | Unique_Masons
31-10-2020   |   5(unique masons in Oct,Sep, Aug) 
30-09-2020   |   x1(unique masons in Sep, Aug, July)
31-08-2020   |   x2(unique masons in Aug, July, June)
... and so on.

我试过按季度和按月汇总数据,但对我没有用。 请帮忙。提前致谢。

您可以先求出每条记录所在月份的月末日期,再向前推 3 个月得到窗口的起始日期,然后统计这两个日期之间有多少个不同的 Mason_Id:

library(dplyr)
library(lubridate)

# For each registration month, count the distinct masons registered in the
# three calendar months ending with that month, newest month first.
df %>%
  mutate(
    Registration_Date = dmy(Registration_Date),
    # Last day of the month the registration falls in.
    month_end_date = ceiling_date(Registration_Date, "month") - 1,
    # First day of the three-month window. Anchoring on the first of the month
    # avoids the drift of `month_end_date %m-% months(3) + 1`, which lands on
    # e.g. 31 Aug for a 30 Nov month end and lets stray days into the window.
    three_month = floor_date(month_end_date, "month") %m-% months(2),
    # map2_int() yields a plain integer column instead of a list column.
    Unique_Masons = purrr::map2_int(
      three_month, month_end_date,
      function(window_start, window_end) {
        n_distinct(Mason_Id[between(Registration_Date, window_start, window_end)])
      }
    )
  ) %>%
  distinct(month_end_date, Unique_Masons) %>%
  arrange(desc(month_end_date))

#  month_end_date Unique_Masons
#1     2020-10-31             6
#2     2020-09-30             5
#3     2020-08-31             3

基础 R 解决方案:

# Parse the "dd-mm-yyyy" registration strings into Dates and add Month_Vec,
# the first day of each registration's month.
clean_df <- transform(
  df,
  # Replace the leading two-digit day with "01". The backslash has to be
  # doubled: "\d" on its own is an unrecognised escape in an R string literal
  # and stops the script from parsing.
  Month_Vec = as.Date(
    sub("^\\d{2}", "01", Registration_Date),
    format = "%d-%m-%Y"
  ),
  Registration_Date = as.Date(Registration_Date, format = "%d-%m-%Y")
)

# Kept so that any later code reading `drng` still finds it; the month-end
# lookup below no longer depends on it.
drng <- range(clean_df$Month_Vec) + 31

# Attach the end-of-month date (`eom`) to every registration.
# Each month end is derived from its own month start: a first-of-month date
# plus 31 days always falls in the following month, so snapping that to the
# 1st and stepping back one day gives the last day of the original month.
# This stays correct for 28/29/30-day months and when some months have no
# registrations at all.
eom_df <- merge(
  clean_df,
  local({
    month_starts <- sort(unique(clean_df$Month_Vec))
    data.frame(
      eom = as.Date(format(month_starts + 31, "%Y-%m-01")) - 1,
      Month_Vec = month_starts
    )
  }),
  by = "Month_Vec",
  all.x = TRUE
)

# For every month present in the data, build a one-row data frame holding the
# month-end date and the number of distinct masons registered in the three
# calendar months ending with that month. Returns a list of data frames.
lapply(unique(eom_df$Month_Vec),
  function(x) {
    # First day of the window. Both bounds are inclusive, so stepping back two
    # months covers exactly three months (e.g. Aug, Sep, Oct for October);
    # stepping back three would pull in a fourth month.
    lower_lim <- seq(x, length.out = 2, by = "-2 months")[2]
    sbst <- subset(eom_df, Month_Vec >= lower_lim & Month_Vec <= x)
    # Abbreviated names of the months that actually contributed rows.
    months_seen <- unique(month.abb[as.integer(format(sbst$Month_Vec, "%m"))])
    data.frame(
      Registration_Date = max(sbst$eom),
      Unique_Masons = paste0(
        length(unique(sbst$Mason_Id)),
        "(Unique Masons in ",
        paste0(months_seen, collapse = ", "),
        ")"
      )
    )
  }
)