转换数据
Transforming data
我有一份关于人们购买某种产品(假设是肥皂)的交易数据,想了解肥皂的购买强度如何随时间变化。我把强度定义为平均每天使用的肥皂数量,并假设一个人再次购买时,他/她之前购买的肥皂已经用完。另外,最好还能为每个消费者删除异常值(超出该消费者个人平均值 ± 2 × 该消费者个人标准差的值),并删除最后一次购买之后的信息。
当前数据框如下所示:
# Raw purchase log: one row per purchase, giving the client, the purchase
# date as a "YYYY-MM-DD" string, and the number of soaps bought.
transacrions <- data.frame(
  Client_ID = c(1, 2, 1, 3, 4, 1, 3, 2, 1),
  date = c(
    "2017-01-01", "2017-01-01", "2017-01-02",
    "2017-01-03", "2017-01-04", "2017-01-05",
    "2017-01-06", "2017-01-09", "2017-01-10"
  ),
  soaps_bought = c(1, 12, 2, 19, 20, 10, 32, 12, 11)
)
我认为需要几个步骤才能得到我想要的表。第一步是用 NA "填充"每个消费者没有购买的日期:
# Hand-built illustration of step one: every client gets one row per day
# from 2017-01-01 to 2017-01-10, with NA on days without a purchase.
partial_results <- data.frame(
  Client_ID = rep(seq_len(4), each = 10),
  date = rep(
    seq(as.Date("2017-01-01"), as.Date("2017-01-10"), by = "day"),
    times = 4
  ),
  soaps_bought = c(
    1, 2, rep(NA, 2), 10, rep(NA, 4), 11,        # client 1
    12, rep(NA, 7), 12, NA,                      # client 2
    rep(NA, 2), 19, rep(NA, 2), 32, rep(NA, 4),  # client 3
    rep(NA, 3), 20, rep(NA, 6)                   # client 4
  )
)
第二步是计算两次购买之间经过的天数,并计算平均使用量。最好也能删除最后一次购买:
# Hand-built illustration of step two: for every client and day, the soaps
# bought at a purchase divided by a number of days, repeated over the days
# that purchase covers. Days before a client's first purchase are NA.
partial_results_II <- data.frame(
  Client_ID = rep(seq_len(4), each = 10),
  date = rep(
    seq(as.Date("2017-01-01"), as.Date("2017-01-10"), by = "day"),
    times = 4
  ),
  avg_soaps_bought = c(
    1 / 1, rep(2 / 3, 3), rep(10 / 5, 5), 11 / 1,  # client 1
    rep(12 / 8, 8), rep(12 / 2, 2),                # client 2
    rep(NA, 2), rep(19 / 3, 3), rep(32 / 5, 5),    # client 3
    rep(NA, 3), rep(20 / 7, 7)                     # client 4
  )
)
第三步我知道怎么做——把表从长格式转换为宽格式:
# Long -> wide: one row per Client_ID, one column per date, with cells
# holding avg_soaps_bought. setDT() is applied to partial_results_II before
# it is passed to dcast().
desired_results <- dcast(
  setDT(partial_results_II),
  Client_ID ~ date,
  value.var = "avg_soaps_bought"
)
第四步——删除异常值会删除消费者 1 的最后一个日期(使用了 11 块肥皂):
我计算了每个人的平均值和标准差,甚至检查了哪些值是异常值,但我现在不知道如何据此删除这些观测值
# Flag, for every client (row) and date (column), whether the daily average
# lies strictly inside that client's mean +/- 2 standard deviations.
desired_results_DF <- data.frame(desired_results)
# Row-wise mean over every column except the first, ignoring NAs.
avg <- rowMeans(desired_results_DF[, -1], na.rm = TRUE)
library(matrixStats)
# rowSds() is given a numeric matrix built from the same columns.
desired_results_MX <- data.matrix(desired_results_DF[, -1])
sd <- rowSds(desired_results_MX, na.rm = TRUE)
# A value is acceptable only when it is inside BOTH bounds, so the two
# comparisons are combined with `&`. With `|` every non-missing value that
# differs from the row mean satisfies at least one side, so nothing was
# ever flagged as an outlier.
# NOTE(review): when a row's sd is 0 the strict comparisons give FALSE for
# that row's values; confirm whether such rows should count as OK.
is_ok <- desired_results_DF[, -1] < avg + 2 * sd &
  desired_results_DF[, -1] > avg - 2 * sd
你可以这样做:
library(dplyr)
library(tidyr)
# Average soaps used per day for every client, reshaped to one row per
# client and one column per purchase date that occurs in the data.
transacrions %>%
  group_by(Client_ID) %>%
  # Per purchase: soaps bought divided by the number of days until the same
  # client's next purchase; each client's last purchase is divided by 1.
  # Relies on the rows being in date order within each client.
  mutate(
    avg_soaps_bought = soaps_bought / as.numeric(c(diff(as.Date(date)), 1))
  ) %>%
  # complete() on a grouped data frame only expands within each group, so
  # with character dates it would add no rows. Ungroup and expand the
  # Client_ID x date combinations explicitly, then regroup so fill() carries
  # values forward within a client only.
  ungroup() %>%
  complete(Client_ID, date) %>%
  group_by(Client_ID) %>%
  fill(avg_soaps_bought) %>% # partial_results_II
  select(-soaps_bought) %>%
  spread(date, avg_soaps_bought) # final result
# # A tibble: 4 x 9
# # Groups: Client_ID [4]
# Client_ID `2017-01-01` `2017-01-02` `2017-01-03` `2017-01-04` `2017-01-05` `2017-01-06` `2017-01-09` `2017-01-10`
# * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1.0 0.6666667 0.6666667 0.6666667 2.000000 2.0 2 11
# 2 2 1.5 1.5000000 1.5000000 1.5000000 1.500000 1.5 12 12
# 3 3 NA NA 6.3333333 6.3333333 6.333333 32.0 32 32
# 4 4 NA NA NA 20.0000000 20.000000 20.0 20 20
我稍微调整了你的操作顺序,不过你第一步需要的函数是 tidyr::complete
我有一份关于人们购买某种产品(假设是肥皂)的交易数据,想了解肥皂的购买强度如何随时间变化。我把强度定义为平均每天使用的肥皂数量,并假设一个人再次购买时,他/她之前购买的肥皂已经用完。另外,最好还能为每个消费者删除异常值(超出该消费者个人平均值 ± 2 × 该消费者个人标准差的值),并删除最后一次购买之后的信息。
当前数据框如下所示:
# Raw purchase log: one row per purchase, giving the client, the purchase
# date as a "YYYY-MM-DD" string, and the number of soaps bought.
transacrions <- data.frame(
  Client_ID = c(1, 2, 1, 3, 4, 1, 3, 2, 1),
  date = c(
    "2017-01-01", "2017-01-01", "2017-01-02",
    "2017-01-03", "2017-01-04", "2017-01-05",
    "2017-01-06", "2017-01-09", "2017-01-10"
  ),
  soaps_bought = c(1, 12, 2, 19, 20, 10, 32, 12, 11)
)
我认为需要几个步骤才能得到我想要的表。第一步是用 NA "填充"每个消费者没有购买的日期:
# Hand-built illustration of step one: every client gets one row per day
# from 2017-01-01 to 2017-01-10, with NA on days without a purchase.
partial_results <- data.frame(
  Client_ID = rep(seq_len(4), each = 10),
  date = rep(
    seq(as.Date("2017-01-01"), as.Date("2017-01-10"), by = "day"),
    times = 4
  ),
  soaps_bought = c(
    1, 2, rep(NA, 2), 10, rep(NA, 4), 11,        # client 1
    12, rep(NA, 7), 12, NA,                      # client 2
    rep(NA, 2), 19, rep(NA, 2), 32, rep(NA, 4),  # client 3
    rep(NA, 3), 20, rep(NA, 6)                   # client 4
  )
)
第二步是计算两次购买之间经过的天数,并计算平均使用量。最好也能删除最后一次购买:
# Hand-built illustration of step two: for every client and day, the soaps
# bought at a purchase divided by a number of days, repeated over the days
# that purchase covers. Days before a client's first purchase are NA.
partial_results_II <- data.frame(
  Client_ID = rep(seq_len(4), each = 10),
  date = rep(
    seq(as.Date("2017-01-01"), as.Date("2017-01-10"), by = "day"),
    times = 4
  ),
  avg_soaps_bought = c(
    1 / 1, rep(2 / 3, 3), rep(10 / 5, 5), 11 / 1,  # client 1
    rep(12 / 8, 8), rep(12 / 2, 2),                # client 2
    rep(NA, 2), rep(19 / 3, 3), rep(32 / 5, 5),    # client 3
    rep(NA, 3), rep(20 / 7, 7)                     # client 4
  )
)
第三步我知道怎么做——把表从长格式转换为宽格式:
# Long -> wide: one row per Client_ID, one column per date, with cells
# holding avg_soaps_bought. setDT() is applied to partial_results_II before
# it is passed to dcast().
desired_results <- dcast(
  setDT(partial_results_II),
  Client_ID ~ date,
  value.var = "avg_soaps_bought"
)
第四步——删除异常值会删除消费者 1 的最后一个日期(使用了 11 块肥皂): 我计算了每个人的平均值和标准差,甚至检查了哪些值是异常值,但我现在不知道如何据此删除这些观测值
# Flag, for every client (row) and date (column), whether the daily average
# lies strictly inside that client's mean +/- 2 standard deviations.
desired_results_DF <- data.frame(desired_results)
# Row-wise mean over every column except the first, ignoring NAs.
avg <- rowMeans(desired_results_DF[, -1], na.rm = TRUE)
library(matrixStats)
# rowSds() is given a numeric matrix built from the same columns.
desired_results_MX <- data.matrix(desired_results_DF[, -1])
sd <- rowSds(desired_results_MX, na.rm = TRUE)
# A value is acceptable only when it is inside BOTH bounds, so the two
# comparisons are combined with `&`. With `|` every non-missing value that
# differs from the row mean satisfies at least one side, so nothing was
# ever flagged as an outlier.
# NOTE(review): when a row's sd is 0 the strict comparisons give FALSE for
# that row's values; confirm whether such rows should count as OK.
is_ok <- desired_results_DF[, -1] < avg + 2 * sd &
  desired_results_DF[, -1] > avg - 2 * sd
你可以这样做:
library(dplyr)
library(tidyr)
# Average soaps used per day for every client, reshaped to one row per
# client and one column per purchase date that occurs in the data.
transacrions %>%
  group_by(Client_ID) %>%
  # Per purchase: soaps bought divided by the number of days until the same
  # client's next purchase; each client's last purchase is divided by 1.
  # Relies on the rows being in date order within each client.
  mutate(
    avg_soaps_bought = soaps_bought / as.numeric(c(diff(as.Date(date)), 1))
  ) %>%
  # complete() on a grouped data frame only expands within each group, so
  # with character dates it would add no rows. Ungroup and expand the
  # Client_ID x date combinations explicitly, then regroup so fill() carries
  # values forward within a client only.
  ungroup() %>%
  complete(Client_ID, date) %>%
  group_by(Client_ID) %>%
  fill(avg_soaps_bought) %>% # partial_results_II
  select(-soaps_bought) %>%
  spread(date, avg_soaps_bought) # final result
# # A tibble: 4 x 9
# # Groups: Client_ID [4]
# Client_ID `2017-01-01` `2017-01-02` `2017-01-03` `2017-01-04` `2017-01-05` `2017-01-06` `2017-01-09` `2017-01-10`
# * <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1.0 0.6666667 0.6666667 0.6666667 2.000000 2.0 2 11
# 2 2 1.5 1.5000000 1.5000000 1.5000000 1.500000 1.5 12 12
# 3 3 NA NA 6.3333333 6.3333333 6.333333 32.0 32 32
# 4 4 NA NA NA 20.0000000 20.000000 20.0 20 20
我稍微调整了你的操作顺序,不过你第一步需要的函数是 tidyr::complete