按日期和中断标记 (break) 的组合去除累计和
Remove the cumulative sum by combination of dates and breaks
我有一个数据集 (ds),其中包含变量 id、date、values 和 break.。values 是一个累积和,一直累积到 break. == "True" 为止,之后重新开始累积。在我的例子中:
# Packages
library(dplyr)
library(lubridate)

# Load the accumulated-values table and parse `date` into Date class
ds <- read.csv("https://raw.githubusercontent.com/Leprechault/trash/main/accumulated_values_table") %>%
  mutate(date = ymd(date))
str(ds)
# 'data.frame': 4458 obs. of 4 variables:
# $ id : int 0 1 2 2 1 0 3 2 4 1 ...
# $ date : Date, format: "2020-11-26" "2020-11-26" "2020-11-26" "2020-11-27" ...
# $ values: int 36 33 27 22 34 28 5 18 28 24 ...
# $ break.: chr "False" "False" "False" "False" ...
我想找到一种方法来去除累计和,期望的输出是每个 id 的每日数值。我需要一个函数,对每个 id 计算相邻两天的差值(day+1 减去 day),直到 break. 列等于 "True" 为止。例如 id = 3:
# Actual dataset (rows for id 3 only)
ds.3 <- filter(ds, id == 3)
head(ds.3)
# id date values break.
# 1 3 2020-11-27 5 True
# 2 3 2020-12-03 16 False
# 3 3 2020-12-05 18 False
# 4 3 2020-12-08 7 True
# 5 3 2020-12-09 27 False
# 6 3 2020-12-19 18 False
# Desirable dataset
# id date values break.
# 1 3 2020-11-27 5 True
# 2 3 2020-12-03 16 False
# 3 3 2020-12-05 2 False
# 4 3 2020-12-08 5 True
# 5 3 2020-12-09 27 False
# 6 3 2020-12-19 0 False * PS: if the difference is negative, the value is equal to 0
有什么想法吗?
我推断我们想按两个 "True" 之间的行分组(同时也按 id 分组),并且仅在 break. 不是 "True" 的情况下重新计算 values。
dplyr
library(dplyr)
# De-accumulate `values` per id:
#   1. turn the "True"/"False" strings in `break.` into a logical flag;
#   2. within each id, number the runs of rows with a running count of breaks;
#   3. within each (id, grp, break.) group, keep break rows unchanged and
#      replace the other rows by their successive differences, floored at 0.
out <- ds %>%
  mutate(break. = break. == "True") %>%
  group_by(id) %>%
  mutate(grp = cumsum(break.)) %>%
  group_by(id, grp, break.) %>%
  mutate(
    values2 = if (first(break.)) {
      values
    } else {
      pmax(0, c(first(values), diff(values)))
    }
  ) %>%
  ungroup()
out
# # A tibble: 4,458 x 6
# id date values break. grp values2
# <int> <chr> <int> <lgl> <int> <dbl>
# 1 0 2020-11-26 36 FALSE 0 36
# 2 1 2020-11-26 33 FALSE 0 33
# 3 2 2020-11-26 27 FALSE 0 27
# 4 2 2020-11-27 22 FALSE 0 0
# 5 1 2020-11-27 34 TRUE 1 34
# 6 0 2020-11-27 28 FALSE 0 0
# 7 3 2020-11-27 5 TRUE 1 5
# 8 2 2020-11-28 18 FALSE 0 0
# 9 4 2020-11-28 28 FALSE 0 28
# 10 1 2020-11-28 24 FALSE 1 24
# # ... with 4,448 more rows
我们可以只看 id == 3
:
# Show only the rows for id 3
out %>% filter(id == 3)
# # A tibble: 17 x 6
# id date values break. grp values2
# <int> <chr> <int> <lgl> <int> <dbl>
# 1 3 2020-11-27 5 TRUE 1 5
# 2 3 2020-12-03 16 FALSE 1 16
# 3 3 2020-12-05 18 FALSE 1 2
# 4 3 2020-12-08 7 TRUE 2 7
# 5 3 2020-12-09 27 FALSE 2 27
# 6 3 2020-12-19 18 FALSE 2 0
# 7 3 2020-12-21 2 FALSE 2 0
# 8 3 2020-12-23 18 FALSE 2 16
# 9 3 2020-12-27 15 FALSE 2 0
# 10 3 2020-12-29 11 FALSE 2 0
# 11 3 2021-01-10 11 FALSE 2 0
# 12 3 2021-01-12 14 FALSE 2 3
# 13 3 2021-01-27 10 FALSE 2 0
# 14 3 2021-01-29 15 FALSE 2 5
# 15 3 2021-01-31 15 FALSE 2 0
# 16 3 2021-02-08 16 FALSE 2 1
# 17 3 2021-02-09 8 FALSE 2 0
基础 R
# Base R equivalent of the dplyr pipeline.
# Convert the "True"/"False" strings into a logical flag.
ds$break. <- ds$break. == "True"
# Running count of breaks within each id: a break row and the rows that
# follow it (up to the next break) share the same grp.
ds$grp <- ave(ds$break., ds$id, FUN = cumsum)
# Break rows keep their value; the other rows get the successive difference
# within their (id, grp, break.) group, floored at 0.
# `break.` is already logical at this point, so it is tested directly:
# comparing it with "True" would coerce TRUE to "TRUE" and never match.
ds$values2 <- ifelse(
  ds$break., ds$values,
  ave(ds$values, ds[c("id", "grp", "break.")],
      FUN = function(z) pmax(0, c(z[1], diff(z))))
)
subset(ds, id == 3)
# id date values break. grp values2
# 7 3 2020-11-27 5 TRUE 1 5
# 35 3 2020-12-03 16 FALSE 1 16
# 48 3 2020-12-05 18 FALSE 1 2
# 64 3 2020-12-08 7 TRUE 2 7
# 74 3 2020-12-09 27 FALSE 2 27
# 182 3 2020-12-19 18 FALSE 2 0
# 201 3 2020-12-21 2 FALSE 2 0
# 220 3 2020-12-23 18 FALSE 2 16
# 249 3 2020-12-27 15 FALSE 2 0
# 277 3 2020-12-29 11 FALSE 2 0
# 380 3 2021-01-10 11 FALSE 2 0
# 390 3 2021-01-12 14 FALSE 2 3
# 509 3 2021-01-27 10 FALSE 2 0
# 524 3 2021-01-29 15 FALSE 2 5
# 539 3 2021-01-31 15 FALSE 2 0
# 608 3 2021-02-08 16 FALSE 2 1
# 619 3 2021-02-09 8 FALSE 2 0
我有一个数据集 (ds),其中包含变量 id、date、values 和 break.。values 是一个累积和,一直累积到 break. == "True" 为止,之后重新开始累积。在我的例子中:
# Packages
library(dplyr)
library(lubridate)

# Load the accumulated-values table and parse `date` into Date class
ds <- read.csv("https://raw.githubusercontent.com/Leprechault/trash/main/accumulated_values_table") %>%
  mutate(date = ymd(date))
str(ds)
# 'data.frame': 4458 obs. of 4 variables:
# $ id : int 0 1 2 2 1 0 3 2 4 1 ...
# $ date : Date, format: "2020-11-26" "2020-11-26" "2020-11-26" "2020-11-27" ...
# $ values: int 36 33 27 22 34 28 5 18 28 24 ...
# $ break.: chr "False" "False" "False" "False" ...
我想找到一种方法来去除累计和,期望的输出是每个 id 的每日数值。我需要一个函数,对每个 id 计算相邻两天的差值(day+1 减去 day),直到 break. 列等于 "True" 为止。例如 id = 3:
# Actual dataset (rows for id 3 only)
ds.3 <- filter(ds, id == 3)
head(ds.3)
# id date values break.
# 1 3 2020-11-27 5 True
# 2 3 2020-12-03 16 False
# 3 3 2020-12-05 18 False
# 4 3 2020-12-08 7 True
# 5 3 2020-12-09 27 False
# 6 3 2020-12-19 18 False
# Desirable dataset
# id date values break.
# 1 3 2020-11-27 5 True
# 2 3 2020-12-03 16 False
# 3 3 2020-12-05 2 False
# 4 3 2020-12-08 5 True
# 5 3 2020-12-09 27 False
# 6 3 2020-12-19 0 False * PS: if the difference is negative, the value is equal to 0
有什么想法吗?
我推断我们想按两个 "True" 之间的行分组(同时也按 id 分组),并且仅在 break. 不是 "True" 的情况下重新计算 values。
dplyr
library(dplyr)
# De-accumulate `values` per id:
#   1. turn the "True"/"False" strings in `break.` into a logical flag;
#   2. within each id, number the runs of rows with a running count of breaks;
#   3. within each (id, grp, break.) group, keep break rows unchanged and
#      replace the other rows by their successive differences, floored at 0.
out <- ds %>%
  mutate(break. = break. == "True") %>%
  group_by(id) %>%
  mutate(grp = cumsum(break.)) %>%
  group_by(id, grp, break.) %>%
  mutate(
    values2 = if (first(break.)) {
      values
    } else {
      pmax(0, c(first(values), diff(values)))
    }
  ) %>%
  ungroup()
out
# # A tibble: 4,458 x 6
# id date values break. grp values2
# <int> <chr> <int> <lgl> <int> <dbl>
# 1 0 2020-11-26 36 FALSE 0 36
# 2 1 2020-11-26 33 FALSE 0 33
# 3 2 2020-11-26 27 FALSE 0 27
# 4 2 2020-11-27 22 FALSE 0 0
# 5 1 2020-11-27 34 TRUE 1 34
# 6 0 2020-11-27 28 FALSE 0 0
# 7 3 2020-11-27 5 TRUE 1 5
# 8 2 2020-11-28 18 FALSE 0 0
# 9 4 2020-11-28 28 FALSE 0 28
# 10 1 2020-11-28 24 FALSE 1 24
# # ... with 4,448 more rows
我们可以只看 id == 3
:
# Show only the rows for id 3
out %>% filter(id == 3)
# # A tibble: 17 x 6
# id date values break. grp values2
# <int> <chr> <int> <lgl> <int> <dbl>
# 1 3 2020-11-27 5 TRUE 1 5
# 2 3 2020-12-03 16 FALSE 1 16
# 3 3 2020-12-05 18 FALSE 1 2
# 4 3 2020-12-08 7 TRUE 2 7
# 5 3 2020-12-09 27 FALSE 2 27
# 6 3 2020-12-19 18 FALSE 2 0
# 7 3 2020-12-21 2 FALSE 2 0
# 8 3 2020-12-23 18 FALSE 2 16
# 9 3 2020-12-27 15 FALSE 2 0
# 10 3 2020-12-29 11 FALSE 2 0
# 11 3 2021-01-10 11 FALSE 2 0
# 12 3 2021-01-12 14 FALSE 2 3
# 13 3 2021-01-27 10 FALSE 2 0
# 14 3 2021-01-29 15 FALSE 2 5
# 15 3 2021-01-31 15 FALSE 2 0
# 16 3 2021-02-08 16 FALSE 2 1
# 17 3 2021-02-09 8 FALSE 2 0
基础 R
# Base R equivalent of the dplyr pipeline.
# Convert the "True"/"False" strings into a logical flag.
ds$break. <- ds$break. == "True"
# Running count of breaks within each id: a break row and the rows that
# follow it (up to the next break) share the same grp.
ds$grp <- ave(ds$break., ds$id, FUN = cumsum)
# Break rows keep their value; the other rows get the successive difference
# within their (id, grp, break.) group, floored at 0.
# `break.` is already logical at this point, so it is tested directly:
# comparing it with "True" would coerce TRUE to "TRUE" and never match.
ds$values2 <- ifelse(
  ds$break., ds$values,
  ave(ds$values, ds[c("id", "grp", "break.")],
      FUN = function(z) pmax(0, c(z[1], diff(z))))
)
subset(ds, id == 3)
# id date values break. grp values2
# 7 3 2020-11-27 5 TRUE 1 5
# 35 3 2020-12-03 16 FALSE 1 16
# 48 3 2020-12-05 18 FALSE 1 2
# 64 3 2020-12-08 7 TRUE 2 7
# 74 3 2020-12-09 27 FALSE 2 27
# 182 3 2020-12-19 18 FALSE 2 0
# 201 3 2020-12-21 2 FALSE 2 0
# 220 3 2020-12-23 18 FALSE 2 16
# 249 3 2020-12-27 15 FALSE 2 0
# 277 3 2020-12-29 11 FALSE 2 0
# 380 3 2021-01-10 11 FALSE 2 0
# 390 3 2021-01-12 14 FALSE 2 3
# 509 3 2021-01-27 10 FALSE 2 0
# 524 3 2021-01-29 15 FALSE 2 5
# 539 3 2021-01-31 15 FALSE 2 0
# 608 3 2021-02-08 16 FALSE 2 1
# 619 3 2021-02-09 8 FALSE 2 0