case_when 失败(意外结果)但 case_when 的条件在外部工作
case_when fails (unexpected results) but conditions for case_when work outside
我正在尝试应用一些方程式来获得一条线(数字数组)与另一条线(另一个数字数组)重合的比例。我有一个包含所需值的数据框,我尝试根据两条线的重合方式创建一个包含百分比结果的新列。我已经用一些例子检查了代码(下面)并且它工作正常,但是当我将 case_when()
应用于数据框时,输出不是它应该的。我会给你一个基本的例子。
这是我的输出。它有一个 'ID' 列 [char]、一个 'date'(天)列 [dttm]、一个 'result'(值)列 [double]、一个 'difs' 列 [int](与前一行相隔的天数),以及一个 'Grp' 列(子分组值)。
这是我正在使用的代码。这个想法是获取先前的值并计算矢量位于另一个矢量内的百分比,该矢量的范围为 [2,3]。现在我只是检查条件是否适合每一行。但是,当它应该得到“0”时,得到 'A',或者有时当它应该得到 'A' 时得到 'Inf',等等。我不明白为什么。我认为 mutate 独立迭代组内的每一行,所以与手动检查相比,我不明白为什么结果是错误的。
# Question code (kept exactly as posted): label, for every row, how the interval
# between the previous value and the current value sits relative to the target
# band [Rinf, Rsup].
Rsup = 3 # High limit of target array
Rinf = 2 # Low limit of target array
example_output = example%>%
arrange(id,Grp,day) %>%
group_by(id,Grp) %>% # Group by episodes (id + Grp)
mutate(from_r = lag(result)) %>% # get previous result y(t-1) within each episode
filter(difs != 0, difs < 181) %>% # discard first sample of every subgroup/episode (difs == 0 or difs >= 181)
mutate(
# NOTE(review): min()/max() reduce all of their arguments to ONE scalar per
# group, so every row of a group is tested against the same two numbers.
# pmin()/pmax() are the element-wise (row-by-row) counterparts.
p_days = case_when(
(min(result,from_r) < Rinf) & (max(result,from_r) > Rsup) ~ 'A',
(min(result,from_r) > Rinf) & (max(result,from_r) < Rsup) ~ '100',
(min(result,from_r) < Rinf) & (max(result,from_r) > Rinf) ~ 'Inf',
(min(result,from_r) < Rsup) & (max(result,from_r) > Rsup) ~ 'Sup',
TRUE ~ '0')
)
# Intended meaning of each label (conditions are tried in order, first match wins):
# Case 'A': the interval y(t-1)..y(t) cuts the target array at both limits
# Case '100': the whole interval y(t-1)..y(t) is inside the target array (100%)
# Case 'Inf': the interval cuts the low limit of the target array
# Case 'Sup': the interval cuts the high limit of the target array
# Case TRUE ~ '0': the interval does not cut the target array and is not inside it (0%)
这是创建基本示例的方法:
# dput() dump of the example data: a 40-row tibble with columns id, day (Date),
# result, difs and Grp, stored grouped by id.
# The expression is not assigned to anything here; assign its value to `example`
# before running the snippets in this post.
structure(list(id = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B"), day = structure(c(19104, 19105,
19106, 19107, 19108, 19109, 19110, 19111, 19112, 19113, 19304,
19305, 19306, 19307, 19604, 19605, 19606, 19607, 19608, 19609,
19204, 19205, 19206, 19207, 19208, 19209, 19210, 19211, 19212,
19213, 19214, 19215, 19216, 19217, 19218, 19219, 19220, 19221,
19222, 19223), class = "Date"), result = c(1.55, 1.92, 3.6, 3.45,
3.3, 3.46, 2.79, 2.55, 2.08, 2.27, 2.44, 4.59, 1.8, 0.75, 3.13,
2.59, 2.16, 2.93, 1.38, 2.92, 3.19, 3.23, 3.48, 3.39, 2.62, 2.66,
3.77, 3.44, 3.06, 2.59, 2.87, 1.97, 2.5, 2.84, 1.48, 3.04, 2.62,
0.76, 2.74, 2.84), difs = c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 191,
1, 1, 1, 297, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1), Grp = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -40L), groups = structure(list(
id = c("A", "B"), .rows = structure(list(1:20, 21:40), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
当然,如果有人知道一个函数可以得到与我正在尝试使用 mutate + case_when 相同的输出,那也会非常有帮助。提前致谢。
编辑:我认为 mutate 独立迭代组内的每一行,所以我不明白为什么结果是错误的。也许它以某种方式混合了每个组的结果(和 from_r)值?
问题在于您使用了 min/max,而不是向量化(逐元素)的 pmin/pmax:
library(dplyr)
# Label how the interval between the previous and the current value sits
# relative to the target band [low, high].
#
# pmin()/pmax() work element-wise, so every row is judged on its own pair of
# values; min()/max() would collapse all values to a single number.
#
# Returns a character vector with one label per element; the conditions are
# tried in order and the first match wins:
#   "A"   - the interval crosses both limits
#   "100" - the interval lies strictly inside the band
#   "Inf" - the interval crosses the low limit
#   "Sup" - the interval crosses the high limit
#   "0"   - none of the above
classify_interval <- function(current, previous, low, high) {
  lower_end <- pmin(current, previous)
  upper_end <- pmax(current, previous)
  case_when(
    lower_end < low & upper_end > high ~ "A",
    lower_end > low & upper_end < high ~ "100",
    lower_end < low & upper_end > low ~ "Inf",
    lower_end < high & upper_end > high ~ "Sup",
    TRUE ~ "0"
  )
}

# Attach the previous value y(t-1) within each episode (id + Grp), then drop
# the first sample of every episode (difs == 0 or difs >= 181).
ex1 <- example %>%
  arrange(id, Grp, day) %>%
  group_by(id, Grp) %>%
  mutate(from_r = lag(result)) %>%
  filter(difs != 0, difs < 181)

# Classify every remaining row against the band [Rinf, Rsup].
ex1 %>%
  mutate(p_days = classify_interval(result, from_r, Rinf, Rsup))
#> # A tibble: 36 × 7
#> # Groups: id, Grp [4]
#> id day result difs Grp from_r p_days
#> <chr> <date> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 A 2022-04-23 1.92 1 1 1.55 0
#> 2 A 2022-04-24 3.6 1 1 1.92 A
#> 3 A 2022-04-25 3.45 1 1 3.6 0
#> 4 A 2022-04-26 3.3 1 1 3.45 0
#> 5 A 2022-04-27 3.46 1 1 3.3 0
#> 6 A 2022-04-28 2.79 1 1 3.46 Sup
#> 7 A 2022-04-29 2.55 1 1 2.79 100
#> 8 A 2022-04-30 2.08 1 1 2.55 100
#> 9 A 2022-05-01 2.27 1 1 2.08 100
#> 10 A 2022-11-09 4.59 1 2 2.44 Sup
#> # … with 26 more rows
要更清楚地看到差异,请检查:
# min() reduces both columns to one overall minimum, so the comparison yields
# a single logical value.
min(ex1$result,ex1$from_r) < Rinf
#> [1] TRUE
# pmin() takes the minimum pair by pair, so the comparison yields one logical
# value per row (36 values here).
pmin(ex1$result,ex1$from_r) < Rinf
#> [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
#> [13] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [25] FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE
如您所见,使用 min 时,条件 min(ex1$result,ex1$from_r) < Rinf 只返回一个 TRUE(先对所有值取最小值,再做比较;在分组的 mutate 中则是每组一个值),这个值随后被循环应用到数据的每一行。如果你想逐行检查条件,就必须使用 pmin/pmax。
我正在尝试应用一些方程式来获得一条线(数字数组)与另一条线(另一个数字数组)重合的比例。我有一个包含所需值的数据框,我尝试根据两条线的重合方式创建一个包含百分比结果的新列。我已经用一些例子检查了代码(下面)并且它工作正常,但是当我将 case_when()
应用于数据框时,输出不是它应该的。我会给你一个基本的例子。
这是我的输出。它有一个 'ID' 列 [char]、一个 'date'(天)列 [dttm]、一个 'result'(值)列 [double]、一个 'difs' 列 [int](与前一行相隔的天数),以及一个 'Grp' 列(子分组值)。
这是我正在使用的代码。这个想法是获取先前的值并计算矢量位于另一个矢量内的百分比,该矢量的范围为 [2,3]。现在我只是检查条件是否适合每一行。但是,当它应该得到“0”时,得到 'A',或者有时当它应该得到 'A' 时得到 'Inf',等等。我不明白为什么。我认为 mutate 独立迭代组内的每一行,所以与手动检查相比,我不明白为什么结果是错误的。
# Question code (kept exactly as posted): label, for every row, how the interval
# between the previous value and the current value sits relative to the target
# band [Rinf, Rsup].
Rsup = 3 # High limit of target array
Rinf = 2 # Low limit of target array
example_output = example%>%
arrange(id,Grp,day) %>%
group_by(id,Grp) %>% # Group by episodes (id + Grp)
mutate(from_r = lag(result)) %>% # get previous result y(t-1) within each episode
filter(difs != 0, difs < 181) %>% # discard first sample of every subgroup/episode (difs == 0 or difs >= 181)
mutate(
# NOTE(review): min()/max() reduce all of their arguments to ONE scalar per
# group, so every row of a group is tested against the same two numbers.
# pmin()/pmax() are the element-wise (row-by-row) counterparts.
p_days = case_when(
(min(result,from_r) < Rinf) & (max(result,from_r) > Rsup) ~ 'A',
(min(result,from_r) > Rinf) & (max(result,from_r) < Rsup) ~ '100',
(min(result,from_r) < Rinf) & (max(result,from_r) > Rinf) ~ 'Inf',
(min(result,from_r) < Rsup) & (max(result,from_r) > Rsup) ~ 'Sup',
TRUE ~ '0')
)
# Intended meaning of each label (conditions are tried in order, first match wins):
# Case 'A': the interval y(t-1)..y(t) cuts the target array at both limits
# Case '100': the whole interval y(t-1)..y(t) is inside the target array (100%)
# Case 'Inf': the interval cuts the low limit of the target array
# Case 'Sup': the interval cuts the high limit of the target array
# Case TRUE ~ '0': the interval does not cut the target array and is not inside it (0%)
这是创建基本示例的方法:
# dput() dump of the example data: a 40-row tibble with columns id, day (Date),
# result, difs and Grp, stored grouped by id.
# The expression is not assigned to anything here; assign its value to `example`
# before running the snippets in this post.
structure(list(id = c("A", "A", "A", "A", "A", "A", "A", "A",
"A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "B",
"B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B", "B",
"B", "B", "B", "B", "B", "B"), day = structure(c(19104, 19105,
19106, 19107, 19108, 19109, 19110, 19111, 19112, 19113, 19304,
19305, 19306, 19307, 19604, 19605, 19606, 19607, 19608, 19609,
19204, 19205, 19206, 19207, 19208, 19209, 19210, 19211, 19212,
19213, 19214, 19215, 19216, 19217, 19218, 19219, 19220, 19221,
19222, 19223), class = "Date"), result = c(1.55, 1.92, 3.6, 3.45,
3.3, 3.46, 2.79, 2.55, 2.08, 2.27, 2.44, 4.59, 1.8, 0.75, 3.13,
2.59, 2.16, 2.93, 1.38, 2.92, 3.19, 3.23, 3.48, 3.39, 2.62, 2.66,
3.77, 3.44, 3.06, 2.59, 2.87, 1.97, 2.5, 2.84, 1.48, 3.04, 2.62,
0.76, 2.74, 2.84), difs = c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 191,
1, 1, 1, 297, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1), Grp = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1)), row.names = c(NA, -40L), groups = structure(list(
id = c("A", "B"), .rows = structure(list(1:20, 21:40), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -2L), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
当然,如果有人知道一个函数可以得到与我正在尝试使用 mutate + case_when 相同的输出,那也会非常有帮助。提前致谢。
编辑:我认为 mutate 独立迭代组内的每一行,所以我不明白为什么结果是错误的。也许它以某种方式混合了每个组的结果(和 from_r)值?
问题在于您使用了 min/max,而不是向量化(逐元素)的 pmin/pmax:
library(dplyr)
# Label how the interval between the previous and the current value sits
# relative to the target band [low, high].
#
# pmin()/pmax() work element-wise, so every row is judged on its own pair of
# values; min()/max() would collapse all values to a single number.
#
# Returns a character vector with one label per element; the conditions are
# tried in order and the first match wins:
#   "A"   - the interval crosses both limits
#   "100" - the interval lies strictly inside the band
#   "Inf" - the interval crosses the low limit
#   "Sup" - the interval crosses the high limit
#   "0"   - none of the above
classify_interval <- function(current, previous, low, high) {
  lower_end <- pmin(current, previous)
  upper_end <- pmax(current, previous)
  case_when(
    lower_end < low & upper_end > high ~ "A",
    lower_end > low & upper_end < high ~ "100",
    lower_end < low & upper_end > low ~ "Inf",
    lower_end < high & upper_end > high ~ "Sup",
    TRUE ~ "0"
  )
}

# Attach the previous value y(t-1) within each episode (id + Grp), then drop
# the first sample of every episode (difs == 0 or difs >= 181).
ex1 <- example %>%
  arrange(id, Grp, day) %>%
  group_by(id, Grp) %>%
  mutate(from_r = lag(result)) %>%
  filter(difs != 0, difs < 181)

# Classify every remaining row against the band [Rinf, Rsup].
ex1 %>%
  mutate(p_days = classify_interval(result, from_r, Rinf, Rsup))
#> # A tibble: 36 × 7
#> # Groups: id, Grp [4]
#> id day result difs Grp from_r p_days
#> <chr> <date> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 A 2022-04-23 1.92 1 1 1.55 0
#> 2 A 2022-04-24 3.6 1 1 1.92 A
#> 3 A 2022-04-25 3.45 1 1 3.6 0
#> 4 A 2022-04-26 3.3 1 1 3.45 0
#> 5 A 2022-04-27 3.46 1 1 3.3 0
#> 6 A 2022-04-28 2.79 1 1 3.46 Sup
#> 7 A 2022-04-29 2.55 1 1 2.79 100
#> 8 A 2022-04-30 2.08 1 1 2.55 100
#> 9 A 2022-05-01 2.27 1 1 2.08 100
#> 10 A 2022-11-09 4.59 1 2 2.44 Sup
#> # … with 26 more rows
要更清楚地看到差异,请检查:
# min() reduces both columns to one overall minimum, so the comparison yields
# a single logical value.
min(ex1$result,ex1$from_r) < Rinf
#> [1] TRUE
# pmin() takes the minimum pair by pair, so the comparison yields one logical
# value per row (36 values here).
pmin(ex1$result,ex1$from_r) < Rinf
#> [1] TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE
#> [13] FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#> [25] FALSE FALSE FALSE TRUE TRUE FALSE TRUE TRUE FALSE TRUE TRUE FALSE
如您所见,使用 min 时,条件 min(ex1$result,ex1$from_r) < Rinf 只返回一个 TRUE(先对所有值取最小值,再做比较;在分组的 mutate 中则是每组一个值),这个值随后被循环应用到数据的每一行。如果你想逐行检查条件,就必须使用 pmin/pmax。