使用 tidyverse 从数据子集中创建以因子水平为条件的比例变量
Create variable of proportions conditional on level of factor from subsets of data using tidyverse
我有一个这样的数据框:
# Example data from the question: one row per observation, where `result`
# records a "Y" or "N" outcome for a given year and season.
df <- data.frame(
  year = c("1997", "1997", "1997", "1997", "1997", "1997", "1998", "1998"),
  season = c("W", "W", "W", "D", "D", "D", "W", "W"),
  result = c("Y", "Y", "N", "N", "Y", "N", "N", "N")
)
我想按 year
和 season
对数据进行子集化,并计算该特定子集的 "Y" 在 result
中的比例。这个新的比例列称为 psit_freq
。下面是一个输出示例(我把比例写成了分数形式,以便读者理解我需要的计算)。
# Desired result: one row per year/season combination. `psit_freq` is written
# as a fraction string here purely to illustrate the calculation.
output <- data.frame(
  year = c("1997", "1997", "1998"),
  season = c("W", "D", "W"),
  psit_freq = c("2/3", "1/3", "0/2")
)
我试过以下变体:
# Asker's attempt (kept as posted; it does not run as written): it groups the
# rows by year and season and then calls freq() with no arguments. freq() is
# not defined anywhere in this snippet, so the summarise() step has nothing to
# compute the proportion from.
# Note that the result is assigned back to `df`, overwriting the input data.
df<-
df %>%
group_by(year, season)%>%
summarise(psit_freq= freq())
但我不确定如何结合条件 if else 语句来计算每个子集中 Y
响应占总 result
行的比例。
您需要做的就是将 result
转换为整数(或逻辑值),然后像您之前那样按 year
和 season
分组,并在 summarise 中取 result 的平均值。
library(dplyr)

# Same eight observations as in the question, built as a tibble.
df <- tibble(
  year = c("1997", "1997", "1997", "1997", "1997", "1997", "1998", "1998"),
  season = c("W", "W", "W", "D", "D", "D", "W", "W"),
  result = c("Y", "Y", "N", "N", "Y", "N", "N", "N")
)

# Turn `result` into a TRUE/FALSE indicator for "Y". The mean of a logical
# vector is the share of TRUE values, so the per-group mean is the proportion
# of "Y" rows in each year/season subset.
# summarise() peels off only the last grouping variable, so the printed result
# is still grouped by `year`.
df %>%
  mutate(result = result == "Y") %>%
  group_by(year, season) %>%
  summarise(psit_freq = mean(result))
#> # A tibble: 3 x 3
#> # Groups: year [?]
#> year season psit_freq
#> <chr> <chr> <dbl>
#> 1 1997 D 0.3333333
#> 2 1997 W 0.6666667
#> 3 1998 W 0.0000000
# Alternative: count the "Y" rows in each year/season group and divide by the
# number of rows in that group.
data.frame(
  year = c("1997", "1997", "1997", "1997", "1997", "1997", "1998", "1998"),
  season = c("W", "W", "W", "D", "D", "D", "W", "W"),
  result = c("Y", "Y", "N", "N", "Y", "N", "N", "N")
) %>%
  group_by(year, season) %>%
  summarise(psit_freq = sum(result == "Y") / n())
我有一个这样的数据框:
# Example data from the question: one row per observation, where `result`
# records a "Y" or "N" outcome for a given year and season.
df <- data.frame(
  year = c("1997", "1997", "1997", "1997", "1997", "1997", "1998", "1998"),
  season = c("W", "W", "W", "D", "D", "D", "W", "W"),
  result = c("Y", "Y", "N", "N", "Y", "N", "N", "N")
)
我想按 year
和 season
对数据进行子集化,并计算该特定子集的 "Y" 在 result
中的比例。这个新的比例列称为 psit_freq
。下面是一个输出示例(我把比例写成了分数形式,以便读者理解我需要的计算)。
# Desired result: one row per year/season combination. `psit_freq` is written
# as a fraction string here purely to illustrate the calculation.
output <- data.frame(
  year = c("1997", "1997", "1998"),
  season = c("W", "D", "W"),
  psit_freq = c("2/3", "1/3", "0/2")
)
我试过以下变体:
# Asker's attempt (kept as posted; it does not run as written): it groups the
# rows by year and season and then calls freq() with no arguments. freq() is
# not defined anywhere in this snippet, so the summarise() step has nothing to
# compute the proportion from.
# Note that the result is assigned back to `df`, overwriting the input data.
df<-
df %>%
group_by(year, season)%>%
summarise(psit_freq= freq())
但我不确定如何结合条件 if else 语句来计算每个子集中 Y
响应占总 result
行的比例。
您需要做的就是将 result
转换为整数(或逻辑值),然后像您之前那样按 year
和 season
分组,并在 summarise 中取 result 的平均值。
library(dplyr)

# Same eight observations as in the question, built as a tibble.
df <- tibble(
  year = c("1997", "1997", "1997", "1997", "1997", "1997", "1998", "1998"),
  season = c("W", "W", "W", "D", "D", "D", "W", "W"),
  result = c("Y", "Y", "N", "N", "Y", "N", "N", "N")
)

# Turn `result` into a TRUE/FALSE indicator for "Y". The mean of a logical
# vector is the share of TRUE values, so the per-group mean is the proportion
# of "Y" rows in each year/season subset.
# summarise() peels off only the last grouping variable, so the printed result
# is still grouped by `year`.
df %>%
  mutate(result = result == "Y") %>%
  group_by(year, season) %>%
  summarise(psit_freq = mean(result))
#> # A tibble: 3 x 3
#> # Groups: year [?]
#> year season psit_freq
#> <chr> <chr> <dbl>
#> 1 1997 D 0.3333333
#> 2 1997 W 0.6666667
#> 3 1998 W 0.0000000
# Alternative: count the "Y" rows in each year/season group and divide by the
# number of rows in that group.
data.frame(
  year = c("1997", "1997", "1997", "1997", "1997", "1997", "1998", "1998"),
  season = c("W", "W", "W", "D", "D", "D", "W", "W"),
  result = c("Y", "Y", "N", "N", "Y", "N", "N", "N")
) %>%
  group_by(year, season) %>%
  summarise(psit_freq = sum(result == "Y") / n())