使用 ggplot 在宽格式数据框的条形图上绘制误差线

Plotting error bar on bar chart for a data frame in wide format using ggplot

您好,我想在条形图上绘制误差线。问题在于我的数据框是宽格式的,所以我不知道该如何绘制。数据框如下:

# dput()-style literal of the input data: a 7-row tibble of class grouped_df,
# grouped by Insured_Age_Group. For both Deposit and Insurance it carries a
# mean, n, sd and se column (se = standard error, per the surrounding text).
# The literal is not assigned to a name here; presumably it is the object the
# later code refers to as `df` -- confirm before running.
structure(list(Insured_Age_Group = c(1, 2, 3, 4, 5, 6, 7), Policy_Status = c("Issuance", 
"Issuance", "Issuance", "Issuance", "Issuance", "Issuance", "Issuance"
), Deposit_mean = c(3859543.73892798, 4013324.11384503, 3970469.37408863, 
4405204.3601121, 4379252.01763646, 3816234.23370925, 3342252.39385489
), Deposit_n = c(31046L, 20039L, 20399L, 48677L, 30045L, 13947L, 
3157L), Deposit_sd = c(2816342.35213949, 3016203.31909278, 3292567.51598225, 
4345771.64693777, 4260381.02418456, 4748349.50958046, 4033440.60986956
), se_Deposit = c(31328.4343156912, 41761.74740604, 45184.1713046368, 
38606.556913894, 48174.6323355127, 78805.8303265365, 140700.113248691
), Insurance_mean = c(1962975.48419977, 2003323.06714903, 2665058.97077804, 
3033051.58298144, 3579542.94373979, 4338039.6868955, 4806849.35326484
), Insurance_n = c(31046L, 20039L, 20399L, 48677L, 30045L, 13947L, 
3157L), Insurance_sd = c(1187550.43329336, 1065410.12671512, 
1840293.78284101, 2248320.36787743, 2642040.82537531, 3128969.83541335, 
3030600.81901732), se_Insurance = c(13210.075727384, 14751.455352518, 
25254.5009726065, 19973.4167588603, 29875.1085068105, 51929.8475078389, 
105717.653906674)), row.names = c(NA, -7L), groups = structure(list(
    Insured_Age_Group = c(1, 2, 3, 4, 5, 6, 7), .rows = structure(list(
        1L, 2L, 3L, 4L, 5L, 6L, 7L), ptype = integer(0), class = c("vctrs_list_of", 
    "vctrs_vctr", "list"))), row.names = c(NA, 7L), class = c("tbl_df", 
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df", 
"tbl_df", "tbl", "data.frame"))

  Insured_Age_Group Policy_Status Deposit_mean Deposit_n Deposit_sd se_Deposit Insurance_mean Insurance_n Insurance_sd se_Insurance
              <dbl> <chr>                <dbl>     <int>      <dbl>      <dbl>          <dbl>       <int>        <dbl>        <dbl>
1                 1 Issuance          3859544.     31046   2816342.     31328.       1962975.       31046     1187550.       13210.
2                 2 Issuance          4013324.     20039   3016203.     41762.       2003323.       20039     1065410.       14751.
3                 3 Issuance          3970469.     20399   3292568.     45184.       2665059.       20399     1840294.       25255.
4                 4 Issuance          4405204.     48677   4345772.     38607.       3033052.       48677     2248320.       19973.
5                 5 Issuance          4379252.     30045   4260381.     48175.       3579543.       30045     2642041.       29875.
6                 6 Issuance          3816234.     13947   4748350.     78806.       4338040.       13947     3128970.       51930.
7                 7 Issuance          3342252.      3157   4033441.    140700.       4806849.        3157     3030601.      105718.

可以看出,对于 Deposit_mean 和 Insurance_mean 的每个值,我都计算了对应的 se_Deposit 和 se_Insurance(标准误差)。我绘制了如下所示的平均值图:

我知道如何使用 geom_errorbar 添加误差线,但是,我不确定如何为每个条形添加相应的 se 值,因为数据是宽格式的。所以基本上,我必须以某种方式将宽格式转换为长格式,使每个计算出的 Deposit_mean 和 Insurance_mean 旁边都有对应的标准误差。

有什么帮助或建议吗?

我想我会这样重塑数据:先转换为长格式,然后再转换回另一种宽格式:

library(dplyr)
library(tidyr)
library(ggplot2)

# Reshape `df` so that every (age group, type) pair becomes one row with
# separate mean / n / sd / se columns.

# Step 1: bring the two "se_<type>" columns into the same "<type>_<metric>"
# naming pattern as the other columns, so one separator rule fits them all.
df_renamed <- rename(df, Insurance_se = se_Insurance, Deposit_se = se_Deposit)

# Step 2: stack every column except the first two, splitting each column name
# at "_" into a `type` part and a `metric` part.
df_long <- pivot_longer(
  df_renamed,
  cols = -c(1:2),
  names_to = c("type", "metric"),
  names_sep = "_"
)

# Step 3: spread the metrics back out, one column per metric.
df2 <- pivot_wider(df_long, names_from = metric, values_from = value)

这会为您提供以下格式的数据:

df2
#> # A tibble: 14 x 7
#> # Groups:   Insured_Age_Group [7]
#>    Insured_Age_Group Policy_Status type          mean     n       sd      se
#>                <dbl> <chr>         <chr>        <dbl> <dbl>    <dbl>   <dbl>
#>  1                 1 Issuance      Deposit   3859544. 31046 2816342.  31328.
#>  2                 1 Issuance      Insurance 1962975. 31046 1187550.  13210.
#>  3                 2 Issuance      Deposit   4013324. 20039 3016203.  41762.
#>  4                 2 Issuance      Insurance 2003323. 20039 1065410.  14751.
#>  5                 3 Issuance      Deposit   3970469. 20399 3292568.  45184.
#>  6                 3 Issuance      Insurance 2665059. 20399 1840294.  25255.
#>  7                 4 Issuance      Deposit   4405204. 48677 4345772.  38607.
#>  8                 4 Issuance      Insurance 3033052. 48677 2248320.  19973.
#>  9                 5 Issuance      Deposit   4379252. 30045 4260381.  48175.
#> 10                 5 Issuance      Insurance 3579543. 30045 2642041.  29875.
#> 11                 6 Issuance      Deposit   3816234. 13947 4748350.  78806.
#> 12                 6 Issuance      Insurance 4338040. 13947 3128970.  51930.
#> 13                 7 Issuance      Deposit   3342252.  3157 4033441. 140700.
#> 14                 7 Issuance      Insurance 4806849.  3157 3030601. 105718.

然后您可以添加误差线,以及您想要的任何样式调整:

# Dodged bar chart of `mean` per age group and type, with error bars drawn at
# mean +/- 1.96 * se and a dollar-formatted label inside each bar.
# A single dodge specification is shared by all three layers so that bars,
# error bars and labels stay aligned within each age group.
dodge <- position_dodge(width = 0.8)

ggplot(
  data = df2,
  mapping = aes(x = factor(Insured_Age_Group), y = mean, fill = type)
) +
  geom_col(width = 0.6, position = dodge) +
  # NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
  # line thickness -- confirm against the installed version before changing.
  geom_errorbar(
    mapping = aes(ymin = mean - 1.96 * se, ymax = mean + 1.96 * se),
    position = dodge,
    width = 0.4,
    size = 1
  ) +
  # Label is rotated 90 degrees and placed at half the bar height.
  geom_text(
    mapping = aes(y = mean / 2, label = scales::dollar(mean)),
    position = dodge,
    angle = 90,
    hjust = 0.5
  ) +
  scale_fill_brewer(palette = "Greens") +
  scale_y_continuous(labels = scales::dollar) +
  labs(
    title = paste("Mean value for Deposit and Insurance Annual Premium",
                  "for Issuance Group", sep = "\n"),
    x = "Insured Age Group",
    y = "Premium value"
  ) +
  theme_bw() +
  # Applied after theme_bw() so these settings override its defaults.
  theme(
    plot.title = element_text(size = 18, hjust = 0.5),
    axis.line = element_line(),
    panel.border = element_blank()
  )