如何将摘要统计信息作为每个统计信息的新变量加入到原始数据框中

Question

我想编写一个函数：它先生成汇总统计量，再把这些统计量合并回原始数据框，新列以原变量名作为前缀命名（例如 cyl_event_pct、cyl_distn_events），每一行取该行对应变量取值的统计量。该函数需要两个输入参数：数据框和自变量。在下面的示例中，自变量为 mtcars$am。

# Keep the three categorical predictors plus the 0/1 column `am`.
df <- select(mtcars, cyl, gear, vs, am)

# Stack every predictor into (variable, value) pairs, leaving `am` alongside,
# then summarise `am` for each level of each predictor.
df %>%
  gather(key = variable, value = value, -am) %>%
  group_by(variable, value) %>%
  summarise(
    n = n(),
    n_events = sum(am),
    event_pct = round(mean(am), digits = 4)
  ) %>%
  # The result is still grouped by `variable` at this point, so each level's
  # share of events is taken within its own predictor.
  mutate(distn_events = round(n_events / sum(n_events), digits = 4))

想将 event_pct 和 distn_events 作为新变量添加到原始 df。

   # A tibble: 8 x 6
# Groups:   variable [3]
  variable value     n n_events event_pct distn_events
     <chr> <dbl> <int>    <dbl>     <dbl>        <dbl>
1      cyl     4    11        8    0.7273       0.6154
2      cyl     6     7        3    0.4286       0.2308
3      cyl     8    14        2    0.1429       0.1538
4     gear     3    15        0    0.0000       0.0000
5     gear     4    12        8    0.6667       0.6154
6     gear     5     5        5    1.0000       0.3846
7       vs     0    18        6    0.3333       0.4615
8       vs     1    14        7    0.5000       0.5385

希望得到这样一个数据框：在下面的表中追加 cyl_event_pct、cyl_distn_events、gear_event_pct、gear_distn_events 等额外的列，各列的值取自该行对应变量取值的统计量（自变量 am 除外）。

df
                    cyl gear vs am
Mazda RX4             6    4  0  1
Mazda RX4 Wag         6    4  0  1
Datsun 710            4    4  1  1
Hornet 4 Drive        6    3  1  0
Hornet Sportabout     8    3  0  0
Valiant               6    3  1  0
Duster 360            8    3  0  0
Merc 240D             4    4  1  0
Merc 230              4    4  1  0
Merc 280              6    4  1  0
Merc 280C             6    4  1  0

先谢谢各位宇宙高手！ JT

Answer 1

你可以使用

# Variables whose per-level statistics are copied back onto `df`.
cols <- c("cyl", "gear", "vs")

# One pass per statistic keeps the new columns grouped by statistic
# (all *_event_pct columns first, then all *_distn_events columns).
for (stat in c("event_pct", "distn_events")) {
  df[paste0(cols, "_", stat)] <- lapply(cols, function(col) {
    # Rows of the summary table describing the levels of this variable.
    rows <- df_stat[df_stat$variable == col, ]
    # `[[` always returns a plain vector, so the positional look-up below
    # also works when `df_stat` is a tibble (where `[rows, "col"]` would
    # stay a one-column table and the second `[` would index columns).
    rows[[stat]][match(df[[col]], rows$value)]
  })
}
df

输出为：

                  cyl gear vs am cyl_event_pct gear_event_pct vs_event_pct cyl_distn_events gear_distn_events vs_distn_events
Mazda RX4           6    4  0  1        0.4286         0.6667       0.3333           0.2308            0.6154          0.4615
Mazda RX4 Wag       6    4  0  1        0.4286         0.6667       0.3333           0.2308            0.6154          0.4615
Datsun 710          4    4  1  1        0.7273         0.6667       0.5000           0.6154            0.6154          0.5385
Hornet 4 Drive      6    3  1  0        0.4286         0.0000       0.5000           0.2308            0.0000          0.5385
Hornet Sportabout   8    3  0  0        0.1429         0.0000       0.3333           0.1538            0.0000          0.4615
Valiant             6    3  1  0        0.4286         0.0000       0.5000           0.2308            0.0000          0.5385
Duster 360          8    3  0  0        0.1429         0.0000       0.3333           0.1538            0.0000          0.4615
Merc 240D           4    4  1  0        0.7273         0.6667       0.5000           0.6154            0.6154          0.5385
Merc 230            4    4  1  0        0.7273         0.6667       0.5000           0.6154            0.6154          0.5385
Merc 280            6    4  1  0        0.4286         0.6667       0.5000           0.2308            0.6154          0.5385
Merc 280C           6    4  1  0        0.4286         0.6667       0.5000           0.2308            0.6154          0.5385

#sample data
> dput(df)
structure(list(cyl = c(6L, 6L, 4L, 6L, 8L, 6L, 8L, 4L, 4L, 6L, 
6L), gear = c(4L, 4L, 4L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L), vs = c(0L, 
0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L), am = c(1L, 1L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("cyl", "gear", "vs", 
"am"), class = "data.frame", row.names = c("Mazda RX4", "Mazda RX4 Wag", 
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant", 
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C"
))
> dput(df_stat)
structure(list(variable = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 
3L, 3L), .Label = c("cyl", "gear", "vs"), class = "factor"), 
    value = c(4L, 6L, 8L, 3L, 4L, 5L, 0L, 1L), n = c(11L, 7L, 
    14L, 15L, 12L, 5L, 18L, 14L), n_events = c(8L, 3L, 2L, 0L, 
    8L, 5L, 6L, 7L), event_pct = c(0.7273, 0.4286, 0.1429, 0, 
    0.6667, 1, 0.3333, 0.5), distn_events = c(0.6154, 0.2308, 
    0.1538, 0, 0.6154, 0.3846, 0.4615, 0.5385)), .Names = c("variable", 
"value", "n", "n_events", "event_pct", "distn_events"), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8"))

Answer 2

这是我基于 Prem 的解决方案编写的函数。如果有更优雅的方法，请告诉我。

library(dplyr)
library(tidyr)

# Predictor columns to summarise, and the column they are summarised against.
cols <- c("cyl", "gear", "vs")
y <- "am"

#' Append per-level outcome statistics for a set of predictor columns
#'
#' For every column named in `char_col`, the outcome `y` is summarised for
#' each distinct value of that column, and the requested statistics are
#' attached to `df` as new columns named `<column>_<statistic>`. Each row
#' receives the statistic belonging to the value it holds in that column.
#'
#' @param df A data frame containing the columns named in `char_col` and `y`.
#' @param char_col Character vector of predictor column names. If it contains
#'   `y`, that entry is ignored.
#' @param y Name (single string) of the outcome column. It is passed to
#'   `sum()` and `mean()`, so it is expected to be numeric 0/1 or logical.
#' @param stats Which statistics to attach: `"distn_events"` (share of all
#'   events, within one predictor, that fall in the row's level) and/or
#'   `"event_pct"` (mean of `y` within the row's level). Both are rounded to
#'   4 decimals. Defaults to `"distn_events"` only.
#' @return `df` with one added column per predictor and requested statistic,
#'   ordered statistic by statistic. `df` is returned unchanged when no
#'   predictor other than `y` is supplied.
gen_stat_df <- function(df, char_col, y, stats = "distn_events") {
  stopifnot(
    is.data.frame(df),
    is.character(char_col),
    is.character(y),
    length(y) == 1L
  )
  stats <- match.arg(stats, c("distn_events", "event_pct"), several.ok = TRUE)

  missing_cols <- setdiff(c(char_col, y), names(df))
  if (length(missing_cols) > 0) {
    stop(
      "Columns not found in `df`: ", paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # The outcome is never summarised against itself.
  excl_y_cols <- setdiff(char_col, y)
  if (length(excl_y_cols) == 0) {
    return(df)
  }

  # Summary table: one row per (predictor, level).
  # `all_of()` makes it explicit that the character vectors hold column
  # names, so a column of `df` that happens to be called `char_col` or `y`
  # cannot be picked up by mistake.
  df_stat <- df %>%
    select(all_of(c(excl_y_cols, y))) %>%
    gather(variable, value, all_of(excl_y_cols)) %>%
    group_by(variable, value) %>%
    summarise(
      n = n(),
      n_events = sum(.data[[y]]),
      event_pct = round(mean(.data[[y]]), 4),
      # Stay grouped by `variable` so the share below is per predictor.
      .groups = "drop_last"
    ) %>%
    mutate(distn_events = round(n_events / sum(n_events), 4)) %>%
    ungroup() %>%
    as.data.frame()

  # Map every row of `df` to the statistic of the level it holds.
  for (stat in stats) {
    df[paste0(excl_y_cols, "_", stat)] <- lapply(excl_y_cols, function(col) {
      rows <- df_stat[df_stat$variable == col, ]
      rows[[stat]][match(df[[col]], rows$value)]
    })
  }

  df
}
# Example: attach the statistics for `cols` to the full mtcars data.
gen_stat_df(df = mtcars, char_col = cols, y = y)

输出如下：

> gen_stat_df(mtcars,cols,y)
                     mpg cyl  disp  hp drat    wt  qsec vs am gear carb cyl_distn_events gear_distn_events
Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4           0.2308            0.6154
Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4           0.2308            0.6154
Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1           0.6154            0.6154
Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1           0.2308            0.0000
Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2           0.1538            0.0000
Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1           0.2308            0.0000
Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4           0.1538            0.0000
Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2           0.6154            0.6154
Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2           0.6154            0.6154
Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4           0.2308            0.6154
Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4           0.2308            0.6154
Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3           0.1538            0.0000
Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3           0.1538            0.0000

如何将摘要统计信息作为每个统计信息的新变量加入到原始数据框中

How to join summary statistics as new variables for each statistics back to original dataframe

join

r

summary

data-manipulation

dplyr