在 R 中保留向量化应用函数的相关数

Keep the correlative number of a vectorized apply function in R

使用下面的数据,我想将以下操作矢量化为一个 apply 调用,但是这样做时我丢失了每个矢量化操作的相关编号。

示例数据如下:

library(VGAM)
library(dplyr)

df<- structure(list(id_choice = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,3L), V1 = c(-2.1578610393492, -7.99842015838894, -2.75141490020271,-10.4498142778112, -1.24287958770339, -0.378360296432904, -5.4054060545789,-1.91973213468639, -3.43124320737278), V2 = c(-0.707981236327309,-2.46810236795902, -0.929869936267716, -5.0157210007768, -0.602127441470801,-0.186681393158885, -2.06095189374723, -0.706069723181491, -1.04581628613078), V3 = c(-0.707981236327309, -2.46810236795902, -0.929869936267716,-3.34318115860932, -0.419902425557776, -0.141349098342894, -2.06095189374723,-0.706069723181491, -1.04581628613078)), row.names = c(NA, -9L), groups = structure(list(id_choice = 1:3, .rows = structure(list(1:3, 4:6, 7:9), ptype = integer(0), class = c("vctrs_list_of","vctrs_vctr", "list"))), row.names = c(NA, 3L), class = c("tbl_df","tbl", "data.frame"), .drop = TRUE), class = c("grouped_df","tbl_df", "tbl", "data.frame"))

# id_choice      V1     V2     V3
# 1         1  -2.16  -0.708 -0.708
# 2         1  -8.00  -2.47  -2.47 
# 3         1  -2.75  -0.930 -0.930
# 4         2 -10.4   -5.02  -3.34 
# 5         2  -1.24  -0.602 -0.420
# 6         2  -0.378 -0.187 -0.141
# 7         3  -5.41  -2.06  -2.06 
# 8         3  -1.92  -0.706 -0.706
# 9         3  -3.43  -1.05  -1.05 

我需要对每个 V1、V2 和 V3 应用相同的 2 个操作。它们是:

  1. 生成U <- V + Gumbel error
  2. 通过id_choice
  3. 生成一个二进制(choice)变量,最大为U

广泛的(绝对不合适的)方法如下(也欢迎对当前方法的第 2 点进行任何改进)。我将其称为 The Dumb 方法

愚蠢的[但有效]方法

# For V1...
# adding the random part to the error rgumbel
df$U1 <- df$V1 + rgumbel(nrow(df) , loc=0 , scale=1)  
# generating choice variable 
id_choice_split_1 <- with(df, split(U1,id_choice))
df$choice1 <- unlist(lapply(id_choice_split_1,
                            function(uValues)
                              as.numeric(uValues == max(uValues))))

# ...for V2...
# adding the random part to the error rgumbel
df$U2 <- df$V2 + rgumbel(nrow(df) , loc=0 , scale=1)  
# generating choice variable 
id_choice_split2 <- with(df, split(U2,id_choice))
df$choice2       <- unlist(lapply(id_choice_split2,
                                  function(uValues)
                                    as.numeric(uValues == max(uValues))))

# ...for V3.
# adding the random part to the error rgumbel
df$U3 <- df$V3 + rgumbel(nrow(df) , loc=0 , scale=1)  
# generating choice variable 
id_choice_split3 <- with(df, split(U3,id_choice))
df$choice3       <- unlist(lapply(id_choice_split3,
                                  function(uValues)
                                    as.numeric(uValues == max(uValues))))

导致....

# id_choice      V1     V2     V3     U1   choice1   U2   choice2   U3   choice3
# 1         1  -2.16  -0.708 -0.708 -2.89        0 -1.09        1 -1.42        0
# 2         1  -8.00  -2.47  -2.47  -8.94        0 -3.17        0 -2.15        0
# 3         1  -2.75  -0.930 -0.930 -2.51        1 -1.22        0 -0.130       1
# 4         2 -10.4   -5.02  -3.34  -8.87        0 -5.77        0 -2.82        0
# 5         2  -1.24  -0.602 -0.420  1.30        1 -0.477       0 -0.942       0
# 6         2  -0.378 -0.187 -0.141 -0.463       0  1.49        1 -0.121       1
# 7         3  -5.41  -2.06  -2.06  -4.28        0 -0.509       0 -1.31        0
# 8         3  -1.92  -0.706 -0.706 -0.633       1 -0.486       1  1.77        1
# 9         3  -3.43  -1.05  -1.05  -2.89        0 -1.68        0 -0.975       0

最后,只保留相关变量(a.k.a:id_choice choice1choice2choice3 ).

这是我的期望输出

#keeping just relevant variables 
df<- data.frame(select(df,-c(U1,V1,
                                U2,V2,
                                U3,V3)))

#   id_choice choice1 choice2 choice3
# 1         1       0       1       0
# 2         1       0       0       0
# 3         1       1       0       1
# 4         2       0       0       0
# 5         2       1       0       0
# 6         2       0       1       1
# 7         3       0       0       0
# 8         3       1       1       1
# 9         3       0       0       0

鉴于只有重复的操作,我更愿意将它们自动化。因此,我创建了一个使用 apply:

调用的函数

智能[但还没有用]方法

choice_generator <- function(V){
  require(VGAM)
  V <- as.data.frame(V)
  # adding the random part to the error rgumbel
  U <- V + rgumbel(nrow(V) , loc=0 , scale=1)  
  # generating choice variable 
  id_choice_split <- with(df, split(U,id_choice))
  choice       <- unlist(lapply(id_choice_split,
                                function(uValues) as.numeric(uValues == max(uValues))))
  #keep it as data.frame
  choice <- data.frame(choice)
}

# getting a list with the choice variables
list_with_choices <-  apply(df[grep('V', names(df))] ,
                            MARGIN = 2,
                            FUN =choice_generator )

#generate_data_frame from the obtained list.
df$choices <- do.call("cbind", list_with_choices) 

导致...

# id_choice      V1     V2     V3 choices$choice $choice $choice
# 1         1  -2.16  -0.708 -0.708              0       0       1
# 2         1  -8.00  -2.47  -2.47               0       0       0
# 3         1  -2.75  -0.930 -0.930              1       1       0
# 4         2 -10.4   -5.02  -3.34               0       0       0
# 5         2  -1.24  -0.602 -0.420              0       0       0
# 6         2  -0.378 -0.187 -0.141              1       1       1
# 7         3  -5.41  -2.06  -2.06               0       0       0
# 8         3  -1.92  -0.706 -0.706              1       1       0
# 9         3  -3.43  -1.05  -1.05               0       0       1

#New names:
#* choices.choice -> choices.choice...5
#* choices.choice -> choices.choice...6
#* choices.choice -> choices.choice...7


colnames(df)
#[1] "id_choice" "V1"        "V2"        "V3"        "choices"  

如您所见,我失去了 我想要的输出 ,因为 choice1choice2choice3 的相关数我正在摆脱 The Dumb Method 消失了。

是否可以通过修改我的函数来保留相关编号?还是我缺少的另一件事?感谢您提前加载您的时间!

智能[但尚未生效] 方法

保留名称的一种简单方法是在调用 cbind 之前使用 rlist::flatten()

do.call("cbind", rlist::list.flatten(list_with_choices)) 
#      V1.choice V2.choice V3.choice
# [1,]         0         1         0
# [2,]         0         0         0
# [3,]         1         0         1
# ...

一种dplyr方法

我认为这是 dplyr 大放异彩的情况之一:您可以使用简单的 group_by(id_choice),而不是 splitting 然后 unlisting 等等,然后在各个 Vx 列上应用一个函数(不需要知道 df):

choice_generator2 <- function(V){
  U <- V + rgumbel(length(V), 0, 1)
  1L * (U == max(U))
}

df %>%
  group_by(id_choice) %>%
  mutate(choice1 = choice_generator2(V1))

您也可以将它应用于所有列(在此过程中替换它们):

df %>%
  group_by(id_choice) %>%
  mutate(across(starts_with("V"), choice_generator2))

或使用以下值创建新列:

df %>%
  group_by(id_choice) %>%
  mutate(across(starts_with("V"), choice_generator2, .names = "choice_{.col}"))
# A tibble: 9 x 7
# Groups:   id_choice [3]
#   id_choice      V1     V2     V3 choice_V1 choice_V2 choice_V3
#       <int>   <dbl>  <dbl>  <dbl>     <int>     <int>     <int>
# 1         1  -2.16  -0.708 -0.708         1         1         0
# 2         1  -8.00  -2.47  -2.47          0         0         1
# 3         1  -2.75  -0.930 -0.930         0         0         0
# 4         2 -10.4   -5.02  -3.34          0         0         0
# 5         2  -1.24  -0.602 -0.420         1         0         0
# 6         2  -0.378 -0.187 -0.141         0         1         1
# 7         3  -5.41  -2.06  -2.06          0         0         0
# 8         3  -1.92  -0.706 -0.706         1         0         1
# 9         3  -3.43  -1.05  -1.05          0         1         0