在 R 中保留向量化应用函数的相关数
Keep the correlative number of a vectorized apply function in R
使用下面的数据,我想将以下操作矢量化为一个 apply
调用,但是这样做时我丢失了每个矢量化操作的相关编号。
示例数据如下:
library(VGAM)
library(dplyr)
df<- structure(list(id_choice = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,3L), V1 = c(-2.1578610393492, -7.99842015838894, -2.75141490020271,-10.4498142778112, -1.24287958770339, -0.378360296432904, -5.4054060545789,-1.91973213468639, -3.43124320737278), V2 = c(-0.707981236327309,-2.46810236795902, -0.929869936267716, -5.0157210007768, -0.602127441470801,-0.186681393158885, -2.06095189374723, -0.706069723181491, -1.04581628613078), V3 = c(-0.707981236327309, -2.46810236795902, -0.929869936267716,-3.34318115860932, -0.419902425557776, -0.141349098342894, -2.06095189374723,-0.706069723181491, -1.04581628613078)), row.names = c(NA, -9L), groups = structure(list(id_choice = 1:3, .rows = structure(list(1:3, 4:6, 7:9), ptype = integer(0), class = c("vctrs_list_of","vctrs_vctr", "list"))), row.names = c(NA, 3L), class = c("tbl_df","tbl", "data.frame"), .drop = TRUE), class = c("grouped_df","tbl_df", "tbl", "data.frame"))
# id_choice V1 V2 V3
# 1 1 -2.16 -0.708 -0.708
# 2 1 -8.00 -2.47 -2.47
# 3 1 -2.75 -0.930 -0.930
# 4 2 -10.4 -5.02 -3.34
# 5 2 -1.24 -0.602 -0.420
# 6 2 -0.378 -0.187 -0.141
# 7 3 -5.41 -2.06 -2.06
# 8 3 -1.92 -0.706 -0.706
# 9 3 -3.43 -1.05 -1.05
我需要对每个 V1、V2 和 V3 应用相同的 2 个操作。它们是:
- 生成
U <- V + Gumbel error
- 通过
id_choice
生成一个二进制(choice
)变量,最大为U
广泛的(绝对不合适的)方法如下(也欢迎对当前方法的第 2 点进行任何改进)。我将其称为 The Dumb 方法
愚蠢的[但有效]方法
# For V1...
# adding the random part to the error rgumbel
df$U1 <- df$V1 + rgumbel(nrow(df) , loc=0 , scale=1)
# generating choice variable
id_choice_split_1 <- with(df, split(U1,id_choice))
df$choice1 <- unlist(lapply(id_choice_split_1,
function(uValues)
as.numeric(uValues == max(uValues))))
# ...for V2...
# adding the random part to the error rgumbel
df$U2 <- df$V2 + rgumbel(nrow(df) , loc=0 , scale=1)
# generating choice variable
id_choice_split2 <- with(df, split(U2,id_choice))
df$choice2 <- unlist(lapply(id_choice_split2,
function(uValues)
as.numeric(uValues == max(uValues))))
# ...for V3.
# adding the random part to the error rgumbel
df$U3 <- df$V3 + rgumbel(nrow(df) , loc=0 , scale=1)
# generating choice variable
id_choice_split3 <- with(df, split(U3,id_choice))
df$choice3 <- unlist(lapply(id_choice_split3,
function(uValues)
as.numeric(uValues == max(uValues))))
导致....
# id_choice V1 V2 V3 U1 choice1 U2 choice2 U3 choice3
# 1 1 -2.16 -0.708 -0.708 -2.89 0 -1.09 1 -1.42 0
# 2 1 -8.00 -2.47 -2.47 -8.94 0 -3.17 0 -2.15 0
# 3 1 -2.75 -0.930 -0.930 -2.51 1 -1.22 0 -0.130 1
# 4 2 -10.4 -5.02 -3.34 -8.87 0 -5.77 0 -2.82 0
# 5 2 -1.24 -0.602 -0.420 1.30 1 -0.477 0 -0.942 0
# 6 2 -0.378 -0.187 -0.141 -0.463 0 1.49 1 -0.121 1
# 7 3 -5.41 -2.06 -2.06 -4.28 0 -0.509 0 -1.31 0
# 8 3 -1.92 -0.706 -0.706 -0.633 1 -0.486 1 1.77 1
# 9 3 -3.43 -1.05 -1.05 -2.89 0 -1.68 0 -0.975 0
最后,只保留相关变量(a.k.a:id_choice
choice1
、choice2
和 choice3
).
这是我的期望输出
#keeping just relevant variables
df<- data.frame(select(df,-c(U1,V1,
U2,V2,
U3,V3)))
# id_choice choice1 choice2 choice3
# 1 1 0 1 0
# 2 1 0 0 0
# 3 1 1 0 1
# 4 2 0 0 0
# 5 2 1 0 0
# 6 2 0 1 1
# 7 3 0 0 0
# 8 3 1 1 1
# 9 3 0 0 0
鉴于只有重复的操作,我更愿意将它们自动化。因此,我创建了一个使用 apply
:
调用的函数
智能[但还没有用]方法
choice_generator <- function(V){
require(VGAM)
V <- as.data.frame(V)
# adding the random part to the error rgumbel
U <- V + rgumbel(nrow(V) , loc=0 , scale=1)
# generating choice variable
id_choice_split <- with(df, split(U,id_choice))
choice <- unlist(lapply(id_choice_split,
function(uValues) as.numeric(uValues == max(uValues))))
#keep it as data.frame
choice <- data.frame(choice)
}
# getting a list with the choice variables
list_with_choices <- apply(df[grep('V', names(df))] ,
MARGIN = 2,
FUN =choice_generator )
#generate_data_frame from the obtained list.
df$choices <- do.call("cbind", list_with_choices)
导致...
# id_choice V1 V2 V3 choices$choice $choice $choice
# 1 1 -2.16 -0.708 -0.708 0 0 1
# 2 1 -8.00 -2.47 -2.47 0 0 0
# 3 1 -2.75 -0.930 -0.930 1 1 0
# 4 2 -10.4 -5.02 -3.34 0 0 0
# 5 2 -1.24 -0.602 -0.420 0 0 0
# 6 2 -0.378 -0.187 -0.141 1 1 1
# 7 3 -5.41 -2.06 -2.06 0 0 0
# 8 3 -1.92 -0.706 -0.706 1 1 0
# 9 3 -3.43 -1.05 -1.05 0 0 1
#New names:
#* choices.choice -> choices.choice...5
#* choices.choice -> choices.choice...6
#* choices.choice -> choices.choice...7
colnames(df)
#[1] "id_choice" "V1" "V2" "V3" "choices"
如您所见,我失去了 我想要的输出 ,因为 choice1
、choice2
和 choice3
的相关数我正在摆脱 The Dumb Method 消失了。
是否可以通过修改我的函数来保留相关编号?还是我缺少的另一件事?感谢您提前加载您的时间!
智能[但尚未生效] 方法
保留名称的一种简单方法是在调用 cbind 之前使用 rlist::flatten()
:
do.call("cbind", rlist::list.flatten(list_with_choices))
# V1.choice V2.choice V3.choice
# [1,] 0 1 0
# [2,] 0 0 0
# [3,] 1 0 1
# ...
一种dplyr
方法
我认为这是 dplyr
大放异彩的情况之一:您可以使用简单的 group_by(id_choice)
,而不是 split
ting 然后 unlist
ing 等等,然后在各个 Vx 列上应用一个函数(不需要知道 df
):
choice_generator2 <- function(V){
U <- V + rgumbel(length(V), 0, 1)
1L * (U == max(U))
}
df %>%
group_by(id_choice) %>%
mutate(choice1 = choice_generator2(V1))
您也可以将它应用于所有列(在此过程中替换它们):
df %>%
group_by(id_choice) %>%
mutate(across(starts_with("V"), choice_generator2))
或使用以下值创建新列:
df %>%
group_by(id_choice) %>%
mutate(across(starts_with("V"), choice_generator2, .names = "choice_{.col}"))
# A tibble: 9 x 7
# Groups: id_choice [3]
# id_choice V1 V2 V3 choice_V1 choice_V2 choice_V3
# <int> <dbl> <dbl> <dbl> <int> <int> <int>
# 1 1 -2.16 -0.708 -0.708 1 1 0
# 2 1 -8.00 -2.47 -2.47 0 0 1
# 3 1 -2.75 -0.930 -0.930 0 0 0
# 4 2 -10.4 -5.02 -3.34 0 0 0
# 5 2 -1.24 -0.602 -0.420 1 0 0
# 6 2 -0.378 -0.187 -0.141 0 1 1
# 7 3 -5.41 -2.06 -2.06 0 0 0
# 8 3 -1.92 -0.706 -0.706 1 0 1
# 9 3 -3.43 -1.05 -1.05 0 1 0
使用下面的数据,我想将以下操作矢量化为一个 apply
调用,但是这样做时我丢失了每个矢量化操作的相关编号。
示例数据如下:
library(VGAM)
library(dplyr)
df<- structure(list(id_choice = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,3L), V1 = c(-2.1578610393492, -7.99842015838894, -2.75141490020271,-10.4498142778112, -1.24287958770339, -0.378360296432904, -5.4054060545789,-1.91973213468639, -3.43124320737278), V2 = c(-0.707981236327309,-2.46810236795902, -0.929869936267716, -5.0157210007768, -0.602127441470801,-0.186681393158885, -2.06095189374723, -0.706069723181491, -1.04581628613078), V3 = c(-0.707981236327309, -2.46810236795902, -0.929869936267716,-3.34318115860932, -0.419902425557776, -0.141349098342894, -2.06095189374723,-0.706069723181491, -1.04581628613078)), row.names = c(NA, -9L), groups = structure(list(id_choice = 1:3, .rows = structure(list(1:3, 4:6, 7:9), ptype = integer(0), class = c("vctrs_list_of","vctrs_vctr", "list"))), row.names = c(NA, 3L), class = c("tbl_df","tbl", "data.frame"), .drop = TRUE), class = c("grouped_df","tbl_df", "tbl", "data.frame"))
# id_choice V1 V2 V3
# 1 1 -2.16 -0.708 -0.708
# 2 1 -8.00 -2.47 -2.47
# 3 1 -2.75 -0.930 -0.930
# 4 2 -10.4 -5.02 -3.34
# 5 2 -1.24 -0.602 -0.420
# 6 2 -0.378 -0.187 -0.141
# 7 3 -5.41 -2.06 -2.06
# 8 3 -1.92 -0.706 -0.706
# 9 3 -3.43 -1.05 -1.05
我需要对每个 V1、V2 和 V3 应用相同的 2 个操作。它们是:
- 生成
U <- V + Gumbel error
- 通过
id_choice
生成一个二进制(
choice
)变量,最大为U
广泛的(绝对不合适的)方法如下(也欢迎对当前方法的第 2 点进行任何改进)。我将其称为 The Dumb 方法
愚蠢的[但有效]方法
# For V1...
# adding the random part to the error rgumbel
df$U1 <- df$V1 + rgumbel(nrow(df) , loc=0 , scale=1)
# generating choice variable
id_choice_split_1 <- with(df, split(U1,id_choice))
df$choice1 <- unlist(lapply(id_choice_split_1,
function(uValues)
as.numeric(uValues == max(uValues))))
# ...for V2...
# adding the random part to the error rgumbel
df$U2 <- df$V2 + rgumbel(nrow(df) , loc=0 , scale=1)
# generating choice variable
id_choice_split2 <- with(df, split(U2,id_choice))
df$choice2 <- unlist(lapply(id_choice_split2,
function(uValues)
as.numeric(uValues == max(uValues))))
# ...for V3.
# adding the random part to the error rgumbel
df$U3 <- df$V3 + rgumbel(nrow(df) , loc=0 , scale=1)
# generating choice variable
id_choice_split3 <- with(df, split(U3,id_choice))
df$choice3 <- unlist(lapply(id_choice_split3,
function(uValues)
as.numeric(uValues == max(uValues))))
导致....
# id_choice V1 V2 V3 U1 choice1 U2 choice2 U3 choice3
# 1 1 -2.16 -0.708 -0.708 -2.89 0 -1.09 1 -1.42 0
# 2 1 -8.00 -2.47 -2.47 -8.94 0 -3.17 0 -2.15 0
# 3 1 -2.75 -0.930 -0.930 -2.51 1 -1.22 0 -0.130 1
# 4 2 -10.4 -5.02 -3.34 -8.87 0 -5.77 0 -2.82 0
# 5 2 -1.24 -0.602 -0.420 1.30 1 -0.477 0 -0.942 0
# 6 2 -0.378 -0.187 -0.141 -0.463 0 1.49 1 -0.121 1
# 7 3 -5.41 -2.06 -2.06 -4.28 0 -0.509 0 -1.31 0
# 8 3 -1.92 -0.706 -0.706 -0.633 1 -0.486 1 1.77 1
# 9 3 -3.43 -1.05 -1.05 -2.89 0 -1.68 0 -0.975 0
最后,只保留相关变量(a.k.a:id_choice
choice1
、choice2
和 choice3
).
这是我的期望输出
#keeping just relevant variables
df<- data.frame(select(df,-c(U1,V1,
U2,V2,
U3,V3)))
# id_choice choice1 choice2 choice3
# 1 1 0 1 0
# 2 1 0 0 0
# 3 1 1 0 1
# 4 2 0 0 0
# 5 2 1 0 0
# 6 2 0 1 1
# 7 3 0 0 0
# 8 3 1 1 1
# 9 3 0 0 0
鉴于只有重复的操作,我更愿意将它们自动化。因此,我创建了一个使用 apply
:
智能[但还没有用]方法
choice_generator <- function(V){
require(VGAM)
V <- as.data.frame(V)
# adding the random part to the error rgumbel
U <- V + rgumbel(nrow(V) , loc=0 , scale=1)
# generating choice variable
id_choice_split <- with(df, split(U,id_choice))
choice <- unlist(lapply(id_choice_split,
function(uValues) as.numeric(uValues == max(uValues))))
#keep it as data.frame
choice <- data.frame(choice)
}
# getting a list with the choice variables
list_with_choices <- apply(df[grep('V', names(df))] ,
MARGIN = 2,
FUN =choice_generator )
#generate_data_frame from the obtained list.
df$choices <- do.call("cbind", list_with_choices)
导致...
# id_choice V1 V2 V3 choices$choice $choice $choice
# 1 1 -2.16 -0.708 -0.708 0 0 1
# 2 1 -8.00 -2.47 -2.47 0 0 0
# 3 1 -2.75 -0.930 -0.930 1 1 0
# 4 2 -10.4 -5.02 -3.34 0 0 0
# 5 2 -1.24 -0.602 -0.420 0 0 0
# 6 2 -0.378 -0.187 -0.141 1 1 1
# 7 3 -5.41 -2.06 -2.06 0 0 0
# 8 3 -1.92 -0.706 -0.706 1 1 0
# 9 3 -3.43 -1.05 -1.05 0 0 1
#New names:
#* choices.choice -> choices.choice...5
#* choices.choice -> choices.choice...6
#* choices.choice -> choices.choice...7
colnames(df)
#[1] "id_choice" "V1" "V2" "V3" "choices"
如您所见,我失去了 我想要的输出 ,因为 choice1
、choice2
和 choice3
的相关数我正在摆脱 The Dumb Method 消失了。
是否可以通过修改我的函数来保留相关编号?还是我缺少的另一件事?感谢您提前加载您的时间!
智能[但尚未生效] 方法
保留名称的一种简单方法是在调用 cbind 之前使用 rlist::flatten()
:
do.call("cbind", rlist::list.flatten(list_with_choices))
# V1.choice V2.choice V3.choice
# [1,] 0 1 0
# [2,] 0 0 0
# [3,] 1 0 1
# ...
一种dplyr
方法
我认为这是 dplyr
大放异彩的情况之一:您可以使用简单的 group_by(id_choice)
,而不是 split
ting 然后 unlist
ing 等等,然后在各个 Vx 列上应用一个函数(不需要知道 df
):
choice_generator2 <- function(V){
U <- V + rgumbel(length(V), 0, 1)
1L * (U == max(U))
}
df %>%
group_by(id_choice) %>%
mutate(choice1 = choice_generator2(V1))
您也可以将它应用于所有列(在此过程中替换它们):
df %>%
group_by(id_choice) %>%
mutate(across(starts_with("V"), choice_generator2))
或使用以下值创建新列:
df %>%
group_by(id_choice) %>%
mutate(across(starts_with("V"), choice_generator2, .names = "choice_{.col}"))
# A tibble: 9 x 7
# Groups: id_choice [3]
# id_choice V1 V2 V3 choice_V1 choice_V2 choice_V3
# <int> <dbl> <dbl> <dbl> <int> <int> <int>
# 1 1 -2.16 -0.708 -0.708 1 1 0
# 2 1 -8.00 -2.47 -2.47 0 0 1
# 3 1 -2.75 -0.930 -0.930 0 0 0
# 4 2 -10.4 -5.02 -3.34 0 0 0
# 5 2 -1.24 -0.602 -0.420 1 0 0
# 6 2 -0.378 -0.187 -0.141 0 1 1
# 7 3 -5.41 -2.06 -2.06 0 0 0
# 8 3 -1.92 -0.706 -0.706 1 0 1
# 9 3 -3.43 -1.05 -1.05 0 1 0