R如何用多个if else条件向量化一个函数

R how to vectorize a function with multiple if else conditions

您好,我是 R 中向量化函数的新手。我有一个类似于以下的代码。

library(truncnorm)
library(microbenchmark)

num_obs=10000
Observation=seq(1,num_obs)
Obs_Type=sample(1:4, num_obs, replace=T)
Upper_bound = runif(num_obs,0,1)
Lower_bound=runif(num_obs,2,4)
mean = runif(num_obs,10,15)

df1= data.frame(Observation,Obs_Type,Upper_bound,Lower_bound,mean)
df1$draw_value = 0

Trial_func=function(df1){
  for (i in 1:nrow(df1)){
    if (df1[i,"Obs_Type"] ==1){
      #If Type == 1; then a=-Inf, b = Upper_Bound
      df1[i,"draw_value"] = rtruncnorm(1,a=-Inf,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
    } else if (df1[i,"Obs_Type"] ==2){
      #If Type == 2; then a=-10, b = Upper_Bound
      df1[i,"draw_value"] = rtruncnorm(1,a=-10,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
    } else if(df1[i,"Obs_Type"] ==3){
      #If Type == 3; then a=Lower_bound, b = Inf
      df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=Inf,mean= df1[i,"mean"],sd=1)
    } else {
      #If Type == 3; then a=Lower_bound, b = 10
      df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=10,mean= df1[i,"mean"],sd=1)
    }
  }
  return(df1)
}

#Benchmarking
mbm=microbenchmark(Trial_func(df1=df1),times = 10)
summary(mbm)
#For obtaining the new data
New_data=Trial_func(df1=df1)

在上面,我最初创建了一个名为 df1 的数据框。然后我创建一个函数,它接受一个数据集 (df1)。数据集 (df1) 中的每个观察值可以是四种类型之一。这是由 df1$Obs_Type 给出的。我想做的是基于 Obs_Type,我想从具有给定上下点的截断正态分布中提取值。

规则是:

a) 当Obs_Type =1时; a=-Inf, b = Upper_bound 观测值 i.

b) 当Obs_Type =2时; a=-10, b = Upper_bound 观测值 i.

c) 当Obs_Type =3时; a=Upper_bound 观察值 i, b = Inf.

d) 当Obs_Type =4时; a=Upper_bound 观测值 i, b = 10.

其中a = 下限,b = 上限; 此外,观察 i 的平均值由 df1$mean 和 sd = 1.

给出

我不熟悉矢量化,想知道是否有人可以帮助我解决这个问题。我尝试查看有关 SO 的一些其他示例(例如 ),但无法弄清楚当我有多个条件时该怎么做。

我的原始数据集有大约 1000 万个观察值和其他附加条件(例如,我的数据有 16 种类型,而不是 4 种类型,并且每种类型的均值都不同),但我在这里使用了一个更简单的示例。

如果问题的任何部分需要任何额外说明,请告诉我。

dplyr 包中的 case_when 函数可以方便地向量化这种类型的多个 if else 语句。

与其将单个值传递给 "if" 语句,不如传递整个向量以获得非常显着的性能改进。
case_when 还提高了脚本的可读性。

library(dplyr)

Trial_func <- function(df1) {
  df1[,"draw_value"] <- case_when(
    df1$Obs_Type == 1 ~ rtruncnorm(1,a=-Inf,b=df1[,"Upper_bound"],mean= df1[,"mean"], sd=1),
    df1$Obs_Type == 2 ~ rtruncnorm(1,a=-10,b=df1[,"Upper_bound"],mean= df1[,"mean"],sd=1),
    df1$Obs_Type == 3 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=Inf,mean= df1[,"mean"],sd=1),
    df1$Obs_Type == 4 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=10,mean= df1[,"mean"],sd=1)
  )
  df1
}

Trial_func(df1)

这是一种向量化的方式。它创建对应于 4 个条件的逻辑向量 i1i2i3i4。然后它将新值分配给它们索引的位置。

Trial_func2 <- function(df1){
  i1 <- df1[["Obs_Type"]] == 1
  i2 <- df1[["Obs_Type"]] == 2
  i3 <- df1[["Obs_Type"]] == 3
  i4 <- df1[["Obs_Type"]] == 4

  #If Type == 1; then a=-Inf, b = Upper_Bound
  df1[i1, "draw_value"] <- rtruncnorm(sum(i1), a =-Inf, 
                                      b = df1[i1, "Upper_bound"], 
                                      mean = df1[i1, "mean"], sd = 1)
  #If Type == 2; then a=-10, b = Upper_Bound
  df1[i2, "draw_value"] <- rtruncnorm(sum(i2), a = -10,
                                      b = df1[i2 , "Upper_bound"],
                                      mean = df1[i2, "mean"], sd = 1)
  #If Type == 3; then a=Lower_bound, b = Inf
  df1[i3,"draw_value"] <- rtruncnorm(sum(i3), 
                                     a = df1[i3, "Lower_bound"],
                                     b = Inf, mean = df1[i3, "mean"], 
                                     sd = 1)
  #If Type == 3; then a=Lower_bound, b = 10
  df1[i4, "draw_value"] <- rtruncnorm(sum(i4), 
                                      a = df1[i4, "Lower_bound"],
                                      b = 10,
                                      mean = df1[i4,"mean"],
                                      sd = 1)
  df1
}

在速度测试中我命名为Trial_func3

mbm <- microbenchmark(
  loop = Trial_func(df1 = df1),
  vect = Trial_func2(df1 = df1),
  cwhen = Trial_func3(df1 = df1),
  times = 10)

print(mbm, order = "median")
#Unit: milliseconds
#  expr         min          lq       mean      median          uq         max neval cld
#  vect    4.349444    4.371169    4.40920    4.401384    4.450024    4.487453    10  a 
# cwhen   13.458946   13.484247   14.16045   13.528792   13.787951   19.363104    10  a 
#  loop 2125.665690 2138.792497 2211.20887 2157.185408 2201.391083 2453.658767    10   b
library(truncnorm)
library(microbenchmark)

num_obs=10000
Observation=seq(1,num_obs)
Obs_Type=sample(1:4, num_obs, replace=T)
Upper_bound = runif(num_obs,0,1)
Lower_bound=runif(num_obs,2,4)
mean = runif(num_obs,10,15)

df1= data.frame(Observation,Obs_Type,Upper_bound,Lower_bound,mean)
df1$draw_value = 0

###########################
# Your example
###########################

Trial_func=function(df1, seed=NULL){
  if (!is.null(seed)) set.seed(seed)
  for (i in 1:nrow(df1)){
    if (df1[i,"Obs_Type"] ==1){
      #If Type == 1; then a=-Inf, b = Upper_Bound
      df1[i,"draw_value"] = rtruncnorm(1,a=-Inf,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
    } else if (df1[i,"Obs_Type"] ==2){
      #If Type == 2; then a=-10, b = Upper_Bound
      df1[i,"draw_value"] = rtruncnorm(1,a=-10,b=df1[i,"Upper_bound"],mean= df1[i,"mean"],sd=1)
    } else if(df1[i,"Obs_Type"] ==3){
      #If Type == 3; then a=Lower_bound, b = Inf
      df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=Inf,mean= df1[i,"mean"],sd=1)
    } else {
      #If Type == 3; then a=Lower_bound, b = 10
      df1[i,"draw_value"] = rtruncnorm(1,a=df1[i,"Lower_bound"],b=10,mean= df1[i,"mean"],sd=1)
    }
  }
  return(df1)
}

#############################
# Vectorized version
#############################

# for each row-elements define a function
truncated_normal <- function(obs_type, lower_bound, upper_bound, mean, sd) {
    if (obs_type == 1) {
      rtruncnorm(1, a=-Inf, b=upper_bound, mean=mean, sd=sd)
    } else if (obs_type == 2){
      rtruncnorm(1, a=-10, b=upper_bound, mean=mean, sd=sd)
    } else if(obs_type == 3){
      rtruncnorm(1, a=lower_bound, b=Inf, mean=mean, sd=sd)
    } else {
      rtruncnorm(1, a=lower_bound, b=10, mean=mean, sd=sd)
    }
}
# vectorize it
truncated_normal <- Vectorize(truncated_normal)

Trial_func_vec <- function(df, res_col="draw_value", seed=NULL) {
    if (!is.null(seed)) set.seed(seed)
    df[, res_col] <- truncated_normal(
                         obs_type=df[, "Obs_Type"],
                         lower_bound=df[, "Lower_bound"],
                         upper_bound=df[, "Upper_bound"],
                         mean=df[,"mean"],
                         sd=1)
    df
}
#Benchmarking
set.seed(1)
mbm=microbenchmark(Trial_func(df=df1),times = 10)
summary(mbm)

set.seed(1)
mbm_vec=microbenchmark(Trial_func_vec(df=df1),times = 10)
summary(mbm_vec)

## vectorization roughly 3x faster!

#For obtaining the new data
set.seed(1) # important so that randomization is reproducible
new_data=Trial_func(df=df1)
set.seed(1) # important so that randomization is reproducible
vec_data=Trial_func_vec(df=df1)
# since in both cases random number generator is provoked
# exactly once per row in the order of the rows,
# resulting df should be absolutely identical.

all(new_data == vec_data) ## TRUE! They are absolutely identical.
# proving that your code does - in principle - exactly the same
# like my vectorized code

基准测试结果

# @Rui Barradas' function
Trial_func2 <- function(df1){
  i1 <- df1[["Obs_Type"]] == 1
  i2 <- df1[["Obs_Type"]] == 2
  i3 <- df1[["Obs_Type"]] == 3
  i4 <- df1[["Obs_Type"]] == 4

  #If Type == 1; then a=-Inf, b = Upper_Bound
  df1[i1, "draw_value"] <- rtruncnorm(sum(i1), a =-Inf, 
                                      b = df1[i1, "Upper_bound"], 
                                      mean = df1[i1, "mean"], sd = 1)
  #If Type == 2; then a=-10, b = Upper_Bound
  df1[i2, "draw_value"] <- rtruncnorm(sum(i2), a = -10,
                                      b = df1[i2 , "Upper_bound"],
                                      mean = df1[i2, "mean"], sd = 1)
  #If Type == 3; then a=Lower_bound, b = Inf
  df1[i3,"draw_value"] <- rtruncnorm(sum(i3), 
                                     a = df1[i3, "Lower_bound"],
                                     b = Inf, mean = df1[i3, "mean"], 
                                     sd = 1)
  #If Type == 3; then a=Lower_bound, b = 10
  df1[i4, "draw_value"] <- rtruncnorm(sum(i4), 
                                      a = df1[i4, "Lower_bound"],
                                      b = 10,
                                      mean = df1[i4,"mean"],
                                      sd = 1)
  df1
}

# @Dave2e's function
library(dplyr)

Trial_func_dplyr <- function(df1) {
  df1[,"draw_value"] <- case_when(
    df1$Obs_Type == 1 ~ rtruncnorm(1,a=-Inf,b=df1[,"Upper_bound"],mean= df1[,"mean"], sd=1),
    df1$Obs_Type == 2 ~ rtruncnorm(1,a=-10,b=df1[,"Upper_bound"],mean= df1[,"mean"],sd=1),
    df1$Obs_Type == 3 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=Inf,mean= df1[,"mean"],sd=1),
    df1$Obs_Type == 4 ~ rtruncnorm(1,a=df1[,"Lower_bound"],b=10,mean= df1[,"mean"],sd=1)
  )
  df1
}

#Benchmarking
set.seed(1)
mbm <- microbenchmark(
    loop = Trial_func(df1=df1),
    ruy_vect = Trial_func2(df1=df1),
    my_vect = Trial_func_vec(df=df1),
    cwhen = Trial_func_dplyr(df1=df1),
    times=10)


print(mbm, order = "median")

# > print(mbm, order = "median")
# Unit: milliseconds
#      expr         min          lq       mean      median         uq        max
#  ruy_vect    7.583821    7.879766   11.59954    8.815835   10.33289   36.60468
#     cwhen   22.563190   23.103670   25.13804   23.965722   26.49628   30.63777
#   my_vect 1326.771297 1373.415302 1413.75328 1410.995177 1484.28449 1506.11063
#      loop 4149.424632 4269.475169 4486.41376 4423.527566 4742.96651 4911.31992
#  neval cld
#     10 a  
#     10 a  
#     10  b 
#     10   c

# @Rui's vectorize version wins by 3 magnitudes or order!!