R:手动循环函数

R: Manually Looping Functions

我正在使用 R 编程语言。

对于火车数据:

为测试数据:

我已经在下面发布了与上述步骤相对应的代码:

#PART 1
#create data

library(dplyr)
library(caret)
set.seed(123)



salary <- rnorm(1000,5,5)
height <- rnorm(1000,2,2)
my_data = data.frame(salary, height)


#PART 2
#create train and test data


train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]

#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))




#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary  >  salary_quantiles$quant_1  & train$salary < salary_quantiles$quant_2, "B", "C")))

#PART 5
height_quantiles = data.frame( train %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))



#PART 6
#test

test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary  >  salary_quantiles$quant_1  & test$salary < salary_quantiles$quant_2, "B", "C")))

  test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]

test$accuracy = ifelse(test$height_pred > test$height, 1, 0)

#PART 7 : Results Frame

results = data.frame(test %>%
    group_by(salary_type) %>%
    dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))

results$iteration = 1

results$total_mean = mean(test$accuracy)

#END : view results

 salary_type      Mean iteration total_mean
1           A 0.7472527         1  0.7666667
2           B 0.8090909         1  0.7666667
3           C 0.7373737         1  0.7666667

问题:我想将上述过程转换为“k 折交叉验证”,我在其中测试“第 80 分位数”的准确度(即平均而言,其相应薪资组的第 80 个高度分位数以下的观察值的百分比是多少)。这看起来像下图:

我知道如何手动执行此操作。例如,这是第二次“折叠”(即第二次迭代):

#Second Fold
#PART 2
#create train and test data


train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]

#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))




#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary  >  salary_quantiles$quant_1  & train$salary < salary_quantiles$quant_2, "B", "C")))

#PART 5
height_quantiles = data.frame( train %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))



#PART 6
#test

test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary  >  salary_quantiles$quant_1  & test$salary < salary_quantiles$quant_2, "B", "C")))

  test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]

test$accuracy = ifelse(test$height_pred > test$height, 1, 0)

#PART 7 : Results Frame

results = data.frame(test %>%
    group_by(salary_type) %>%
    dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))

results$iteration = 2

results$total_mean = mean(test$accuracy)

这里是第三折:

#Third Fold 
#PART 2
#create train and test data


train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]

#etc etc etc


#PART 7 : Results Frame

results = data.frame(test %>%
    group_by(salary_type) %>%
    dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))

results$iteration = 3

results$total_mean = mean(test$accuracy)

这可以重复“k”次(即“k 次折叠”、“k 次迭代”)。

问题 我正在尝试为此过程重新创建一个“循环”。最后,将产生以下 table(例如,最大迭代次数 = 10):

     iteration_aka_fold_number salary_a_accuracy salary_b_accuracy salary_c_accuracy total_accuracy
1                         1                79                77                75             79
2                         2                77                79                80             79
3                         3                78                79                71             79

到目前为止我尝试了什么:我尝试编写以下循环来创建上面的 table:

for (i in 1:10)
    
{
    
    
    #PART 2
    #create train_i and test_i data
    
    
    train_i<-sample_frac(my_data, 0.7)
    sid<-as.numeric(rownames(train_i)) 
    test_i<-my_data[-sid,]
    
    #PART 3
    salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
                                                          quant_2 = quantile(salary, 0.66),
                                                          quant_3 = quantile(salary, 0.99)))
    
    
    
    
    #PART 4
    train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary  >  salary_quantiles$quant_1  & train_i$salary < salary_quantiles$quant_2, "B", "C")))
    
    #PART 5
    height_quantiles = data.frame( train_i %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))
    
    
    
    #PART 6
    #test_i
    
    test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary  >  salary_quantiles$quant_1  & test_i$salary < salary_quantiles$quant_2, "B", "C")))
    
    test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
    
    test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
    
    #PART 7 : Results Frame
    
    results_i = data.frame(test_i %>%
                               group_by(salary_type) %>%
                               dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
    
    results_i$iteration = i
    
    results_i$total_mean = mean(test_i$accuracy)
    
}

但这只保留最后一次迭代:

> results
  salary_type      Mean iteration total_mean
1           A 0.7582418        10  0.7566667
2           B 0.7818182        10  0.7566667
3           C 0.7272727        10  0.7566667

有人可以告诉我如何正确编写这个循环吗?

谢谢

也许这会适合:

results <- list()
for (i in 1:10) {
  train_i<-sample_frac(my_data, 0.7)
  sid<-as.numeric(rownames(train_i)) 
  test_i<-my_data[-sid,]
  
  salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
                                                        quant_2 = quantile(salary, 0.66),
                                                        quant_3 = quantile(salary, 0.99)))
  
  train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary  >  salary_quantiles$quant_1  & train_i$salary < salary_quantiles$quant_2, "B", "C")))
  
  height_quantiles = data.frame( train_i %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))
  test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary  >  salary_quantiles$quant_1  & test_i$salary < salary_quantiles$quant_2, "B", "C")))
  test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
  test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
  
  #PART 7 : Results Frame
  
  results_tmp = data.frame(test_i %>%
                           group_by(salary_type) %>%
                           dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
  
  results_tmp$iteration = i
  
  results_tmp$total_mean = mean(test_i$accuracy)
  results[[i]] <- results_tmp
}
results
results_df <- do.call(rbind.data.frame, results)

这会将您的结果加载到一个列表(称为“结果”)中,然后您可以将其缩减为单个数据帧(“results_df”)。这是否解决了您的问题?

问题是你每次都在覆盖结果,所以你只是在 results_i

中得到最终输出

正如之前的帖子所写,你应该在循环之前定义一个对象,即

results <- list() #output as list

然后通过仅修改循环的结尾将每个输出存储为该列表中的一个项目,以便将每个输出保存到列表中,而不是覆盖数据帧

 for (i in 1:10){
  
  
  #PART 2
  #create train_i and test_i data
  
  
  train_i<-sample_frac(my_data, 0.7)
  sid<-as.numeric(rownames(train_i)) 
  test_i<-my_data[-sid,]
  
  #PART 3
  salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
                                                        quant_2 = quantile(salary, 0.66),
                                                        quant_3 = quantile(salary, 0.99)))
  
  
  
  
  #PART 4
  train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary  >  salary_quantiles$quant_1  & train_i$salary < salary_quantiles$quant_2, "B", "C")))
  
  #PART 5
  height_quantiles = data.frame( train_i %>%  group_by(salary_type)  %>%  summarise(quant_80 = quantile(height, 0.80)))
  
  
  
  #PART 6
  #test_i
  
  test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary  >  salary_quantiles$quant_1  & test_i$salary < salary_quantiles$quant_2, "B", "C")))
  
  test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
  
  test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
  
  #PART 7 : Results Frame
  
  results_tmp = data.frame(test_i %>%
                             group_by(salary_type) %>%
                             dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
  
  results_tmp$iteration = i
  
  results_tmp$total_mean = mean(test_i$accuracy)
  
  results[[i]] <- results_tmp
    
  
}

这又和之前的发帖人一样了。然后你可以将列表折叠成一个数据框 reduce() & rbind()

results_df <- results %>% reduce(rbind)