R:手动循环函数
R: Manually Looping Functions
我正在使用 R 编程语言。
- 在这个问题中,我有一个包含 2 个变量的数据集:身高和薪水
- 我随机将这个数据集分成两部分:训练和测试
对于火车数据:
- 我首先将 Salary 变量分成 3 个分位数(0.33、0.66 和 0.99)
- 对于这些薪水分位数中的每一个,我为身高变量计算相应的第 80 个分位数
为测试数据:
- 我使用与训练数据相同的薪水分位数组织测试数据
- 使用列车数据中的第 80 个高度分位数,我计算出小于第 80 个分位数的行的百分比。
- 然后我将所有这些信息汇总成一个结果table
我已经在下面发布了与上述步骤相对应的代码:
#PART 1
#create data
library(dplyr)
library(caret)
set.seed(123)
salary <- rnorm(1000,5,5)
height <- rnorm(1000,2,2)
my_data = data.frame(salary, height)
#PART 2
#create train and test data
train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary > salary_quantiles$quant_1 & train$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test
test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary > salary_quantiles$quant_1 & test$salary < salary_quantiles$quant_2, "B", "C")))
test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]
test$accuracy = ifelse(test$height_pred > test$height, 1, 0)
#PART 7 : Results Frame
results = data.frame(test %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results$iteration = 1
results$total_mean = mean(test$accuracy)
#END : view results
salary_type Mean iteration total_mean
1 A 0.7472527 1 0.7666667
2 B 0.8090909 1 0.7666667
3 C 0.7373737 1 0.7666667
问题:我想将上述过程转换为“k 折交叉验证”,我在其中测试“第 80 分位数”的准确度(即平均而言,其相应薪资组的第 80 个高度分位数以下的观察值的百分比是多少)。这看起来像下图:
我知道如何手动执行此操作。例如,这是第二次“折叠”(即第二次迭代):
#Second Fold
#PART 2
#create train and test data
train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary > salary_quantiles$quant_1 & train$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test
test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary > salary_quantiles$quant_1 & test$salary < salary_quantiles$quant_2, "B", "C")))
test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]
test$accuracy = ifelse(test$height_pred > test$height, 1, 0)
#PART 7 : Results Frame
results = data.frame(test %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results$iteration = 2
results$total_mean = mean(test$accuracy)
这里是第三折:
#Third Fold
#PART 2
#create train and test data
train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]
#etc etc etc
#PART 7 : Results Frame
results = data.frame(test %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results$iteration = 3
results$total_mean = mean(test$accuracy)
这可以重复“k”次(即“k 次折叠”、“k 次迭代”)。
问题 我正在尝试为此过程重新创建一个“循环”。最后,将产生以下 table(例如,最大迭代次数 = 10):
iteration_aka_fold_number salary_a_accuracy salary_b_accuracy salary_c_accuracy total_accuracy
1 1 79 77 75 79
2 2 77 79 80 79
3 3 78 79 71 79
到目前为止我尝试了什么:我尝试编写以下循环来创建上面的 table:
for (i in 1:10)
{
#PART 2
#create train_i and test_i data
train_i<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train_i))
test_i<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary > salary_quantiles$quant_1 & train_i$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train_i %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test_i
test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary > salary_quantiles$quant_1 & test_i$salary < salary_quantiles$quant_2, "B", "C")))
test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
#PART 7 : Results Frame
results_i = data.frame(test_i %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results_i$iteration = i
results_i$total_mean = mean(test_i$accuracy)
}
但这只保留最后一次迭代:
> results
salary_type Mean iteration total_mean
1 A 0.7582418 10 0.7566667
2 B 0.7818182 10 0.7566667
3 C 0.7272727 10 0.7566667
有人可以告诉我如何正确编写这个循环吗?
谢谢
也许这会适合:
results <- list()
for (i in 1:10) {
train_i<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train_i))
test_i<-my_data[-sid,]
salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary > salary_quantiles$quant_1 & train_i$salary < salary_quantiles$quant_2, "B", "C")))
height_quantiles = data.frame( train_i %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary > salary_quantiles$quant_1 & test_i$salary < salary_quantiles$quant_2, "B", "C")))
test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
#PART 7 : Results Frame
results_tmp = data.frame(test_i %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results_tmp$iteration = i
results_tmp$total_mean = mean(test_i$accuracy)
results[[i]] <- results_tmp
}
results
results_df <- do.call(rbind.data.frame, results)
这会将您的结果加载到一个列表(称为“结果”)中,然后您可以将其缩减为单个数据帧(“results_df”)。这是否解决了您的问题?
问题是你每次都在覆盖结果,所以你只是在 results_i
中得到最终输出
正如之前的帖子所写,你应该在循环之前定义一个对象,即
results <- list() #output as list
然后通过仅修改循环的结尾将每个输出存储为该列表中的一个项目,以便将每个输出保存到列表中,而不是覆盖数据帧
for (i in 1:10){
#PART 2
#create train_i and test_i data
train_i<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train_i))
test_i<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary > salary_quantiles$quant_1 & train_i$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train_i %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test_i
test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary > salary_quantiles$quant_1 & test_i$salary < salary_quantiles$quant_2, "B", "C")))
test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
#PART 7 : Results Frame
results_tmp = data.frame(test_i %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results_tmp$iteration = i
results_tmp$total_mean = mean(test_i$accuracy)
results[[i]] <- results_tmp
}
这又和之前的发帖人一样了。然后你可以将列表折叠成一个数据框 reduce()
& rbind()
results_df <- results %>% reduce(rbind)
我正在使用 R 编程语言。
- 在这个问题中,我有一个包含 2 个变量的数据集:身高和薪水
- 我随机将这个数据集分成两部分:训练和测试
对于火车数据:
- 我首先将 Salary 变量分成 3 个分位数(0.33、0.66 和 0.99)
- 对于这些薪水分位数中的每一个,我为身高变量计算相应的第 80 个分位数
为测试数据:
- 我使用与训练数据相同的薪水分位数组织测试数据
- 使用列车数据中的第 80 个高度分位数,我计算出小于第 80 个分位数的行的百分比。
- 然后我将所有这些信息汇总成一个结果table
我已经在下面发布了与上述步骤相对应的代码:
#PART 1
#create data
library(dplyr)
library(caret)
set.seed(123)
salary <- rnorm(1000,5,5)
height <- rnorm(1000,2,2)
my_data = data.frame(salary, height)
#PART 2
#create train and test data
train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary > salary_quantiles$quant_1 & train$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test
test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary > salary_quantiles$quant_1 & test$salary < salary_quantiles$quant_2, "B", "C")))
test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]
test$accuracy = ifelse(test$height_pred > test$height, 1, 0)
#PART 7 : Results Frame
results = data.frame(test %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results$iteration = 1
results$total_mean = mean(test$accuracy)
#END : view results
salary_type Mean iteration total_mean
1 A 0.7472527 1 0.7666667
2 B 0.8090909 1 0.7666667
3 C 0.7373737 1 0.7666667
问题:我想将上述过程转换为“k 折交叉验证”,我在其中测试“第 80 分位数”的准确度(即平均而言,其相应薪资组的第 80 个高度分位数以下的观察值的百分比是多少)。这看起来像下图:
我知道如何手动执行此操作。例如,这是第二次“折叠”(即第二次迭代):
#Second Fold
#PART 2
#create train and test data
train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train$salary_type = as.factor(ifelse(train$salary < salary_quantiles$quant_1 , "A", ifelse( train$salary > salary_quantiles$quant_1 & train$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test
test$salary_type = as.factor(ifelse(test$salary < salary_quantiles$quant_1 , "A", ifelse( test$salary > salary_quantiles$quant_1 & test$salary < salary_quantiles$quant_2, "B", "C")))
test$height_pred <- height_quantiles$quant_80[match(test$salary_type, height_quantiles$salary_type)]
test$accuracy = ifelse(test$height_pred > test$height, 1, 0)
#PART 7 : Results Frame
results = data.frame(test %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results$iteration = 2
results$total_mean = mean(test$accuracy)
这里是第三折:
#Third Fold
#PART 2
#create train and test data
train<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train)) # because rownames() returns character
test<-my_data[-sid,]
#etc etc etc
#PART 7 : Results Frame
results = data.frame(test %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results$iteration = 3
results$total_mean = mean(test$accuracy)
这可以重复“k”次(即“k 次折叠”、“k 次迭代”)。
问题 我正在尝试为此过程重新创建一个“循环”。最后,将产生以下 table(例如,最大迭代次数 = 10):
iteration_aka_fold_number salary_a_accuracy salary_b_accuracy salary_c_accuracy total_accuracy
1 1 79 77 75 79
2 2 77 79 80 79
3 3 78 79 71 79
到目前为止我尝试了什么:我尝试编写以下循环来创建上面的 table:
for (i in 1:10)
{
#PART 2
#create train_i and test_i data
train_i<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train_i))
test_i<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary > salary_quantiles$quant_1 & train_i$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train_i %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test_i
test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary > salary_quantiles$quant_1 & test_i$salary < salary_quantiles$quant_2, "B", "C")))
test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
#PART 7 : Results Frame
results_i = data.frame(test_i %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results_i$iteration = i
results_i$total_mean = mean(test_i$accuracy)
}
但这只保留最后一次迭代:
> results
salary_type Mean iteration total_mean
1 A 0.7582418 10 0.7566667
2 B 0.7818182 10 0.7566667
3 C 0.7272727 10 0.7566667
有人可以告诉我如何正确编写这个循环吗?
谢谢
也许这会适合:
results <- list()
for (i in 1:10) {
train_i<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train_i))
test_i<-my_data[-sid,]
salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary > salary_quantiles$quant_1 & train_i$salary < salary_quantiles$quant_2, "B", "C")))
height_quantiles = data.frame( train_i %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary > salary_quantiles$quant_1 & test_i$salary < salary_quantiles$quant_2, "B", "C")))
test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
#PART 7 : Results Frame
results_tmp = data.frame(test_i %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results_tmp$iteration = i
results_tmp$total_mean = mean(test_i$accuracy)
results[[i]] <- results_tmp
}
results
results_df <- do.call(rbind.data.frame, results)
这会将您的结果加载到一个列表(称为“结果”)中,然后您可以将其缩减为单个数据帧(“results_df”)。这是否解决了您的问题?
问题是你每次都在覆盖结果,所以你只是在 results_i
正如之前的帖子所写,你应该在循环之前定义一个对象,即
results <- list() #output as list
然后通过仅修改循环的结尾将每个输出存储为该列表中的一个项目,以便将每个输出保存到列表中,而不是覆盖数据帧
for (i in 1:10){
#PART 2
#create train_i and test_i data
train_i<-sample_frac(my_data, 0.7)
sid<-as.numeric(rownames(train_i))
test_i<-my_data[-sid,]
#PART 3
salary_quantiles = data.frame( train_i %>% summarise (quant_1 = quantile(salary, 0.33),
quant_2 = quantile(salary, 0.66),
quant_3 = quantile(salary, 0.99)))
#PART 4
train_i$salary_type = as.factor(ifelse(train_i$salary < salary_quantiles$quant_1 , "A", ifelse( train_i$salary > salary_quantiles$quant_1 & train_i$salary < salary_quantiles$quant_2, "B", "C")))
#PART 5
height_quantiles = data.frame( train_i %>% group_by(salary_type) %>% summarise(quant_80 = quantile(height, 0.80)))
#PART 6
#test_i
test_i$salary_type = as.factor(ifelse(test_i$salary < salary_quantiles$quant_1 , "A", ifelse( test_i$salary > salary_quantiles$quant_1 & test_i$salary < salary_quantiles$quant_2, "B", "C")))
test_i$height_pred <- height_quantiles$quant_80[match(test_i$salary_type, height_quantiles$salary_type)]
test_i$accuracy = ifelse(test_i$height_pred > test_i$height, 1, 0)
#PART 7 : Results Frame
results_tmp = data.frame(test_i %>%
group_by(salary_type) %>%
dplyr::summarize(Mean = mean(accuracy, na.rm=TRUE)))
results_tmp$iteration = i
results_tmp$total_mean = mean(test_i$accuracy)
results[[i]] <- results_tmp
}
这又和之前的发帖人一样了。然后你可以将列表折叠成一个数据框 reduce()
& rbind()
results_df <- results %>% reduce(rbind)