Bandits with Rcpp
This is a second attempt at correcting my earlier version. I am translating the epsilon-greedy algorithm for the multi-armed bandit problem into Rcpp.
The code is summarized below. Basically, we have a set of arms, each of which pays out a reward with a predefined probability, and our job is to show that drawing arms at random, while intermittently drawing the arm with the best observed reward, eventually lets us converge on the best arm.
The original algorithm can be found here.
#define ARMA_64BIT_WORD
#include <RcppArmadillo.h>

using namespace Rcpp;

// [[Rcpp::depends(RcppArmadillo)]]
// [[Rcpp::plugins(cpp11)]]

struct EpsilonGreedy {
  double epsilon;
  arma::uvec counts;
  arma::vec values;
};

int index_max(arma::vec& v) {
  return v.index_max();
}

int index_rand(arma::vec& v) {
  int s = arma::randi<int>(arma::distr_param(0, v.n_elem - 1));
  return s;
}

// With probability 1 - epsilon exploit the best arm so far,
// otherwise explore a random arm.
int select_arm(EpsilonGreedy& algo) {
  if (R::runif(0, 1) > algo.epsilon) {
    return index_max(algo.values);
  } else {
    return index_rand(algo.values);
  }
}

// Update the running estimate of the chosen arm's mean reward.
void update(EpsilonGreedy& algo, int chosen_arm, double reward) {
  algo.counts[chosen_arm] += 1;
  int n = algo.counts[chosen_arm];
  double value = algo.values[chosen_arm];
  algo.values[chosen_arm] = ((n-1)/n) * value + (1/n) * reward;
}

struct BernoulliArm {
  double p;
};

// A Bernoulli arm pays 1 with probability p, otherwise 0.
int draw(BernoulliArm arm) {
  if (R::runif(0, 1) > arm.p) {
    return 0;
  } else {
    return 1;
  }
}

// Run n_sims independent simulations of `horizon` pulls each and
// record every draw.
// [[Rcpp::export]]
DataFrame test_algorithm(double epsilon, std::vector<double>& means,
                         int n_sims, int horizon) {
  std::vector<BernoulliArm> arms;
  for (auto& mu : means) {
    BernoulliArm b = {mu};
    arms.push_back(b);
  }

  std::vector<int> sim_num, time, chosen_arms;
  std::vector<double> rewards;

  for (int sim = 1; sim <= n_sims; ++sim) {
    arma::uvec counts(means.size(), arma::fill::zeros);
    arma::vec values(means.size(), arma::fill::zeros);
    EpsilonGreedy algo = {epsilon, counts, values};

    for (int t = 1; t <= horizon; ++t) {
      int chosen_arm = select_arm(algo);
      double reward = draw(arms[chosen_arm]);
      update(algo, chosen_arm, reward);

      sim_num.push_back(sim);
      time.push_back(t);
      chosen_arms.push_back(chosen_arm);
      rewards.push_back(reward);
    }
  }

  DataFrame results = DataFrame::create(Named("sim_num") = sim_num,
                                        Named("time") = time,
                                        Named("chosen_arm") = chosen_arms,
                                        Named("reward") = rewards);
  return results;
}
/***R
library(tidyverse)

means <- c(0.1, 0.1, 0.1, 0.1, 0.9)

total_results <- data.frame(sim_num = integer(), time = integer(),
                            chosen_arm = integer(),
                            reward = numeric(), epsilon = numeric())

for (epsilon in seq(0.1, 0.5, length.out = 5)) {
  cat("Starting with ", epsilon, " at: ", format(Sys.time(), "%H:%M"), "\n")
  results <- test_algorithm(epsilon, means, 5000, 250)
  results$epsilon <- epsilon
  total_results <- rbind(total_results, results)
}

avg_reward <- total_results %>% group_by(time, epsilon) %>%
              summarize(avg_reward = mean(reward))

dev.new()
ggplot(avg_reward) +
  geom_line(aes(x = time, y = avg_reward,
                group = epsilon, color = epsilon), size = 1) +
  scale_color_gradient(low = "grey", high = "black") +
  labs(x = "Time",
       y = "Average reward",
       title = "Performance of the Epsilon-Greedy Algorithm",
       color = "epsilon\n")
*/
The above code returns the following plot:
This plot is completely wrong! But I haven't been able to zero in on the logical flaw in the code... Where am I going off track?
EDIT:
As per the comments, here is the expected plot:
In this code:
int n = algo.counts[chosen_arm];
//...
algo.values[chosen_arm] = ((n-1)/n) * value + (1/n) * reward;
n is declared as an int, so (n-1)/n and 1/n are integer expressions, both of which evaluate to 0. You can fix this by changing 1 to 1.0, a floating-point constant, which forces the expressions to be evaluated as double:
algo.values[chosen_arm] = ((n-1.0)/n) * value + (1.0/n) * reward;
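For completeness, here is a minimal, standalone sketch of the same pitfall (plain C++, no Rcpp needed; the particular values of n, value, and reward are illustrative, not taken from the original post). It shows the truncating form next to the 1.0-literal fix, an explicit cast, and the equivalent incremental-mean form value + (reward - value)/n:

#include <iostream>

int main() {
    int n = 3;
    double value = 0.5, reward = 1.0;

    // Buggy form: (n - 1) / n and 1 / n are integer expressions equal to 0.
    double buggy = ((n - 1) / n) * value + (1 / n) * reward;

    // Fixed form: a floating-point literal forces double arithmetic.
    double fixed = ((n - 1.0) / n) * value + (1.0 / n) * reward;

    // Equivalent fix via an explicit cast.
    double casted = ((static_cast<double>(n) - 1) / n) * value + (1.0 / n) * reward;

    // Equivalent incremental-mean form, which sidesteps the issue.
    double incremental = value + (reward - value) / static_cast<double>(n);

    std::cout << buggy << " " << fixed << " "
              << casted << " " << incremental << "\n";
    return 0;
}

Compiled with any C++11 compiler, the buggy form prints 0 while the other three print 0.666667. With integer division the value estimates never move away from zero, so the algorithm has nothing to exploit and cannot converge on the best arm, which is why the plot comes out flat.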