是否可以针对 openmp 优化我的代码?
Is it possible to optimize my code for openmp?
我对它的工作原理不是很了解。我已经能够并行化以下代码部分:
```
float opt = 0;
int cont = 0;
#pragma omp parallel for schedule(dynamic)
for(int p=0; p < S; p=p+4){
//cout << p << endl;
IloEnv env;
IloModel mod(env);
IloCplex cplex(mod);
IloArray<IloNumVarArray> C(env, T);
IloArray<IloArray<IloNumVarArray>> X(env, N);
IloArray<IloArray<IloNumVarArray>> B(env, N);
IloArray<IloArray<IloNumVarArray>> V(env, N);
// cout << "1" << endl;
for(int j = 0; j < T; j++)
C[j] = IloNumVarArray(env, 4, 0, 1, ILOINT);
//cout << "2" << endl;
for(int i= 0; i < N; i++){
X[i] = IloArray<IloNumVarArray>(env, T+1);
B[i] = IloArray<IloNumVarArray>(env, T+1);
V[i] = IloArray<IloNumVarArray>(env, T+1);
for(int j= 0; j <= T; j++){
X[i][j] = IloNumVarArray(env, 4);
B[i][j] = IloNumVarArray(env, 4);
V[i][j] = IloNumVarArray(env, 4);
for(int k=0; k < 4; k++){
X[i][j][k] = IloNumVar(env, 0, IloInfinity, ILOFLOAT);
B[i][j][k] = IloNumVar(env, 0, IloInfinity, ILOFLOAT);
V[i][j][k] = IloNumVar(env, 0, IloInfinity, ILOFLOAT);
}
}
}
//===================================================
// Maximize P[i][j][k]*X[i][j][k]
//====================================================
//cout << "3"<< endl;
IloExpr fo(env);
IloExpr penalty1(env);
IloExpr penalty2(env);
for(int i=0; i < N; i++){
for(int k=0; k < 4; k++){
fo += float(1.0/S)*P[i][T][p+k]*X[i][T][k];
}
if(iteration != 0){
for(int k = 0; k < 4; k++){
for(int j = 0; j < T-2; j++){
if(lambda1[i][j][p+k] !=0)
penalty1 += lambda1[i][j][p+k]*(X[i][j][0] - X_at[i][j][p+k]);
penalty1 += (1/2)*rho*(X[i][j][0] - 2*X[i][j][0]*X_at[i][j][p+k] - X_at[i][j][p+k]*X_at[i][j][p+k]);
}
}
}
else{
penalty1 += 0;
// penalty2 += 0;
}
}
IloAdd(mod, IloMaximize(env, fo - penalty1));
penalty1.end();
fo.end();
//========================================================
// sum X[i][0][k]*P[i][0][k] forall i=1,...,N, k=1,...,S
//========================================================
// cout << "4" << endl;
for(int k=0; k < 4; k++){
IloExpr constraint1(env);
for(int i=0; i <N; i++){
constraint1 += P[i][0][p+k]*X[i][0][k];
}
mod.add(constraint1 == Q);
constraint1.end();
}
//======================================================================================
// X[i][j][k] = X[i][j-1][k] + B[i][j][k] - V[i][j][k] for i=1,...,N,j=1,...,T,,k=1,...S
//======================================================================================
// cout << "5" << endl;
for(int i=0; i < N; i++){
for(int j=0; j <= T; j++){
for(int k=0; k < 4; k++){
IloExpr constraint2(env);
if(j==0){
constraint2 += X[i][j][k] - B[i][j][k];
mod.add(V[i][j][k] == 0);
}
else{
constraint2 += X[i][j][k] - X[i][j-1][k] - B[i][j][k] + V[i][j][k];
}
mod.add(constraint2 == 0);
constraint2.end();
}
}
}
//=========================================================================================
// sum P[i][j][k]V[i][j][k] - sum P[i][j][k]B[i][j][k] + ft = lt forall j=1,...,T, k=1,...,S
//=========================================================================================
// cout << "6" << endl;
for(int j = 1; j <= T; j++){
for(int k = 0; k < 4; k++){
IloExpr constraint3_1(env);
IloExpr constraint3_2(env);
for(int i = 0; i < N; i++){
constraint3_1 += P[i][j][p+k]*B[i][j][k];
constraint3_2 += P[i][j][p+k]*V[i][j][k];
}
mod.add(constraint3_2 - constraint3_1 + ft[j-1] == lt[j-1]);
constraint3_1.end();
constraint3_2.end();
}
}
//============================================================================================
// X[i][j][k]*P[i][j][k] < pi * sum X[i][j][k]*P[i][j][k] forall i=1,...,N,j=1,...T, k=1,...,S
// ===========================================================================================
// cout << "7" << endl;
for(int i = 0; i < N; i++){
for(int j = 1; j <= T; j++){
for(int k = 0; k < 4; k++){
IloExpr constraint4(env);
for(int m = 0; m < N; m++){
constraint4 += X[m][j][k]*P[m][j][p+k];
}
mod.add(X[i][j][k]*P[i][j][p+k] <= pi[i]*constraint4);
constraint4.end();
}
}
}
//==================================================================
//K(Lt - Ft) - sum P[i][j][k]*X[i][j][k] <= MC[j][k] forall j=1,...,T, k=1,...,S
//==================================================================
// cout << "8" << endl;
for(int j = 1; j <= T; j++){
for(int k = 0; k < 4; k++){
IloExpr constraint5(env);
for(int i = 0; i < N; i++){
constraint5 += X[i][j][k]*P[i][j][p+k];
}
mod.add(K*(Lt[j-1] - Ft[j-1]) - constraint5 - M*C[j-1][k] <= 0);
constraint5.end();
}
}
//============================================================
// sum C[j][k] <= 2 forall j=1,...,T-2, k=1,...,S
//=============================================================
// cout << "9" << endl;
for(int j=0; j < T-2; j++){
for(int k=0; k < 4; k++){
IloExpr constraint6(env);
for(int t = 0; t <= 2; t++){
constraint6 += C[j+t][k];
}
mod.add(constraint6 <= 2);
constraint6.end();
}
}
for(int i=0; i < N; i++){
for(int j=0; j < T; j++){
if(j == T-1){
mod.add(X[i][j][0] - X[i][j][1] == 0);
mod.add(X[i][j][2] - X[i][j][3] == 0);
}
else{
mod.add(X[i][j][0] - X[i][j][1] == 0);
mod.add(X[i][j][1] - X[i][j][2] == 0);
mod.add(X[i][j][2] - X[i][j][3] == 0);
}
}
}
IloTimer crono(env);// Variável para coletar o tempo
//cplex.setParam(IloCplex::Param::Benders::Strategy, 3);
crono.start();
cplex.setWarning(env.getNullStream());
cplex.setOut(env.getNullStream()); // Eliminar os logs do solver
// cout << "***" << endl;
cplex.solve();
opt = opt + cplex.getObjValue();
crono.stop();
// cout << "10" << endl;
//a solucao da variavel de decisao X e colocada em matriz
for(int i=0; i < N; i++){
for(int j=0; j <= T; j++){
X1[i][j][p] = cplex.getValue(X[i][j][0]);
X1[i][j][p+1] = cplex.getValue(X[i][j][1]);
X1[i][j][p+2] = cplex.getValue(X[i][j][2]);
X1[i][j][p+3] = cplex.getValue(X[i][j][3]);
}
}
/*
for(int j=0; j <= T; j++){
output1 << cplex.getValue(X[0][j][0]) << '\t';
output2 << cplex.getValue(X[1][j][0]) << '\t';
}*/
cont = cont + 1;
//cout << "!" << omp_in_parallel() << endl;
output1 << endl;
output2 << endl;
env.end();
}```
使用 3 个线程,代码运行速度提高了大约 46%。
我可以更改我的代码以获得加速吗?
我还注意到增加线程数没有任何区别。
在您的代码中,您并行解决了多个优化问题。这很好,但有一个问题:默认情况下,单个 CPLEX 实例将使用与您的机器具有的内核一样多的线程,并将尝试让它们保持忙碌。因此,如果您并行求解 N 个模型,那么在每个核心上,CPLEX 的 N 个实例将竞争计算资源。这通常不是一个好主意,因为 CPLEX 进程或多或少完全 CPU 绑定。
加速代码的一个选项是避免在同一核心上使用多个 CPLEX 线程 运行。例如,如果您有 12 个核心和 运行 3 个并行求解,则确保每个求解仅使用 4 个线程。
为了限制单次求解的线程数使用参数IloCplex::Param::Threads。
我对它的工作原理不是很了解。我已经能够并行化以下代码部分:
```
float opt = 0;
int cont = 0;
#pragma omp parallel for schedule(dynamic)
for(int p=0; p < S; p=p+4){
//cout << p << endl;
IloEnv env;
IloModel mod(env);
IloCplex cplex(mod);
IloArray<IloNumVarArray> C(env, T);
IloArray<IloArray<IloNumVarArray>> X(env, N);
IloArray<IloArray<IloNumVarArray>> B(env, N);
IloArray<IloArray<IloNumVarArray>> V(env, N);
// cout << "1" << endl;
for(int j = 0; j < T; j++)
C[j] = IloNumVarArray(env, 4, 0, 1, ILOINT);
//cout << "2" << endl;
for(int i= 0; i < N; i++){
X[i] = IloArray<IloNumVarArray>(env, T+1);
B[i] = IloArray<IloNumVarArray>(env, T+1);
V[i] = IloArray<IloNumVarArray>(env, T+1);
for(int j= 0; j <= T; j++){
X[i][j] = IloNumVarArray(env, 4);
B[i][j] = IloNumVarArray(env, 4);
V[i][j] = IloNumVarArray(env, 4);
for(int k=0; k < 4; k++){
X[i][j][k] = IloNumVar(env, 0, IloInfinity, ILOFLOAT);
B[i][j][k] = IloNumVar(env, 0, IloInfinity, ILOFLOAT);
V[i][j][k] = IloNumVar(env, 0, IloInfinity, ILOFLOAT);
}
}
}
//===================================================
// Maximize P[i][j][k]*X[i][j][k]
//====================================================
//cout << "3"<< endl;
IloExpr fo(env);
IloExpr penalty1(env);
IloExpr penalty2(env);
for(int i=0; i < N; i++){
for(int k=0; k < 4; k++){
fo += float(1.0/S)*P[i][T][p+k]*X[i][T][k];
}
if(iteration != 0){
for(int k = 0; k < 4; k++){
for(int j = 0; j < T-2; j++){
if(lambda1[i][j][p+k] !=0)
penalty1 += lambda1[i][j][p+k]*(X[i][j][0] - X_at[i][j][p+k]);
penalty1 += (1/2)*rho*(X[i][j][0] - 2*X[i][j][0]*X_at[i][j][p+k] - X_at[i][j][p+k]*X_at[i][j][p+k]);
}
}
}
else{
penalty1 += 0;
// penalty2 += 0;
}
}
IloAdd(mod, IloMaximize(env, fo - penalty1));
penalty1.end();
fo.end();
//========================================================
// sum X[i][0][k]*P[i][0][k] forall i=1,...,N, k=1,...,S
//========================================================
// cout << "4" << endl;
for(int k=0; k < 4; k++){
IloExpr constraint1(env);
for(int i=0; i <N; i++){
constraint1 += P[i][0][p+k]*X[i][0][k];
}
mod.add(constraint1 == Q);
constraint1.end();
}
//======================================================================================
// X[i][j][k] = X[i][j-1][k] + B[i][j][k] - V[i][j][k] for i=1,...,N,j=1,...,T,,k=1,...S
//======================================================================================
// cout << "5" << endl;
for(int i=0; i < N; i++){
for(int j=0; j <= T; j++){
for(int k=0; k < 4; k++){
IloExpr constraint2(env);
if(j==0){
constraint2 += X[i][j][k] - B[i][j][k];
mod.add(V[i][j][k] == 0);
}
else{
constraint2 += X[i][j][k] - X[i][j-1][k] - B[i][j][k] + V[i][j][k];
}
mod.add(constraint2 == 0);
constraint2.end();
}
}
}
//=========================================================================================
// sum P[i][j][k]V[i][j][k] - sum P[i][j][k]B[i][j][k] + ft = lt forall j=1,...,T, k=1,...,S
//=========================================================================================
// cout << "6" << endl;
for(int j = 1; j <= T; j++){
for(int k = 0; k < 4; k++){
IloExpr constraint3_1(env);
IloExpr constraint3_2(env);
for(int i = 0; i < N; i++){
constraint3_1 += P[i][j][p+k]*B[i][j][k];
constraint3_2 += P[i][j][p+k]*V[i][j][k];
}
mod.add(constraint3_2 - constraint3_1 + ft[j-1] == lt[j-1]);
constraint3_1.end();
constraint3_2.end();
}
}
//============================================================================================
// X[i][j][k]*P[i][j][k] < pi * sum X[i][j][k]*P[i][j][k] forall i=1,...,N,j=1,...T, k=1,...,S
// ===========================================================================================
// cout << "7" << endl;
for(int i = 0; i < N; i++){
for(int j = 1; j <= T; j++){
for(int k = 0; k < 4; k++){
IloExpr constraint4(env);
for(int m = 0; m < N; m++){
constraint4 += X[m][j][k]*P[m][j][p+k];
}
mod.add(X[i][j][k]*P[i][j][p+k] <= pi[i]*constraint4);
constraint4.end();
}
}
}
//==================================================================
//K(Lt - Ft) - sum P[i][j][k]*X[i][j][k] <= MC[j][k] forall j=1,...,T, k=1,...,S
//==================================================================
// cout << "8" << endl;
for(int j = 1; j <= T; j++){
for(int k = 0; k < 4; k++){
IloExpr constraint5(env);
for(int i = 0; i < N; i++){
constraint5 += X[i][j][k]*P[i][j][p+k];
}
mod.add(K*(Lt[j-1] - Ft[j-1]) - constraint5 - M*C[j-1][k] <= 0);
constraint5.end();
}
}
//============================================================
// sum C[j][k] <= 2 forall j=1,...,T-2, k=1,...,S
//=============================================================
// cout << "9" << endl;
for(int j=0; j < T-2; j++){
for(int k=0; k < 4; k++){
IloExpr constraint6(env);
for(int t = 0; t <= 2; t++){
constraint6 += C[j+t][k];
}
mod.add(constraint6 <= 2);
constraint6.end();
}
}
for(int i=0; i < N; i++){
for(int j=0; j < T; j++){
if(j == T-1){
mod.add(X[i][j][0] - X[i][j][1] == 0);
mod.add(X[i][j][2] - X[i][j][3] == 0);
}
else{
mod.add(X[i][j][0] - X[i][j][1] == 0);
mod.add(X[i][j][1] - X[i][j][2] == 0);
mod.add(X[i][j][2] - X[i][j][3] == 0);
}
}
}
IloTimer crono(env);// Variável para coletar o tempo
//cplex.setParam(IloCplex::Param::Benders::Strategy, 3);
crono.start();
cplex.setWarning(env.getNullStream());
cplex.setOut(env.getNullStream()); // Eliminar os logs do solver
// cout << "***" << endl;
cplex.solve();
opt = opt + cplex.getObjValue();
crono.stop();
// cout << "10" << endl;
//a solucao da variavel de decisao X e colocada em matriz
for(int i=0; i < N; i++){
for(int j=0; j <= T; j++){
X1[i][j][p] = cplex.getValue(X[i][j][0]);
X1[i][j][p+1] = cplex.getValue(X[i][j][1]);
X1[i][j][p+2] = cplex.getValue(X[i][j][2]);
X1[i][j][p+3] = cplex.getValue(X[i][j][3]);
}
}
/*
for(int j=0; j <= T; j++){
output1 << cplex.getValue(X[0][j][0]) << '\t';
output2 << cplex.getValue(X[1][j][0]) << '\t';
}*/
cont = cont + 1;
//cout << "!" << omp_in_parallel() << endl;
output1 << endl;
output2 << endl;
env.end();
}```
使用 3 个线程,代码运行速度提高了大约 46%。 我可以更改我的代码以获得加速吗? 我还注意到增加线程数没有任何区别。
在您的代码中,您并行解决了多个优化问题。这很好,但有一个问题:默认情况下,单个 CPLEX 实例将使用与您的机器具有的内核一样多的线程,并将尝试让它们保持忙碌。因此,如果您并行求解 N 个模型,那么在每个核心上,CPLEX 的 N 个实例将竞争计算资源。这通常不是一个好主意,因为 CPLEX 进程或多或少完全 CPU 绑定。
加速代码的一个选项是避免在同一核心上使用多个 CPLEX 线程 运行。例如,如果您有 12 个核心和 运行 3 个并行求解,则确保每个求解仅使用 4 个线程。
为了限制单次求解的线程数使用参数IloCplex::Param::Threads。