在c中优化和向量化随机函数
Optimize and vectorize random function in c
我编写了一个函数来生成伪随机数,但我的编译器没有对此进行矢量化 not vectorized: complicated access pattern
。
我怎样才能使编译器对其进行矢量化?
我使用命令 gcc -O3 -ffast-math -funroll-all-loops -ftree-vectorize -fopt-info-vec-missed -lm -g -o .O3
unsigned int seed;
unsigned int temp;
#define val13 13
unsigned int var1 = 214013;
unsigned int var2 = 2531011;
inline int myRandom() {
seed = (var1*seed + var2);
return (seed>>val13);
}
如果我将代码更改为此我设法进行矢量化,但我没有得到预期的结果。
inline int myRandom() {
return ((var1*seed + var2)>>val13);
}
我怎样才能让编译器向量化这个函数并得到预期的结果?
种子上的生成器函数f是一个线性同余生成器,很容易与自身组合实现f2, f3, f4,等等。然后可以实现矢量化生成器,它从一个种子准备初始矢量块,然后使用 fB 来计算 B 车道的结果,每个车道彼此独立。这是一个概念证明。各种修饰都是可能的,例如保留种子向量而不是单个种子并修改 FillRandomVector
以处理任意 N
.
#include <stdio.h>
#include <stdlib.h>
#define Block 4 // Number of lanes (elements) in a vector block.
static const unsigned int var1 = 214013;
static const unsigned int var2 = 2531011;
// Original generator function, modified to take a pointer to the seed.
static inline int MyRandom(unsigned int *seed)
{
*seed = var1 * *seed + var2;
return *seed >> 13;
}
// New parameters for a vectorized generator.
static unsigned int var1Vector;
static unsigned int var2Vector;
/* Initialize parameters for vectorized generator by computing them from the
scalar parameters.
*/
static void Initialize(void)
{
var1Vector = 1;
var2Vector = 0;
for (int i = 0; i < Block; ++i)
{
var1Vector *= var1;
var2Vector = var2Vector * var1 + var2;
}
}
// Fill array with generated numbers using scalar method.
static void FillRandomScalar(unsigned int *seed, size_t N, int *Destination)
{
for (size_t n = 0; n < N; ++n)
Destination[n] = MyRandom(seed);
}
/* Fill array with generated numbers using vectorizable method.
For proof of concept only, so handles only certain N:
N must be a positive multiple of Block.
*/
static void FillRandomVector(unsigned int *seed, size_t N, int *Destination)
{
// Prepare a vector of seeds and generate the first block of results.
unsigned seedVector[Block];
unsigned int S = *seed;
for (size_t n = 0; n < Block; ++n)
{
S = S * var1 + var2;
seedVector[n] = S;
Destination[n] = S >> 13;
}
// Generate the remaining results using independent lanes.
for (size_t n = Block; n < N; n += Block)
for (size_t lane = 0; lane < Block; ++lane)
{
seedVector[lane] = seedVector[lane] * var1Vector + var2Vector;
Destination[n + lane] = seedVector[lane] >> 13;
}
*seed = seedVector[Block-1];
}
#define N 100
int main(void)
{
Initialize();
int expected0[N], observed0[N];
int expected1[N], observed1[N];
unsigned int seed;
seed = 17;
FillRandomScalar(&seed, N, expected0);
FillRandomScalar(&seed, N, expected1);
seed = 17;
FillRandomVector(&seed, N, observed0);
FillRandomVector(&seed, N, observed1);
for (size_t n = 0; n < N; ++n)
{
if (observed0[n] != expected0[n])
{
printf("observed0[%zu] = %d, but expected0 %d.\n",
n, observed0[n], expected0[n]);
exit(EXIT_FAILURE);
}
}
for (size_t n = 0; n < N; ++n)
{
if (observed1[n] != expected1[n])
{
printf("observed1[%zu] = %d, but expected1 %d.\n",
n, observed1[n], expected1[n]);
exit(EXIT_FAILURE);
}
}
}
我编写了一个函数来生成伪随机数,但我的编译器没有对此进行矢量化 not vectorized: complicated access pattern
。
我怎样才能使编译器对其进行矢量化?
我使用命令 gcc -O3 -ffast-math -funroll-all-loops -ftree-vectorize -fopt-info-vec-missed -lm -g -o .O3
unsigned int seed;
unsigned int temp;
#define val13 13
unsigned int var1 = 214013;
unsigned int var2 = 2531011;
inline int myRandom() {
seed = (var1*seed + var2);
return (seed>>val13);
}
如果我将代码更改为此我设法进行矢量化,但我没有得到预期的结果。
inline int myRandom() {
return ((var1*seed + var2)>>val13);
}
我怎样才能让编译器向量化这个函数并得到预期的结果?
种子上的生成器函数f是一个线性同余生成器,很容易与自身组合实现f2, f3, f4,等等。然后可以实现矢量化生成器,它从一个种子准备初始矢量块,然后使用 fB 来计算 B 车道的结果,每个车道彼此独立。这是一个概念证明。各种修饰都是可能的,例如保留种子向量而不是单个种子并修改 FillRandomVector
以处理任意 N
.
#include <stdio.h>
#include <stdlib.h>
#define Block 4 // Number of lanes (elements) in a vector block.
static const unsigned int var1 = 214013;
static const unsigned int var2 = 2531011;
// Original generator function, modified to take a pointer to the seed.
static inline int MyRandom(unsigned int *seed)
{
*seed = var1 * *seed + var2;
return *seed >> 13;
}
// New parameters for a vectorized generator.
static unsigned int var1Vector;
static unsigned int var2Vector;
/* Initialize parameters for vectorized generator by computing them from the
scalar parameters.
*/
static void Initialize(void)
{
var1Vector = 1;
var2Vector = 0;
for (int i = 0; i < Block; ++i)
{
var1Vector *= var1;
var2Vector = var2Vector * var1 + var2;
}
}
// Fill array with generated numbers using scalar method.
static void FillRandomScalar(unsigned int *seed, size_t N, int *Destination)
{
for (size_t n = 0; n < N; ++n)
Destination[n] = MyRandom(seed);
}
/* Fill array with generated numbers using vectorizable method.
For proof of concept only, so handles only certain N:
N must be a positive multiple of Block.
*/
static void FillRandomVector(unsigned int *seed, size_t N, int *Destination)
{
// Prepare a vector of seeds and generate the first block of results.
unsigned seedVector[Block];
unsigned int S = *seed;
for (size_t n = 0; n < Block; ++n)
{
S = S * var1 + var2;
seedVector[n] = S;
Destination[n] = S >> 13;
}
// Generate the remaining results using independent lanes.
for (size_t n = Block; n < N; n += Block)
for (size_t lane = 0; lane < Block; ++lane)
{
seedVector[lane] = seedVector[lane] * var1Vector + var2Vector;
Destination[n + lane] = seedVector[lane] >> 13;
}
*seed = seedVector[Block-1];
}
#define N 100
int main(void)
{
Initialize();
int expected0[N], observed0[N];
int expected1[N], observed1[N];
unsigned int seed;
seed = 17;
FillRandomScalar(&seed, N, expected0);
FillRandomScalar(&seed, N, expected1);
seed = 17;
FillRandomVector(&seed, N, observed0);
FillRandomVector(&seed, N, observed1);
for (size_t n = 0; n < N; ++n)
{
if (observed0[n] != expected0[n])
{
printf("observed0[%zu] = %d, but expected0 %d.\n",
n, observed0[n], expected0[n]);
exit(EXIT_FAILURE);
}
}
for (size_t n = 0; n < N; ++n)
{
if (observed1[n] != expected1[n])
{
printf("observed1[%zu] = %d, but expected1 %d.\n",
n, observed1[n], expected1[n]);
exit(EXIT_FAILURE);
}
}
}