这是在堆上优化多维数组的可能方法吗?
Is this a possible way to optimize multidimensional arrays on heap?
下面是在堆上分配多维数组的常用方法,使用指向指针的指针。
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
但我决定制作一个只分配一大块内存并通过与索引值相乘来访问元素的版本。
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[i * aai2.length2])
第二个版本 运行 运行 使用下面的测试代码时大约快 20%。可能是什么原因,这是一种普遍适用的优化技术吗?是否有一些库为了优化目的定义了这种数组类型?
我在编辑之前的测试代码中犯了一个巨大的错误。第一个版本 运行 较慢,因为它的分配和释放保持在 for 循环内,而第二个版本在进入循环之前只做了一次。请参阅下面的测试代码中的注释。使两个测试相等后,我发现第一个版本可以 运行 更快,尤其是在优化之后。我放入测试代码的操作越复杂,拷贝越多,我看到第一个总是运行快一点。似乎索引乘法在我的机器上很慢?不过我不确定原因。
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
int main() {
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
ArrayArrayInt2Create(&aai2, N, N); //Mistake Here!!
for (int k = 0; k < N; k += 1) {
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
}
ArrayArrayInt2Delete(&aai2); //Should go inside the loop block..
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
return 0;
}
是的,使用算术和单个基指针是编译器在内部为非动态分配的 2D(n 维)数组所做的。
您获得了最大的性能,因为只有一个计算和索引查找。对于所示的二维数组,每次数组访问有两次指针查找和两次索引计算(一次索引计算和查找以获取正确的数组,然后第二次访问正确数组中的元素)。对于 3D 数组,将进行三个索引计算和三个查找。
您还分配了更少的内存,并且需要更少的内存分配,但这些是二阶效应。
此外,正如 WhozCraig points out in a 但我没有提到,与多个较小的内存块(加起来更多的内存)相比,您可以获得更好的引用位置和更智能的预取的潜力比单个大块)。
我在 Mac OS X 10.10.2 Yosemite.
上测试了这个用 GCC 4.9.1 编译的文件 (sim2d.c
)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
static void *MjMalloc(size_t nbytes)
{
void *rv = malloc(nbytes);
if (rv == 0)
{
fprintf(stderr, "Memory allocation failure (%zu bytes)\n", nbytes);
exit(1);
}
return rv;
}
/* Mechanism 1 */
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
/* Mechanism 2 */
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[(i) * aai2.length2])
#define aai2At2(aai2, i, j) (aai2.array[(i) * aai2.length2 + (j)])
/* Head-to-head testing */
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
#define N_CYCLES 1000
static void one_test_cycle(void)
{
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai1: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At2(aai2, i, j) = i;
}
while ((j -= 1) >= 0) {
sum += aai2At2(aai2, i, j) - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai3: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
}
static void print_now(const char *tag)
{
time_t now = time(0);
struct tm *lt = localtime(&now);
char buffer[32];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", lt);
printf("%s: %s\n", tag, buffer);
}
int main(void)
{
print_now("Started");
for (int i = 0; i < 3; i++)
one_test_cycle();
print_now("Finished");
return 0;
}
有两种访问 aai2
数据的方法略有不同。我还将数组大小 (N = 2000) 与单个测试中的循环数 (N_CYCLES = 1000) 分开。我得到的计时结果是:
Started: 2015-04-07 07:40:41
aai1: sum = 4000000000; time = 6.80
aai2: sum = 4000000000; time = 5.99
aai3: sum = 4000000000; time = 5.98
aai1: sum = 4000000000; time = 6.75
aai2: sum = 4000000000; time = 6.02
aai3: sum = 4000000000; time = 5.99
aai1: sum = 4000000000; time = 6.72
aai2: sum = 4000000000; time = 6.01
aai3: sum = 4000000000; time = 5.99
Finished: 2015-04-07 07:41:38
我得到了与 (N_CYCLE = 2000) 相似的模式,但它花费的时间是 运行 的两倍——惊喜,惊喜。
我看到单个分配代码带来了一个小但明显的好处(减少了大约 13%),但是 'aai2' 测试的两个时间之间没有显着差异。
基本统计:
# All data
# Count = 9
# Mean = 6.250000e+00
# Std Dev = 3.807230e-01
# aai1 only:
# Count = 3
# Mean = 6.756667e+00
# Std Dev = 4.041452e-02
# aai2 and aai3:
# Count = 6
# Mean = 5.996667e+00
# Std Dev = 1.505545e-02
# aai2 only:
# Count = 3
# Mean = 6.006667e+00
# Std Dev = 1.527525e-02
# aai3 only:
# Count = 3
# Mean = 5.986667e+00
# Std Dev = 5.773503e-03
显然,正式确保机器以其他方式卸载,运行进行更多次测试迭代,以及类似的基准测试步骤可能会改善数据,但单一分配 aai2
机制执行在这台机器上比多重分配 aai
机制更好。 (切题:当人们有两个或更多版本的代码时,为什么不在他们的第一个版本上加上后缀 1?)
硬件:17" Mac Book Pro,2011 年初,2.3 GHz 英特尔酷睿 i7,16 GiB 1333 MHz DDR3 内存。
下面是在堆上分配多维数组的常用方法,使用指向指针的指针。
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
但我决定制作一个只分配一大块内存并通过与索引值相乘来访问元素的版本。
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[i * aai2.length2])
第二个版本 运行 运行 使用下面的测试代码时大约快 20%。可能是什么原因,这是一种普遍适用的优化技术吗?是否有一些库为了优化目的定义了这种数组类型?
我在编辑之前的测试代码中犯了一个巨大的错误。第一个版本 运行 较慢,因为它的分配和释放保持在 for 循环内,而第二个版本在进入循环之前只做了一次。请参阅下面的测试代码中的注释。使两个测试相等后,我发现第一个版本可以 运行 更快,尤其是在优化之后。我放入测试代码的操作越复杂,拷贝越多,我看到第一个总是运行快一点。似乎索引乘法在我的机器上很慢?不过我不确定原因。
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
int main() {
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
ArrayArrayInt2Create(&aai2, N, N); //Mistake Here!!
for (int k = 0; k < N; k += 1) {
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
}
ArrayArrayInt2Delete(&aai2); //Should go inside the loop block..
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
return 0;
}
是的,使用算术和单个基指针是编译器在内部为非动态分配的 2D(n 维)数组所做的。
您获得了最大的性能,因为只有一个计算和索引查找。对于所示的二维数组,每次数组访问有两次指针查找和两次索引计算(一次索引计算和查找以获取正确的数组,然后第二次访问正确数组中的元素)。对于 3D 数组,将进行三个索引计算和三个查找。
您还分配了更少的内存,并且需要更少的内存分配,但这些是二阶效应。
此外,正如 WhozCraig points out in a
我在 Mac OS X 10.10.2 Yosemite.
上测试了这个用 GCC 4.9.1 编译的文件 (sim2d.c
)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
static void *MjMalloc(size_t nbytes)
{
void *rv = malloc(nbytes);
if (rv == 0)
{
fprintf(stderr, "Memory allocation failure (%zu bytes)\n", nbytes);
exit(1);
}
return rv;
}
/* Mechanism 1 */
typedef struct ArrayInt {
int *array;
int length;
} ArrayInt;
static void ArrayIntCreate(ArrayInt *array, int length) {
array->array = MjMalloc(length * sizeof(int));
array->length = length;
}
static void ArrayIntDelete(ArrayInt *array) {
free(array->array);
}
typedef struct ArrayArrayInt {
ArrayInt *array;
int length;
} ArrayArrayInt;
static void ArrayArrayIntCreate(ArrayArrayInt *array, int length, int length2) {
array->array = MjMalloc(length * sizeof(ArrayInt));
array->length = length;
for (int i = 0; i < length; i += 1) {
ArrayIntCreate(&array->array[i], length2);
}
}
static void ArrayArrayIntDelete(ArrayArrayInt *array) {
for (int i = 0; i < array->length; i += 1) {
ArrayIntDelete(&array->array[i]);
}
free(array->array);
}
/* Mechanism 2 */
typedef struct ArrayArrayInt2 {
int *array;
int length;
int length2;
} ArrayArrayInt2;
static void ArrayArrayInt2Create(ArrayArrayInt2 *array, int length, int length2) {
array->array = MjMalloc(length * length2 * sizeof(ArrayInt));
array->length = length;
array->length2 = length2;
}
static void ArrayArrayInt2Delete(ArrayArrayInt2 *array) {
free(array->array);
}
#define aai2At(aai2, i) (&aai2.array[(i) * aai2.length2])
#define aai2At2(aai2, i, j) (aai2.array[(i) * aai2.length2 + (j)])
/* Head-to-head testing */
static double ElapsedTime(clock_t startTime, clock_t endTime) {
return (double)(endTime - startTime) / CLOCKS_PER_SEC;
}
#define N 2000
#define N_CYCLES 1000
static void one_test_cycle(void)
{
ArrayArrayInt aai;
ArrayArrayInt2 aai2;
long long int sum;
clock_t startTime, endTime;
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayIntCreate(&aai, N, N);
for (int i = 0; i < aai.length; i += 1) {
int j = 0;
for (; j < aai.array[i].length; j += 1) {
aai.array[i].array[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai.array[i].array[j] - i + 1;
}
}
ArrayArrayIntDelete(&aai);
}
endTime = clock();
printf("aai1: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At(aai2, i)[j] = i;
}
while ((j -= 1) >= 0) {
sum += aai2At(aai2, i)[j] - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai2: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
startTime = clock();
sum = 0;
for (int k = 0; k < N_CYCLES; k += 1) {
ArrayArrayInt2Create(&aai2, N, N);
for (int i = 0; i < aai2.length; i += 1) {
int j = 0;
for (; j < aai2.length2; j += 1) {
aai2At2(aai2, i, j) = i;
}
while ((j -= 1) >= 0) {
sum += aai2At2(aai2, i, j) - i + 1;
}
}
ArrayArrayInt2Delete(&aai2);
}
endTime = clock();
printf("aai3: sum = %lld; time = %.2f\n", sum, ElapsedTime(startTime, endTime));
}
static void print_now(const char *tag)
{
time_t now = time(0);
struct tm *lt = localtime(&now);
char buffer[32];
strftime(buffer, sizeof(buffer), "%Y-%m-%d %H:%M:%S", lt);
printf("%s: %s\n", tag, buffer);
}
int main(void)
{
print_now("Started");
for (int i = 0; i < 3; i++)
one_test_cycle();
print_now("Finished");
return 0;
}
有两种访问 aai2
数据的方法略有不同。我还将数组大小 (N = 2000) 与单个测试中的循环数 (N_CYCLES = 1000) 分开。我得到的计时结果是:
Started: 2015-04-07 07:40:41
aai1: sum = 4000000000; time = 6.80
aai2: sum = 4000000000; time = 5.99
aai3: sum = 4000000000; time = 5.98
aai1: sum = 4000000000; time = 6.75
aai2: sum = 4000000000; time = 6.02
aai3: sum = 4000000000; time = 5.99
aai1: sum = 4000000000; time = 6.72
aai2: sum = 4000000000; time = 6.01
aai3: sum = 4000000000; time = 5.99
Finished: 2015-04-07 07:41:38
我得到了与 (N_CYCLE = 2000) 相似的模式,但它花费的时间是 运行 的两倍——惊喜,惊喜。
我看到单个分配代码带来了一个小但明显的好处(减少了大约 13%),但是 'aai2' 测试的两个时间之间没有显着差异。
基本统计:
# All data
# Count = 9
# Mean = 6.250000e+00
# Std Dev = 3.807230e-01
# aai1 only:
# Count = 3
# Mean = 6.756667e+00
# Std Dev = 4.041452e-02
# aai2 and aai3:
# Count = 6
# Mean = 5.996667e+00
# Std Dev = 1.505545e-02
# aai2 only:
# Count = 3
# Mean = 6.006667e+00
# Std Dev = 1.527525e-02
# aai3 only:
# Count = 3
# Mean = 5.986667e+00
# Std Dev = 5.773503e-03
显然,正式确保机器以其他方式卸载,运行进行更多次测试迭代,以及类似的基准测试步骤可能会改善数据,但单一分配 aai2
机制执行在这台机器上比多重分配 aai
机制更好。 (切题:当人们有两个或更多版本的代码时,为什么不在他们的第一个版本上加上后缀 1?)
硬件:17" Mac Book Pro,2011 年初,2.3 GHz 英特尔酷睿 i7,16 GiB 1333 MHz DDR3 内存。