使用 AVX 但不使用 AVX2，分别计算许多 64 位位掩码上的每个位位置

Question

（相关：How to quickly count bits into separate bins in a series of ints on Sandy Bridge? 是这个的早期副本，有一些不同的答案。编者注：这里的答案可能更好。

另外，一个类似问题的 AVX2 版本，整行位的许多 bin 比一个宽得多 uint64_t：)

我正在用 C 开发一个项目，我需要遍历数千万个掩码（ulong 类型（64 位））并更新一个包含 64 个短整数的数组（称为 target） (uint16) 基于一个简单的规则：

// for any given mask, do the following loop
for (i = 0; i < 64; i++) {
    if (mask & (1ull << i)) {
        target[i]++
    }
}

问题是我需要在数千万个面具上执行上述循环并且我需要在不到一秒的时间内完成。想知道是否有任何方法可以加快它的速度，比如使用某种表示上述循环的特殊汇编指令。

目前我在 ubuntu 14.04（i7-2670QM，支持 AVX，不支持 AVX2）上使用 gcc 4.8.4 来编译和运行下面的代码，用了大约 2 秒。希望运行低于 200 毫秒。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/stat.h>

double getTS() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1000000.0;
}
unsigned int target[64];

int main(int argc, char *argv[]) {
    int i, j;
    unsigned long x = 123;
    unsigned long m = 1;
    char *p = malloc(8 * 10000000);
    if (!p) {
        printf("failed to allocate\n");
        exit(0);
    }
    memset(p, 0xff, 80000000);
    printf("p=%p\n", p);
    unsigned long *pLong = (unsigned long*)p;
    double start = getTS();
    for (j = 0; j < 10000000; j++) {
        m = 1;
        for (i = 0; i < 64; i++) {
            if ((pLong[j] & m) == m) {
                target[i]++;
            }
            m = (m << 1);
        }
    }
    printf("took %f secs\n", getTS() - start);
    return 0;
}

提前致谢！

Answer 1

对于初学者来说，解包位的问题，因为说真的，你不想单独测试每个位。

所以只需按照以下策略将位解包为向量的字节：

现在您已将每个位填充为 8 位，您可以一次对最多 255 个位掩码的块执行此操作，并将它们全部累积到一个向量寄存器中。在那之后，你将不得不预料到潜在的溢出，所以你需要转移。

每块255后，再次解包为32bit，加入数组。（您不必精确地执行 255，只需一些小于 256 的方便数字即可避免字节累加器溢出）。

每个位掩码 8 条指令（对于 AVX2，每个较低和较高的 32 位 4 条指令）——如果你有可用的 AVX512，则可以达到一半——你应该能够实现每秒大约 50 亿个位掩码的吞吐量，并且最近 CPU.

的核心

typedef uint64_t T;
const size_t bytes = 8;
const size_t bits = bytes * 8;
const size_t block_size = 128;

static inline __m256i expand_bits_to_bytes(uint32_t x)
{
    __m256i xbcast = _mm256_set1_epi32(x);    // we only use the low 32bits of each lane, but this is fine with AVX2

    // Each byte gets the source byte containing the corresponding bit
    const __m256i shufmask = _mm256_set_epi64x(
        0x0303030303030303, 0x0202020202020202,
        0x0101010101010101, 0x0000000000000000);
    __m256i shuf = _mm256_shuffle_epi8(xbcast, shufmask);

    const __m256i andmask = _mm256_set1_epi64x(0x8040201008040201);  // every 8 bits -> 8 bytes, pattern repeats.
    __m256i isolated_inverted = _mm256_andnot_si256(shuf, andmask);

    // this is the extra step: byte == 0 ? 0 : -1
    return _mm256_cmpeq_epi8(isolated_inverted, _mm256_setzero_si256());
}

void bitcount_vectorized(const T *data, uint32_t accumulator[bits], const size_t count)
{
    for (size_t outer = 0; outer < count - (count % block_size); outer += block_size)
    {
        __m256i temp_accumulator[bits / 32] = { _mm256_setzero_si256() };
        for (size_t inner = 0; inner < block_size; ++inner) {
            for (size_t j = 0; j < bits / 32; j++)
            {
                const auto unpacked = expand_bits_to_bytes(static_cast<uint32_t>(data[outer + inner] >> (j * 32)));
                temp_accumulator[j] = _mm256_sub_epi8(temp_accumulator[j], unpacked);
            }
        }
        for (size_t j = 0; j < bits; j++)
        {
            accumulator[j] += ((uint8_t*)(&temp_accumulator))[j];
        }
    }
    for (size_t outer = count - (count % block_size); outer < count; outer++)
    {
        for (size_t j = 0; j < bits; j++)
        {
            if (data[outer] & (T(1) << j))
            {
                accumulator[j]++;
            }
        }
    }
}

void bitcount_naive(const T *data, uint32_t accumulator[bits], const size_t count)
{
    for (size_t outer = 0; outer < count; outer++)
    {
        for (size_t j = 0; j < bits; j++)
        {
            if (data[outer] & (T(1) << j))
            {
                accumulator[j]++;
            }
        }
    }
}

根据所选的编译器，矢量化形式比原始形式实现了大约 25 倍的加速。

在 Ryzen 5 1600X 上，矢量化形式大致达到每秒约 600,000,000 个元素的预测吞吐量。

令人惊讶的是，这实际上仍然比@njuffa 提出的解决方案慢 50%。

Answer 2

在我的系统上，一台使用了 4 年的 MacBook（2.7 GHz 英特尔酷睿 i5）clang-900.0.39.2 -O3，您的代码运行时间为 500 毫秒。

只需将内部测试更改为 if ((pLong[j] & m) != 0) 即可节省 30%，运行 350 毫秒。

在没有测试的情况下进一步将内部部分简化为 target[i] += (pLong[j] >> i) & 1; 将其降低到 280 毫秒。

进一步的改进似乎需要更高级的技术，例如将位解包为 8 个 ulong 的块并并行添加这些块，一次处理 255 个 ulong。

这里是使用这种方法的改进版本。它在我的系统上运行 45 毫秒。

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <sys/stat.h>

double getTS() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1000000.0;
}

int main(int argc, char *argv[]) {
    unsigned int target[64] = { 0 };
    unsigned long *pLong = malloc(sizeof(*pLong) * 10000000);
    int i, j;

    if (!pLong) {
        printf("failed to allocate\n");
        exit(1);
    }
    memset(pLong, 0xff, sizeof(*pLong) * 10000000);
    printf("p=%p\n", (void*)pLong);
    double start = getTS();
    uint64_t inflate[256];
    for (i = 0; i < 256; i++) {
        uint64_t x = i;
        x = (x | (x << 28));
        x = (x | (x << 14));
        inflate[i] = (x | (x <<  7)) & 0x0101010101010101ULL;
    }
    for (j = 0; j < 10000000 / 255 * 255; j += 255) {
        uint64_t b[8] = { 0 };
        for (int k = 0; k < 255; k++) {
            uint64_t u = pLong[j + k];
            for (int kk = 0; kk < 8; kk++, u >>= 8)
                b[kk] += inflate[u & 255];
        }
        for (i = 0; i < 64; i++)
            target[i] += (b[i / 8] >> ((i % 8) * 8)) & 255;
    }
    for (; j < 10000000; j++) {
        uint64_t m = 1;
        for (i = 0; i < 64; i++) {
            target[i] += (pLong[j] >> i) & 1;
            m <<= 1;
        }
    }
    printf("target = {");
    for (i = 0; i < 64; i++)
        printf(" %d", target[i]);
    printf(" }\n");
    printf("took %f secs\n", getTS() - start);
    return 0;
}

在答案中调查和解释了将字节扩展为 64 位长的技术：。我将 target 数组和 inflate 数组设为局部变量，并打印结果以确保编译器不会优化计算。在生产版本中，您将单独计算 inflate 数组。

直接使用 SIMD 可能会以牺牲可移植性和可读性为代价提供进一步的改进。这种优化通常最好留给编译器，因为它可以为目标架构生成特定代码。除非性能至关重要并且基准测试证明这是一个瓶颈，否则我总是倾向于通用解决方案。

njuffa 的另一种解决方案提供了类似的性能，而无需预先计算的数组。根据您的编译器和硬件细节，它可能会更快。

Answer 3

更好的策略是逐渐扩大

从 2 位累加器开始，然后 mask/shift 将其扩展到 4 位。所以在 inner-most 循环中，大多数操作都在处理 "dense" 数据，而不是 "diluting" 太多了。更高的信息/熵密度意味着每条指令做更多有用的工作。

使用 SWAR 技术在标量或 SIMD 寄存器内进行 32x 2 位加法很容易/便宜，因为我们无论如何都需要避免执行元素顶部的可能性。使用适当的 SIMD，我们会丢失这些计数，使用 SWAR 我们会破坏下一个元素。

uint64_t x = *(input++);        // load a new bitmask
const uint64_t even_1bits = 0x5555555555555555;  // 0b...01010101;

uint64_t lo = x & even_1bits;
uint64_t hi = (x>>1) & even_1bits;            // or use ANDN before shifting to avoid a MOV copy

accum2_lo += lo;   // can do up to 3 iterations of this without overflow
accum2_hi += hi;   // because a 2-bit integer overflows at 4

然后你最多重复 4 个 4 位元素的向量，然后 8 个 8 位元素的向量，然后你应该一直扩大到 32 并累积到内存中的数组中，因为你会运行无论如何都在寄存器之外，而且这种外层循环工作很少见，我们不需要为转向 16 位而烦恼。（特别是如果我们手动矢量化）。

最大的缺点：这个不会auto-vectorize，不像@njuffa的版本。但是gcc -O3 -march=sandybridge对于 AVX1（然后运行ning Skylake 上的代码），这个运行ning 64 位标量实际上仍然比 128 位 AVX [=304]快 =]来自@njuffa 代码的 asm。

但这是 Skylake 上的时间，它有 4 个标量 ALU 端口（和 mov-elimination），而 Sandybridge 缺少 mov-elimination 并且只有 3 个 ALU 端口，因此标量代码可能会命中 back-end execution-port 瓶颈。（但 SIMD 代码可能几乎一样快，因为有大量的 AND / ADD 与移位混合，而且 SnB 在其所有 3 个端口上都有 SIMD 执行单元，上面有任何 ALU。Haswell 刚刚添加了端口 6，因为 scalar-only 包括轮班和 b运行ches。）

通过良好的手动矢量化，这应该快将近 2 或 4 倍。

但是如果你必须在这个标量或@njuffa 的 AVX2 自动矢量化之间做出选择，@njuffa 在 Skylake 上的速度更快 -march=native

如果在 32 位目标上构建 possible/required，这会受到很多影响（因为在 32 位寄存器中使用 uint64_t 而没有矢量化），而矢量化代码几乎不会受到影响（因为所有的工作都发生在相同宽度的向量 regs 中。

// TODO: put the target[] re-ordering somewhere
// TODO: cleanup for N not a multiple of 3*4*21 = 252
// TODO: manual vectorize with __m128i, __m256i, and/or __m512i

void sum_gradual_widen (const uint64_t *restrict input, unsigned int *restrict target, size_t length)
{
    const uint64_t *endp = input + length - 3*4*21;     // 252 masks per outer iteration
    while(input <= endp) {
        uint64_t accum8[8] = {0};     // 8-bit accumulators
        for (int k=0 ; k<21 ; k++) {
            uint64_t accum4[4] = {0};  // 4-bit accumulators can hold counts up to 15.  We use 4*3=12
            for(int j=0 ; j<4 ; j++){
                uint64_t accum2_lo=0, accum2_hi=0;
                for(int i=0 ; i<3 ; i++) {  // the compiler should fully unroll this
                    uint64_t x = *input++;    // load a new bitmask
                    const uint64_t even_1bits = 0x5555555555555555;
                    uint64_t lo = x & even_1bits; // 0b...01010101;
                    uint64_t hi = (x>>1) & even_1bits;  // or use ANDN before shifting to avoid a MOV copy
                    accum2_lo += lo;
                    accum2_hi += hi;   // can do up to 3 iterations of this without overflow
                }

                const uint64_t even_2bits = 0x3333333333333333;
                accum4[0] +=  accum2_lo       & even_2bits;  // 0b...001100110011;   // same constant 4 times, because we shift *first*
                accum4[1] += (accum2_lo >> 2) & even_2bits;
                accum4[2] +=  accum2_hi       & even_2bits;
                accum4[3] += (accum2_hi >> 2) & even_2bits;
            }
            for (int i = 0 ; i<4 ; i++) {
                accum8[i*2 + 0] +=   accum4[i] & 0x0f0f0f0f0f0f0f0f;
                accum8[i*2 + 1] +=  (accum4[i] >> 4) & 0x0f0f0f0f0f0f0f0f;
            }
        }

        // char* can safely alias anything.
        unsigned char *narrow = (uint8_t*) accum8;
        for (int i=0 ; i<64 ; i++){
            target[i] += narrow[i];
        }
    }
    /* target[0] = bit 0
     * target[1] = bit 8
     * ...
     * target[8] = bit 1
     * target[9] = bit 9
     * ...
     */
    // TODO: 8x8 transpose
}

我们不关心顺序，因此 accum4[0] 例如，每 4 位有一个 4 位累加器。 最后需要（但尚未实现）的最终修复是 uint32_t target[64] 数组的 8x8 t运行spose， 可以使用 unpck 高效完成vshufps 只有 AVX1。 (Transpose an 8x8 float using AVX/AVX2)。还有最后一个最多 251 个掩码的清理循环。

我们可以使用任何 SIMD 元素宽度来实现这些移位；无论如何，我们必须为低于 16 位的宽度进行屏蔽（SSE/AVX 没有 byte-granularity 移位，只有 16 位最小值。）

来自@njuffa 测试工具的 Arch Linux i7-6700k 基准测试结果，添加了这个。 (Godbolt) N = (10000000 / (3*4*21) * 3*4*21) = 9999864（即 10000000 向下舍入为 252 次迭代 "unroll" 因子的倍数，所以我的简单实现是做相同数量的工作，不计算在内re-ordering target[] 它不会这样做，所以它会打印不匹配的结果。但是打印的计数与参考数组的另一个位置匹配。）

我运行程序连续 4 次（以确保 CPU 预热到最大涡轮增压）并选择了一个看起来不错的运行（none异常高的3倍）。

ref: 最好的 bit-loop（下一节）
快：@njuffa 的代码。（auto-vectorized 具有 128 位 AVX 整数指令）。
渐进：我的版本（不是 auto-vectorized 由 gcc 或 clang，至少不在内部循环中。）gcc 和 clang 完全展开内部 12 次迭代。

gcc8.2 -O3 -march=sandybridge -fpie -no-pie
参考：0.331373 秒，快：0.011387 秒，渐进：0.009966 秒
gcc8.2 -O3 -march=sandybridge -fno-pie -no-pie
参考：0.397175 秒，快：0.011255 秒，渐进：0.010018 秒
clang7.0 -O3 -march=sandybridge -fpie -no-pie
ref：0.352381 secs，fast：0.011926 secs，gradual：0.009269 secs（端口 7 uops 的计数非常低，clang 使用索引寻址存储）
clang7.0 -O3 -march=sandybridge -fno-pie -no-pie
ref: 0.293014 secs, fast: 0.011777 secs, gradual: 0.009235 secs

-march=skylake（允许 AVX2 用于 256 位整数向量）对两者都有帮助，但 @njuffa 最重要的是因为它更多地向量化（包括它的 inner-most 循环):

gcc8.2 -O3 -march=skylake -fpie -no-pie
参考：0.328725 秒，快：0.007621 秒，渐进：0.010054 秒（gcc 显示 "gradual" 没有增益，只有 "fast"）
gcc8.2 -O3 -march=skylake -fno-pie -no-pie
参考：0.333922 秒，快：0.007620 秒，渐进：0.009866 秒
clang7.0 -O3 -march=skylake -fpie -no-pie
参考：0.260616 秒，快：0.007521 秒，渐进：0.008535 秒（IDK 为什么渐进比-march=sandybridge 快；它没有使用 BMI1 andn。我猜是因为它使用 256 位 AVX2 来实现 k=0 ..20 外循环 vpaddq)
clang7.0 -O3 -march=skylake -fno-pie -no-pie
参考：0.259159 秒，快速：0.007496 秒，渐进：0.008671 秒

没有 AVX，只有 SSE4.2: (-march=nehalem)，奇怪的是 clang 的渐进速度比使用 AVX / tune=sandybridge 更快。 "fast" 只比 AVX 慢一点点。

gcc8.2 -O3 -march=skylake -fno-pie -no-pie
参考：0.337178 秒，快：0.011983 秒，渐进：0.010587 秒
clang7.0 -O3 -march=skylake -fno-pie -no-pie
参考：0.293555 秒，快速：0.012549 秒，渐进：0.008697 秒

-fprofile-generate / -fprofile-use 对 GCC 有一些帮助，特别是对于 "ref" 版本，默认情况下它根本不展开。

我突出显示了最好的，但它们通常都在彼此的测量噪声容限之内。 -fno-pie -no-pie 有时更快也就不足为奇了：用 [disp32 + reg] 索引静态数组不是索引寻址模式，只是 base + disp32，所以它永远不会 unlaminate在 Sandybridge-family CPUs.

但是使用 gcc 有时 -fpie 更快；我没有检查，但我认为当 32 位绝对寻址成为可能时，gcc 只是搬起石头砸自己的脚。或者只是 innocent-looking code-gen 中的差异恰好导致对齐或 uop-cache 问题；没仔细看

对于 SIMD，我们可以简单地并行执行 2 或 4x uint64_t，仅在我们将字节加宽为 32 位元素的最后一步水平累积。 （也许通过改组 in-lane 然后使用 pmaddubsw 和乘数 _mm256_set1_epi8(1) 将水平字节对添加到 16 位元素中。）

TODO：manually-vectorized __m128i 和 __m256i（和 __m512i）版本。应该比上面的 "gradual" 倍快接近 2 倍、4 倍甚至 8 倍。 可能 HW 预取仍然可以跟上它，除了数据来自 DRAM 的 AVX512 版本，特别是如果有来自其他线程的争论。我们为阅读的每个 qword 做大量工作。

过时的代码：对 bit-loop

的改进

你的 portable 标量版本也可以改进， 将它从 ~1.92 秒加速（34% b运行ch 总体错误预测率，快速循环被注释掉！）到 ~0.35 秒（clang7.0 -O3 -march=sandybridge），在 3.9GHz Skylake 上使用适当的运行dom 输入。或使用 != 0 而不是 == m 的 b运行chy 版本为 1.83 秒，因为编译器无法证明 m 总是正好设置 1 位 and/or 优化因此。

（对比 @njuffa 或我上面的快速版本为 0.01 秒，所以这在绝对意义上是非常无用的，但值得一提的是何时使用 b运行chless 代码的一般优化示例.)

如果您期望运行dom 零和 1 的混合，您需要一些 b运行chless 不会错误预测的东西。对零元素执行 += 0 可以避免这种情况，并且还意味着 C 抽象机肯定会触及该内存，而不管数据如何。

不允许编译器发明写入，所以如果他们想要 auto-vectorize 你的 if() target[i]++ 版本，他们必须使用像 x86 vmaskmovps 这样的屏蔽存储来避免non-atomic 读取/重写 target 的未修改元素。所以一些假设的未来编译器可以 auto-vectorize 纯标量代码会更容易处理这个。

无论如何，一种写法是 target[i] += (pLong[j] & m != 0);，使用 bool->int 转换得到 0 / 1 整数。

但是如果我们只是移动数据并用 &1 隔离低位，我们会得到更好的 x86 asm（可能还有大多数其他架构）。编译器有点笨，似乎没有发现这种优化。他们确实很好地优化了额外的循环计数器，并将 m <<= 1 变成 add same,same 以有效地左移，但他们仍然使用 xor-zero / test / setne 来创建一个 0 / 1 整数。

像这样的内部循环编译效率稍微高一些（但仍然比我们用 SSE2 或 AVX 做的差很多，甚至比使用@chrqlie 的查找的标量 table 像这样重复使用时会在 L1d 中保持热度，允许 uint64_t 中的 SWAR):

    for (int j = 0; j < 10000000; j++) {
#if 1  // extract low bit directly
        unsigned long long tmp = pLong[j];
        for (int i=0 ; i<64 ; i++) {   // while(tmp) could mispredict, but good for sparse data
            target[i] += tmp&1;
            tmp >>= 1;
        }
#else // bool -> int shifting a mask
        unsigned long m = 1;
        for (i = 0; i < 64; i++) {
            target[i]+= (pLong[j] & m) != 0;
            m = (m << 1);
        }
#endif

请注意 unsigned long 不保证运行是 64 位类型，也不在 x86-64 System V x32（64 位模式下的 ILP32）中，并且Windows x64。或者在像 i386 System V 这样的 32 位 ABI 中。

已编译 on the Godbolt compiler explorer by gcc, clang, and ICC，在 gcc 循环中少了 1 个微指令。但它们都是普通标量，clang 和 ICC 展开 2。

# clang7.0 -O3 -march=sandybridge
.LBB1_2:                            # =>This Loop Header: Depth=1
   # outer loop loads a uint64 from the src
    mov     rdx, qword ptr [r14 + 8*rbx]
    mov     rsi, -256
.LBB1_3:                            #   Parent Loop BB1_2 Depth=1
                                    # do {
    mov     edi, edx
    and     edi, 1                              # isolate the low bit
    add     dword ptr [rsi + target+256], edi   # and += into target

    mov     edi, edx
    shr     edi
    and     edi, 1                              # isolate the 2nd bit
    add     dword ptr [rsi + target+260], edi

    shr     rdx, 2                              # tmp >>= 2;

    add     rsi, 8
    jne     .LBB1_3                       # } while(offset += 8 != 0);

这比我们从 test / setnz 得到的稍微好一点。如果不展开，bt / setc 可能相等，但编译器不擅长使用 bt 来实现 bool (x & (1ULL << n))，或使用 bts 来实现 x |= 1ULL << n.

如果许多单词的最高设置位远低于位 63，则在 while(tmp) 上循环可能是一个胜利。 B运行ch 的错误预测使得它在大多数情况下只保存 ~0 到 4 次迭代是不值得的，但如果它经常保存 32 次迭代，那真的是值得的。也许在源代码中展开，所以循环只测试 tmp 每 2 次迭代（因为编译器不会为你做那个 t运行sformation），但是循环 b运行ch 可以是shr rdx, 2 / jnz.

在 Sandybridge-family 上，这是前端每 2 位输入 11 fused-domain 微指令。（add [mem], reg 使用 non-indexed 寻址模式 micro-fuses load+ALU，以及 store-address+store-data，其他都是 single-uop。 add/jcc macro-fuses。参见 Agner Fog 的指南，以及 https://whosebug.com/tags/x86/info）。所以它应该运行每 2 位 3 个周期 = 每 96 个周期一个 uint64_t。（Sandybridge 在其循环缓冲区内部没有 "unroll"，因此 non-multiple-of-4 uop 计数基本上向上舍入，这与 Haswell 和更高版本不同。

对比gcc 的 not-unrolled 版本是每 1 位 7 微指令 = 每位 2 个周期。如果您使用 gcc -O3 -march=native -fprofile-generate / test-run / gcc -O3 -march=native -fprofile-use 进行编译，profile-guided 优化将启用循环展开。

这可能比 b运行chy 版本在完美预测table 数据上慢，就像你从 memset 获得的任何重复字节模式.我建议用来自快速 PRNG 的 randomly-generated 数据填充数组，例如 SSE2 xorshift+，或者如果您只是对计数循环计时，则使用任何您想要的数据，例如 rand().

Answer 4

即使没有 AVX，也可以显着加快速度的一种方法是将数据分成最多 255 个元素的块，并在普通 uint64_t 变量中按字节累加位计数。由于源数据有 64 位，我们需要一个 8 字节累加器数组。第一个累加器计算位置 0, 8, 16, ... 56 中的位，第二个累加器计算位置 1, 9, 17, ... 57 中的位；等等。在我们处理完一个数据块后，我们将计数从字节累加器传输到 target 计数中。可以根据上面的描述以简单的方式编码用于更新最多 255 个数字的块的 target 计数的函数，其中 BITS 是源数据中的位数：

/* update the counts of 1-bits in each bit position for up to 255 numbers */
void sum_block (const uint64_t *pLong, unsigned int *target, int lo, int hi)
{
    int jj, k, kk;
    uint64_t byte_wise_sum [BITS/8] = {0};
    for (jj = lo; jj < hi; jj++) {
        uint64_t t = pLong[jj];
        for (k = 0; k < BITS/8; k++) {
            byte_wise_sum[k] += t & 0x0101010101010101;
            t >>= 1;
        }
    }
    /* accumulate byte sums into target */
    for (k = 0; k < BITS/8; k++) {
        for (kk = 0; kk < BITS; kk += 8) {
            target[kk + k] += (byte_wise_sum[k] >> kk) & 0xff;
        }
    }
}

整个 ISO-C99 程序，应该能够在至少 Windows 和 Linux 平台上运行如下所示。它使用 PRNG 初始化源数据，针对提问者的参考实现执行正确性检查，并对参考代码和加速版本进行基准测试。在我的机器 (Intel Xeon E3-1270 v2 @ 3.50 GHz) 上，当使用 MSVS 2010 以完全优化 (/Ox) 编译时，程序的输出是：

p=0000000000550040
ref took 2.020282 secs, fast took 0.027099 secs

其中ref指的是提问者的原解。这里的加速大约是 74 倍。使用其他（尤其是较新的）编译器将观察到不同的加速。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#if defined(_WIN32)
#if !defined(WIN32_LEAN_AND_MEAN)
#define WIN32_LEAN_AND_MEAN
#endif
#include <windows.h>
double second (void)
{
    LARGE_INTEGER t;
    static double oofreq;
    static int checkedForHighResTimer;
    static BOOL hasHighResTimer;

    if (!checkedForHighResTimer) {
        hasHighResTimer = QueryPerformanceFrequency (&t);
        oofreq = 1.0 / (double)t.QuadPart;
        checkedForHighResTimer = 1;
    }
    if (hasHighResTimer) {
        QueryPerformanceCounter (&t);
        return (double)t.QuadPart * oofreq;
    } else {
        return (double)GetTickCount() * 1.0e-3;
    }
}
#elif defined(__linux__) || defined(__APPLE__)
#include <stddef.h>
#include <sys/time.h>
double second (void)
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (double)tv.tv_sec + (double)tv.tv_usec * 1.0e-6;
}
#else
#error unsupported platform
#endif

/*
  From: geo <gmars...@gmail.com>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

#define N          (10000000)
#define BITS       (64)
#define BLOCK_SIZE (255)

/* cupdate the count of 1-bits in each bit position for up to 255 numbers */
void sum_block (const uint64_t *pLong, unsigned int *target, int lo, int hi)
{
    int jj, k, kk;
    uint64_t byte_wise_sum [BITS/8] = {0};
    for (jj = lo; jj < hi; jj++) {
        uint64_t t = pLong[jj];
        for (k = 0; k < BITS/8; k++) {
            byte_wise_sum[k] += t & 0x0101010101010101;
            t >>= 1;
        }
    }
    /* accumulate byte sums into target */
    for (k = 0; k < BITS/8; k++) {
        for (kk = 0; kk < BITS; kk += 8) {
            target[kk + k] += (byte_wise_sum[k] >> kk) & 0xff;
        }
    }
}

int main (void) 
{
    double start_ref, stop_ref, start, stop;
    uint64_t *pLong;
    unsigned int target_ref [BITS] = {0};
    unsigned int target [BITS] = {0};
    int i, j;

    pLong = malloc (sizeof(pLong[0]) * N);
    if (!pLong) {
        printf("failed to allocate\n");
        return EXIT_FAILURE;
    }
    printf("p=%p\n", pLong);

    /* init data */
    for (j = 0; j < N; j++) {
        pLong[j] = KISS64;
    }

    /* count bits slowly */
    start_ref = second();
    for (j = 0; j < N; j++) {
        uint64_t m = 1;
        for (i = 0; i < BITS; i++) {
            if ((pLong[j] & m) == m) {
                target_ref[i]++;
            }
            m = (m << 1);
        }
    }
    stop_ref = second();

    /* count bits fast */
    start = second();
    for (j = 0; j < N / BLOCK_SIZE; j++) {
        sum_block (pLong, target, j * BLOCK_SIZE, (j+1) * BLOCK_SIZE);
    }
    sum_block (pLong, target, j * BLOCK_SIZE, N);
    stop = second();

    /* check whether result is correct */
    for (i = 0; i < BITS; i++) {
        if (target[i] != target_ref[i]) {
            printf ("error @ %d: res=%u ref=%u\n", i, target[i], target_ref[i]);
        }
    }

    /* print benchmark results */
    printf("ref took %f secs, fast took %f secs\n", stop_ref - start_ref, stop - start);
    return EXIT_SUCCESS;
}

Answer 5

见

Efficient Computation of Positional Population Counts Using SIMD Instructions Marcus D. R. Klarqvist、Wojciech Muła、Daniel Lemire（2019 年 11 月 7 日）

Faster Population Counts using AVX2 Instructions 作者：Wojciech Muła、Nathan Kurz、Daniel Lemire（2016 年 11 月 23 日）。

基本上，每个全加器将3个输入压缩为2个输出。因此，可以用 5 条逻辑指令的价格消除整个 256 位字。可以重复全加器操作，直到寄存器用完为止。然后累积寄存器中的结果（如大多数其他答案所示）。

16 位子字的位置 popcnt 在这里实现： https://github.com/mklarqvist/positional-popcount

// Carry-Save Full Adder (3:2 compressor)
b ^= a;
a ^= c;
c ^= b; // xor sum
b |= a;
b ^= c; // carry

注意：positional-popcnt 的累积步骤比普通 simd popcnt 更昂贵。我相信这使得在 CSU 的末尾添加几个半加器变得可行，在累积之前一直到 256 个字可能会付出代价。

使用 AVX 但不使用 AVX2，分别计算许多 64 位位掩码上的每个位位置

Count each bit-position separately over many 64-bit bitmasks, with AVX but not AVX2

c

optimization

x86

x86-64

simd

更好的策略是逐渐扩大

过时的代码：对 bit-loop