使用内部函数(intrinsics)对 128、256、512 位寄存器进行全局位移?
Global bitwise shift of 128-, 256-, 512-bit registers using intrinsics?
考虑一个 64 位无符号整数数组,例如:
std::array<unsigned long long int, 20> a;
把这个数组当作一个单独的大整数来看待,对它执行全局位移(右移或左移)的最快方法是什么?可以使用 intel or compiler intrinsics (this or that),编译器为 g++ 5.3。
您可能需要看一下 std::bitset,它是一个位数在编译时已知的位容器。如果我对您问题的理解是正确的,这正是您试图用数组来模拟的东西。bitset 类重载了 >> 和 << 运算符来执行位移,而且这些实现很可能已经在您所用的编译器/标准库组合中做过优化。
下面是一些通过内部函数使用 xmm 和 ymm 寄存器实现的 x86 左移函数。据此写出相应的右移函数应该不会太难。这些函数取自一个软件 LFSR 基准测试(a software LFSR benchmark):
//----------------------------------------------------------------------------
// bit shift left a 128-bit value using xmm registers
// The value is handled as two 64-bit lanes, least significant lane first.
// __m128i *data - data to shift (updated in place)
// int count - number of bits to shift, valid range 0..63
// return: __m128i - the bits shifted out of the top of the value, right
//                   aligned in the low qword; the high qword is zero
static __m128i bitShiftLeft128xmm (__m128i *data, int count)
{
    // Bits leaving the top of each qword, right aligned within that qword.
    // count is a run-time value, so the variable-count shift form is used.
    // A shift count above 63 yields zero, hence count == 0 gives no carries.
    __m128i carries = _mm_srl_epi64 (*data, _mm_cvtsi32_si128 (64 - count));

    // Move the carries as whole qwords (byte shifts of 8 bytes). A carry is
    // up to 63 bits wide, so moving only its low dword would be wrong for
    // any count above 32.
    __m128i carryOut = _mm_srli_si128 (carries, 8);   // high-qword carry -> low qword, high qword zero
    __m128i innerCarry = _mm_slli_si128 (carries, 8); // low-qword carry -> high qword, low qword zero

    // Shift both qwords left, then drop the low-qword carry into the bits
    // the shift just vacated at the bottom of the high qword.
    __m128i shifted = _mm_sll_epi64 (*data, _mm_cvtsi32_si128 (count));
    *data = _mm_or_si128 (shifted, innerCarry);
    return carryOut;
}
//----------------------------------------------------------------------------
// bit shift left a 256-bit value using xmm registers
// __m128i *data - data to shift, ls part stored first
// int count - number of bits to shift
// return: __m128i - carry out bit(s)
static __m128i bitShiftLeft256xmm (__m128i *data, int count)
{
__m128i carryOut0, carryOut1;
carryOut0 = bitShiftLeft128xmm (&data [0], count);
carryOut1 = bitShiftLeft128xmm (&data [1], count);
data [1] = _mm_or_si128 (data [1], carryOut0);
return carryOut1;
}
//----------------------------------------------------------------------------
// bit shift left a 512-bit value using xmm registers
// __m128i *data - data to shift, ls part stored first
// int count - number of bits to shift
// return: __m128i - carry out bit(s)
static __m128i bitShiftLeft512xmm (__m128i *data, int count)
{
__m128i carryOut0, carryOut1;
carryOut0 = bitShiftLeft256xmm (&data [0], count);
carryOut1 = bitShiftLeft256xmm (&data [2], count);
data [2] = _mm_or_si128 (data [2], carryOut0);
return carryOut1;
}
//----------------------------------------------------------------------------
// Shift a 256-bit value left using a single ymm register (AVX2 intrinsics).
// The value is handled as four 64-bit lanes, least significant lane first.
// __m256i *data - data to shift (updated in place)
// int count - number of bits to shift
// return: __m256i - the bits shifted out of the top qword, right aligned in
//                   the lowest qword; all other qwords are zero
static __m256i bitShiftLeft256ymm (__m256i *data, int count)
{
    const __m256i zero = _mm256_setzero_si256 ();
    const __m256i value = *data;

    // Bits each qword loses at its top, right aligned inside that qword.
    const __m256i spill = _mm256_srli_epi64 (value, 64 - count);

    // Rotate the register up by one qword: 0x93 picks source qwords 3,0,1,2
    // for destination qwords 0,1,2,3.
    const __m256i rotated = _mm256_permute4x64_epi64 (spill, 0x93);

    // dwords 2..7 of the rotation are the carries into qwords 1..3.
    const __m256i carryIn = _mm256_blend_epi32 (zero, rotated, 0xFC);
    // dwords 0..1 of the rotation are what fell off the top qword.
    const __m256i carryOut = _mm256_blend_epi32 (zero, rotated, 0x03);

    // Shift every qword left and merge in the carry from the qword below.
    *data = _mm256_or_si256 (_mm256_slli_epi64 (value, count), carryIn);
    return carryOut;
}
//----------------------------------------------------------------------------
// bit shift left a 512-bit value using ymm registers
// __m256i *data - data to shift, ls part stored first
// int count - number of bits to shift
// return: __m256i - carry out bit(s)
static __m256i bitShiftLeft512ymm (__m256i *data, int count)
{
__m256i carryOut0, carryOut1;
carryOut0 = bitShiftLeft256ymm (&data [0], count);
carryOut1 = bitShiftLeft256ymm (&data [1], count);
data [1] = _mm256_or_si256 (data [1], carryOut0);
return carryOut1;
}
//----------------------------------------------------------------------------
考虑一个 64 位无符号整数数组,例如:
std::array<unsigned long long int, 20> a;
把这个数组当作一个单独的大整数来看待,对它执行全局位移(右移或左移)的最快方法是什么?可以使用 intel or compiler intrinsics (this or that),编译器为 g++ 5.3。
您可能需要看一下 std::bitset,它是一个位数在编译时已知的位容器。如果我对您问题的理解是正确的,这正是您试图用数组来模拟的东西。bitset 类重载了 >> 和 << 运算符来执行位移,而且这些实现很可能已经在您所用的编译器/标准库组合中做过优化。
下面是一些通过内部函数使用 xmm 和 ymm 寄存器实现的 x86 左移函数。据此写出相应的右移函数应该不会太难。这些函数取自一个软件 LFSR 基准测试(a software LFSR benchmark):
//----------------------------------------------------------------------------
// bit shift left a 128-bit value using xmm registers
// The value is handled as two 64-bit lanes, least significant lane first.
// __m128i *data - data to shift (updated in place)
// int count - number of bits to shift, valid range 0..63
// return: __m128i - the bits shifted out of the top of the value, right
//                   aligned in the low qword; the high qword is zero
static __m128i bitShiftLeft128xmm (__m128i *data, int count)
{
    // Bits leaving the top of each qword, right aligned within that qword.
    // count is a run-time value, so the variable-count shift form is used.
    // A shift count above 63 yields zero, hence count == 0 gives no carries.
    __m128i carries = _mm_srl_epi64 (*data, _mm_cvtsi32_si128 (64 - count));

    // Move the carries as whole qwords (byte shifts of 8 bytes). A carry is
    // up to 63 bits wide, so moving only its low dword would be wrong for
    // any count above 32.
    __m128i carryOut = _mm_srli_si128 (carries, 8);   // high-qword carry -> low qword, high qword zero
    __m128i innerCarry = _mm_slli_si128 (carries, 8); // low-qword carry -> high qword, low qword zero

    // Shift both qwords left, then drop the low-qword carry into the bits
    // the shift just vacated at the bottom of the high qword.
    __m128i shifted = _mm_sll_epi64 (*data, _mm_cvtsi32_si128 (count));
    *data = _mm_or_si128 (shifted, innerCarry);
    return carryOut;
}
//----------------------------------------------------------------------------
// bit shift left a 256-bit value using xmm registers
// __m128i *data - data to shift, ls part stored first
// int count - number of bits to shift
// return: __m128i - carry out bit(s)
static __m128i bitShiftLeft256xmm (__m128i *data, int count)
{
__m128i carryOut0, carryOut1;
carryOut0 = bitShiftLeft128xmm (&data [0], count);
carryOut1 = bitShiftLeft128xmm (&data [1], count);
data [1] = _mm_or_si128 (data [1], carryOut0);
return carryOut1;
}
//----------------------------------------------------------------------------
// bit shift left a 512-bit value using xmm registers
// __m128i *data - data to shift, ls part stored first
// int count - number of bits to shift
// return: __m128i - carry out bit(s)
static __m128i bitShiftLeft512xmm (__m128i *data, int count)
{
__m128i carryOut0, carryOut1;
carryOut0 = bitShiftLeft256xmm (&data [0], count);
carryOut1 = bitShiftLeft256xmm (&data [2], count);
data [2] = _mm_or_si128 (data [2], carryOut0);
return carryOut1;
}
//----------------------------------------------------------------------------
// Shift a 256-bit value left using a single ymm register (AVX2 intrinsics).
// The value is handled as four 64-bit lanes, least significant lane first.
// __m256i *data - data to shift (updated in place)
// int count - number of bits to shift
// return: __m256i - the bits shifted out of the top qword, right aligned in
//                   the lowest qword; all other qwords are zero
static __m256i bitShiftLeft256ymm (__m256i *data, int count)
{
    const __m256i zero = _mm256_setzero_si256 ();
    const __m256i value = *data;

    // Bits each qword loses at its top, right aligned inside that qword.
    const __m256i spill = _mm256_srli_epi64 (value, 64 - count);

    // Rotate the register up by one qword: 0x93 picks source qwords 3,0,1,2
    // for destination qwords 0,1,2,3.
    const __m256i rotated = _mm256_permute4x64_epi64 (spill, 0x93);

    // dwords 2..7 of the rotation are the carries into qwords 1..3.
    const __m256i carryIn = _mm256_blend_epi32 (zero, rotated, 0xFC);
    // dwords 0..1 of the rotation are what fell off the top qword.
    const __m256i carryOut = _mm256_blend_epi32 (zero, rotated, 0x03);

    // Shift every qword left and merge in the carry from the qword below.
    *data = _mm256_or_si256 (_mm256_slli_epi64 (value, count), carryIn);
    return carryOut;
}
//----------------------------------------------------------------------------
// bit shift left a 512-bit value using ymm registers
// __m256i *data - data to shift, ls part stored first
// int count - number of bits to shift
// return: __m256i - carry out bit(s)
static __m256i bitShiftLeft512ymm (__m256i *data, int count)
{
__m256i carryOut0, carryOut1;
carryOut0 = bitShiftLeft256ymm (&data [0], count);
carryOut1 = bitShiftLeft256ymm (&data [1], count);
data [1] = _mm256_or_si256 (data [1], carryOut0);
return carryOut1;
}
//----------------------------------------------------------------------------