是否存在非循环无符号32位整数平方根函数C

Question

我见过浮点位黑客来生成平方根，如此处所示 fast floating point square root，但此方法适用于浮点数。

有没有类似的方法求一个32位无符号整数的无环整数平方根？我一直在网上搜索一个，但没有看到

（我的想法是，纯二进制表示没有足够的信息来做到这一点，但由于它被限制为 32 位，我猜不是这样）

Answer 1

没有。您需要在某处引入日志；由于位表示中的对数，快速浮点平方根有效。

最快的方法可能是查找 table n -> floor(sqrt(n))。您不会将所有值存储在 table 中，而只会存储平方根发生变化的值。使用二进制搜索在 log(n) 时间中找到 table 中的结果。

Answer 2

此答案假定目标平台不支持浮点，或者支持非常慢的浮点（可能通过仿真）。

正如评论中所指出的，计数前导零 (CLZ) 指令可用于提供快速 log₂ 功能，该功能通过浮动的指数部分提供-点操作数。 CLZ 也可以在不通过内在函数提供功能的平台上以合理的效率进行模拟，如下所示。

可以从查找 table (LUT) 中提取适合几个位的初始近似值，就像在浮点情况下一样，可以通过牛顿迭代进一步细化。对于 32 位整数平方根，一到两次迭代通常就足够了。下面的 ISO-C99 代码显示了包括详尽测试在内的工作示例性实施。

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <math.h>

uint8_t clz (uint32_t a); // count leading zeros
uint32_t umul_16_16 (uint16_t a, uint16_t b); // 16x16 bit multiply
uint16_t udiv_32_16 (uint32_t x, uint16_t y); // 32/16 bit division

/* LUT for initial square root approximation */
static const uint16_t sqrt_tab[32] = 
{ 
    0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    0x85ff, 0x8cff, 0x94ff, 0x9aff, 0xa1ff, 0xa7ff, 0xadff, 0xb3ff,
    0xb9ff, 0xbeff, 0xc4ff, 0xc9ff, 0xceff, 0xd3ff, 0xd8ff, 0xdcff, 
    0xe1ff, 0xe6ff, 0xeaff, 0xeeff, 0xf3ff, 0xf7ff, 0xfbff, 0xffff
};

/* table lookup for initial guess followed by division-based Newton iteration */
uint16_t my_isqrt (uint32_t x)
{
    uint16_t q, lz, y, i, xh;

    if (x == 0) return x; // early out, code below can't handle zero

    // initial guess based on leading 5 bits of argument normalized to 2.30
    lz = clz (x);
    i = ((x << (lz & ~1)) >> 27);
    y = sqrt_tab[i] >> (lz >> 1);
    xh = x >> 16; // use for overflow check on divisions

    // first Newton iteration, guard against overflow in division
    q = 0xffff;
    if (xh < y) q = udiv_32_16 (x, y);
    y = (q + y) >> 1;

    if (lz < 10) {
        // second Newton iteration, guard against overflow in division
        q = 0xffff;
        if (xh < y) q = udiv_32_16 (x, y);
        y = (q + y) >> 1;
    }

    if (umul_16_16 (y, y) > x) y--; // adjust quotient if too large

    return y; // (uint16_t)sqrt((double)x)
}

static const uint8_t clz_tab[32] = 
{
    31, 22, 30, 21, 18, 10, 29,  2, 20, 17, 15, 13, 9,  6, 28, 1,
    23, 19, 11,  3, 16, 14,  7, 24, 12,  4,  8, 25, 5, 26, 27, 0
};

/* count leading zeros (for non-zero argument); a machine instruction on many architectures */
uint8_t clz (uint32_t a)
{
    a |= a >> 16;
    a |= a >> 8;
    a |= a >> 4;
    a |= a >> 2;
    a |= a >> 1;
    return clz_tab [0x07c4acdd * a >> 27];
}

/* 16x16->32 bit unsigned multiply; machine instruction on many architectures */
uint32_t umul_16_16 (uint16_t a, uint16_t b)
{
    return (uint32_t)a * b;
}

/* 32/16->16 bit division. Note: Will overflow if x[31:16] >= y */
uint16_t udiv_32_16 (uint32_t x, uint16_t y)
{
    uint16_t r = x / y;
    return r;
}

int main (void)
{
    uint32_t x;
    uint16_t res, ref;
    
    printf ("testing 32-bit integer square root\n");
    x = 0;
    do {
        ref = (uint16_t)sqrt((double)x);
        res = my_isqrt (x);
        if (res != ref) {
            printf ("error: x=%08x  res=%08x  ref=%08x\n", x, res, ref);
            printf ("exhaustive test FAILED\n");
            return EXIT_FAILURE;
        }
        x++;
    } while (x);
    printf ("exhaustive test PASSED\n");
    return EXIT_SUCCESS;
}

Answer 3

下面是 given by @njuffa, including some that are not just loop-free, but also branch-free (the ones that aren't branch free are almost branch free). All of the variants below are derived from the algorithm that's used internally for Python's math.isqrt 的一些变体。变体是：

32 位平方根：几乎无分支，1 个除法
32 位平方根：无分支，1 个除法
32 位平方根：无分支，3 个除法，无查找 table
64 位平方根：几乎无分支，2 个除法
64 位平方根：无分支，4 个除法，无查找 table

最后，我们为无符号 64 位输入提供了一个非常快速的除法自由（并且仍然无循环，几乎无分支）平方根算法。问题在于它只适用于已知为正方形的输入；对于非方形输入，它将给出无用的结果。在我的 2018 2.7 GHz Intel Core i7 笔记本电脑上，它设法在大约 3.3 纳秒内计算出每个平方根。一直向下滚动到答案的底部以查看它。

32 位平方根：几乎无分支，1 个除法

第一个变体是我推荐的变体，其他条件相同。它使用：

单次查找到 192 字节查找 table
具有 16 位结果的单个 32 位 x 16 位除法
具有 32 位结果的单个 16 位乘 16 位乘法
计数前导零操作，以及一些移位和其他廉价操作

在早期 return 案例 x == 0 之后，它基本上是无分支的。（在最后的更正步骤中有一个分支的提示，但我尝试的编译器设法为此提供了无跳转代码。）

这是代码。解释如下。

#include <stdint.h>

// count leading zeros of nonzero 32-bit unsigned integer
int clz32(uint32_t x);

// isqrt32_tab[k] = isqrt(256*(k+64)-1) for 0 <= k < 192
static const uint8_t isqrt32_tab[192] = {
    127, 128, 129, 130, 131, 132, 133, 134, 135, 136,
    137, 138, 139, 140, 141, 142, 143, 143, 144, 145,
    146, 147, 148, 149, 150, 150, 151, 152, 153, 154,
    155, 155, 156, 157, 158, 159, 159, 160, 161, 162,
    163, 163, 164, 165, 166, 167, 167, 168, 169, 170,
    170, 171, 172, 173, 173, 174, 175, 175, 176, 177,
    178, 178, 179, 180, 181, 181, 182, 183, 183, 184,
    185, 185, 186, 187, 187, 188, 189, 189, 190, 191,
    191, 192, 193, 193, 194, 195, 195, 196, 197, 197,
    198, 199, 199, 200, 201, 201, 202, 203, 203, 204,
    204, 205, 206, 206, 207, 207, 208, 209, 209, 210,
    211, 211, 212, 212, 213, 214, 214, 215, 215, 216,
    217, 217, 218, 218, 219, 219, 220, 221, 221, 222,
    222, 223, 223, 224, 225, 225, 226, 226, 227, 227,
    228, 229, 229, 230, 230, 231, 231, 232, 232, 233,
    234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
    239, 239, 240, 241, 241, 242, 242, 243, 243, 244,
    244, 245, 245, 246, 246, 247, 247, 248, 248, 249,
    249, 250, 250, 251, 251, 252, 252, 253, 253, 254,
    254, 255,
};

// integer square root of 32-bit unsigned integer
uint16_t isqrt32(uint32_t x)
{
    if (x == 0) return 0;

    int lz = clz32(x) & 30;
    x <<= lz;
    uint16_t y = 1 + isqrt32_tab[(x >> 24) - 64];
    y = (y << 7) + (x >> 9) / y;
    y -= x < (uint32_t)y * y;
    return y >> (lz >> 1);
}

上面，我们省略了函数clz32的定义，实现起来和@njuffa的post中的clz完全一样。使用 gcc 或 Clang，您可以使用 __builtin_clz 之类的东西代替 clz32。如果您必须自己动手，请从@njuffa 的回答中窃取它。

解释：在处理了特殊情况 x = 0 之后，我们首先对 x 进行归一化，平移一个偶数（有效地乘以 4 的幂）使得 2**30 <= x < 2**32。然后我们计算 x 的整数平方根 y，然后将 y 移回以补偿 returning.

这将我们带到中间的三行代码，它们是算法的核心：

    uint16_t y = 1 + isqrt32_tab[(x >> 24) - 64];
    y = (y << 7) + (x >> 9) / y;
    y -= x < (uint32_t)y * y;

值得注意的是，在假设 2**30 <= x < 2**32.[=106= 的情况下，这三行是计算 32 位整数 x 的整数平方根 y 所需的全部内容]

三行中的第一行使用查找 table 来检索 x、x >> 16 的最高 16 位的平方根的近似值。近似总会有小于1的误差：即第一行执行后，条件：

(y-1) * (y-1) < (x >> 16) && (x >> 16) < (y+1) * (y+1)

永远是真的。

第二行是最有趣的：在执行该行之前，y 是 x >> 16 的平方根的近似值，精度约为 7-8 位。因此 y << 8 是 x 的平方根的近似值，同样具有大约 7-8 个精确位。因此，应用于 y << 8 的单个牛顿迭代应该给出 更好的 近似 x，大致将准确位数加倍，这正是第二行计算的结果.真正美妙的部分是，如果你通过数学计算，结果证明你可以再次证明结果 y 的误差小于 1：即条件

(y-1) * (y-1) < x && x < (y+1) * (y+1)

会是真的。这意味着 y*y <= x 和 y 已经是 x 的整数平方根，或者 x < y*y 和 y - 1是 x 的整数平方根。因此，第三行在 x < y*y.

的情况下将 -1 更正应用于 y

上面有一个稍微微妙的地方。如果随着算法的进行，您遵循 y 的可能大小：在查找之后我们有 128 <= y <= 256。在第二行之后，数学上我们有 32768 <= y <= 65536。但是如果 y = 65536 那么我们就有问题了，因为 y 不能再存储在 uint16_t 中。然而，有可能证明这不可能发生：粗略地说，y 最终变得那么大的唯一方法是如果查找的结果是 256，在这种情况下，它是直接看到下一行最多只能产生 65535.

32 位平方根：无分支，1 个除法

上面的算法非常接近完全无分支。这是一个真正无分支的变体，至少对于我试过的编译器是这样。它使用与前面示例相同的 clz32 和查找 table。

uint16_t isqrt32(uint32_t x)
{
    int lz = clz32(x | 1) & 30;
    x <<= lz;
    int index = (x >> 24) - 64;
    uint16_t y = 1 + sqrt32_tab[index >= 0 ? index : 0];
    y = (y << 7) + (x >> 9) / y;
    y -= x < (uint32_t)y * y;
    return y >> (lz >> 1);
}

这里我们只是让特殊情况 x == 0 像往常一样通过算法传播。我们计算 clz32(x | 1) 代替 clz32(x)，部分原因是 clz32(0) 可能定义不明确（例如，它不适用于 gcc 的 __builtin_clz），部分原因是即使如果定义明确，32 的结果偏移将在下一行的 x <<= lz 中给出未定义的行为。我们必须调整查找以纠正可能为负的查找索引。（我们也可以将查找 table 从 192 个条目扩展到 256 个条目以适应这种情况，但这似乎很浪费。）大多数平台应该有足够聪明的编译器来将 index >= 0 ? index : 0 变成无分支的东西。一些架构甚至提供了饱和减法指令。

32 位平方根：无分支，3 个除法，无查找 table

下一个变体不需要查找 table，但使用三个除法而不是一个。

uint16_t isqrt32(uint32_t x)
{
    int lz = clz32(x | 1) & 30;
    x <<= lz;
    uint32_t y = 1 + (x >> 30);
    y = (y << 1) + (x >> 27) / y;
    y = (y << 3) + (x >> 21) / y;
    y = (y << 7) + (x >> 9) / y;
    y -= x < (uint32_t)y * y;
    return y >> (lz >> 1);
}

这个想法和以前完全一样，除了我们从更小的近似值开始。

在初始行 uint32_t y = 1 + (x >> 30); 之后，y 是 x >> 28 的平方根的近似值。
在y = (y << 1) + (x >> 27) / y;之后，y是x的最高八位的平方根的近似值，x >> 24
在y = (y << 3) + (x >> 21) / y;之后，y是对 x、x >> 16.
算法的其余部分像以前一样进行。

64 位平方根：几乎无分支，2 个除法

OP 要求提供无符号 32 位平方根，但上述技术同样适用于计算无符号 64 位整数的整数平方根。

这是针对这种情况的一些代码，与 Python 的 math.isqrt 用于小于 2**64 的输入的代码基本相同（并且还为更大的输入）。

#include <stdint.h>

// count leading zeros of nonzero 64-bit unsigned integer
int clz64(uint64_t x);

// isqrt64_tab[k] = isqrt(256*(k+65)-1) for 0 <= k < 192
static const uint8_t isqrt64_tab[192] = {
    128, 129, 130, 131, 132, 133, 134, 135, 136, 137,
    138, 139, 140, 141, 142, 143, 143, 144, 145, 146,
    147, 148, 149, 150, 150, 151, 152, 153, 154, 155,
    155, 156, 157, 158, 159, 159, 160, 161, 162, 163,
    163, 164, 165, 166, 167, 167, 168, 169, 170, 170,
    171, 172, 173, 173, 174, 175, 175, 176, 177, 178,
    178, 179, 180, 181, 181, 182, 183, 183, 184, 185,
    185, 186, 187, 187, 188, 189, 189, 190, 191, 191,
    192, 193, 193, 194, 195, 195, 196, 197, 197, 198,
    199, 199, 200, 201, 201, 202, 203, 203, 204, 204,
    205, 206, 206, 207, 207, 208, 209, 209, 210, 211,
    211, 212, 212, 213, 214, 214, 215, 215, 216, 217,
    217, 218, 218, 219, 219, 220, 221, 221, 222, 222,
    223, 223, 224, 225, 225, 226, 226, 227, 227, 228,
    229, 229, 230, 230, 231, 231, 232, 232, 233, 234,
    234, 235, 235, 236, 236, 237, 237, 238, 238, 239,
    239, 240, 241, 241, 242, 242, 243, 243, 244, 244,
    245, 245, 246, 246, 247, 247, 248, 248, 249, 249,
    250, 250, 251, 251, 252, 252, 253, 253, 254, 254,
    255, 255,
};

// integer square root of a 64-bit unsigned integer
uint32_t isqrt64(uint64_t x)
{
    if (x == 0) return 0;

    int lz = clz64(x) & 62;
    x <<= lz;
    uint32_t y = isqrt64_tab[(x >> 56) - 64];
    y = (y << 7) + (x >> 41) / y;
    y = (y << 15) + (x >> 17) / y;
    y -= x < (uint64_t)y * y;
    return y >> (lz >> 1);
}

以上代码假设存在一个函数 clz64 用于计算 uint64_t 值中的前导零。在 x == 0 情况下，函数 clz64 允许有未定义的行为。同样，在 gcc 和 Clang 上，__builtin_clzll 应该可以用来代替 clz64，假设 unsigned long long 的宽度为 64。为了完整起见，这里有一个 clz64 的实现，它基于通常的 de Bruijn 序列技巧。

#include <assert.h>

static const uint8_t clz64_tab[64] = {
    63,  5, 62,  4, 16, 10, 61,  3, 24, 15, 36,  9, 30, 21, 60,  2,
    12, 26, 23, 14, 45, 35, 43,  8, 33, 29, 52, 20, 49, 41, 59,  1,
     6, 17, 11, 25, 37, 31, 22, 13, 27, 46, 44, 34, 53, 50, 42,  7,
    18, 38, 32, 28, 47, 54, 51, 19, 39, 48, 55, 40, 56, 57, 58,  0,
};

// count leading zeros of nonzero 64-bit unsigned integer. Analogous to the
// 32-bit version at
// https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn.
int clz64(uint64_t x)
{
    assert(x);
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    x |= x >> 32;
    return clz64_tab[(uint64_t)(x * 0x03f6eaf2cd271461u) >> 58];
}

在没有超级计算机的情况下，对 64 位输入进行详尽的测试不再合理，而是检查 s*s、s*s + s 和 s*s + 2*s 形式的所有输入0 <= s < 2**32 是可行的，并且让人相信代码可以正常工作。以下代码执行该检查。在我的基于英特尔酷睿 i7 的笔记本电脑上运行完成大约需要 148 秒，计算得出每平方根计算大约需要 11.5 纳秒。如果我用 __builtin_clzll 替换自定义 clz64 实现，则每平方根大约为 9.2ns。（当然，这两个时间仍然包括测试代码的开销。）

#include <stdio.h>

int check_isqrt64(uint64_t x) {
    uint64_t y = isqrt64(x);
    int y_ok = y*y <= x && x - y*y <= 2*y;
    if (!y_ok) {
        printf("isqrt64(%llu) returned incorrect answer %llu\n", x, y);
    }
    return y_ok;
}

int main(void)
{
    printf("Checking isqrt64 for selected values in [0, 2**64) ...\n");
    for (uint64_t s = 0; s < 0x100000000u; s++) {
        if (!check_isqrt64(s*s)) return 1;
        if (!check_isqrt64(s*s + s)) return 1;
        if (!check_isqrt64(s*s + 2*s)) return 1;
    };
    printf("All tests passed\n");
    return 0;
}

64 位平方根：无分支，4 个除法，无查找 table

最终变体为 64 位输入提供了无分支、无查找 table 的整数平方根实现，只是为了证明这是可能的。它需要4个部门。在大多数机器上，这些除法会很慢，以至于这个变体比以前的变体慢。

uint32_t isqrt64(uint64_t x)
{
    int lz = clz64(x | 1) & 62;
    x <<= lz;
    uint32_t y = 2 + (x >> 63);
    y = (y << 1) + (x >> 59) / y;
    y = (y << 3) + (x >> 53) / y;
    y = (y << 7) + (x >> 41) / y;
    y = (y << 15) + (x >> 17) / y;
    y -= x < (uint64_t)y * y;
    return y >> (lz >> 1);
}

精确平方的64位平方根；无除法！

最后，正如所承诺的，这是一个与上述算法略有不同的算法。它计算已知为平方的输入的平方根，对输入的平方根倒数使用牛顿迭代，并在 2-adic 域而不是实数域中运行。它不需要除法，但它使用查找 table，并依赖 GCC 的 __builtin_ctzl 内在函数来有效地计算尾随零。

#include <stdint.h>

static const uint8_t lut[128] = {
    0, 85, 83, 102, 71, 2, 36, 126, 15, 37, 28, 22, 87, 50, 107, 46,
    31, 10, 115, 57, 103, 98, 4, 33, 47, 58, 3, 118, 119, 109, 116, 113,
    63, 106, 108, 38, 120, 61, 27, 62, 79, 101, 35, 41, 104, 13, 84, 17,
    95, 53, 76, 121, 88, 34, 59, 97, 111, 5, 67, 54, 72, 82, 52, 78,
    127, 42, 44, 25, 56, 125, 91, 1, 112, 90, 99, 105, 40, 77, 20, 81,
    96, 117, 12, 70, 24, 29, 123, 94, 80, 69, 124, 9, 8, 18, 11, 14,
    64, 21, 19, 89, 7, 66, 100, 65, 48, 26, 92, 86, 23, 114, 43, 110,
    32, 74, 51, 6, 39, 93, 68, 30, 16, 122, 60, 73, 55, 45, 75, 49,
};

uint32_t isqrt64_exact(uint64_t n)
{
    uint32_t m, k, x, b;

    if (n == 0)
        return 0;

    int j = __builtin_ctzl(n);
    n >>= j;
    m = (uint32_t)n;
    k = (uint32_t)(n >> 2);
    x = lut[k >> 1 & 127];
    x += (m * x * ~x - k) * (x - ~x);
    x += (m * x * ~x - k) * (x - ~x);
    b = m * x + 2 * k;
    b ^= -(b >> 31);
    return (b - ~b) << (j >> 1);
}

对于纯粹主义者来说，不难将其重写为完全无分支，消除查找table，并使代码完全由[=214] =] 和 C99 兼容。这是一种方法：

#include <stdint.h>

uint32_t isqrt64_exact(uint64_t n)
{
    uint64_t t = (n & -n) - 1 & 0x5555555555555555U;
    t = t + (t >> 2) & 0x3333333333333333U;
    t = t + (t >> 4) & 0x0f0f0f0f0f0f0f0fU;
    int j = (uint64_t)(t * 0x0101010101010101U) >> 56;
    n >>= 2 * j;

    uint32_t m, k, x, b;
    m = (uint32_t)n;
    k = (uint32_t)(n >> 2);
    x = k * ~k;
    x += (m * x * ~x - k) * (x - ~x);
    x += (m * x * ~x - k) * (x - ~x);
    x += (m * x * ~x - k) * (x - ~x);
    b = m * x + 2 * k;
    b ^= -(b >> 31);
    return (2 * b | (m & 1)) << j;
}

有关此代码如何工作的完整说明以及进行详尽测试的代码，请参阅 https://gist.github.com/mdickinson/e087001d213725a93eeb8d8f447a2f40 上的 GitHub 要点。