ARM Cortex M0+：如何在 C 代码中使用 "Branch if Carry" 指令？

Question

我有一些 C 代码可以逐位处理数据。简化示例：

// input data, assume this is initialized
uint32_t data[len];

for (uint32_t idx=0; idx<len; idx++)
{
    uint32_t tmp = data[idx];
    
    // iterate over all bits
    for (uint8_t pos=0; pos<32; pos++)
    {
        if (tmp & 0b1)
        {
            // some code
        }
    
        tmp = tmp >> 1;
    }
}

在我的应用中len比较大，所以我想尽可能优化内循环。 // some code 部分很小并且已经过大量优化。

我使用的是 ARM Cortex M0+ MCU，如果设置了进位位，它有一个分支指令（参见 cortex-m0+ manual，第 45 页）。方便地移位位将 LSB（或 MSB）放入进位标志，因此理论上它可以在没有比较的情况下分支，如下所示：

// input data, assume this is initialized
uint32_t data[len];

for (uint32_t idx=0; idx<len; idx++)
{
    uint32_t tmp = data[idx];

    // iterate over all bits
    for (uint8_t pos=0; pos<32; pos++)
    {
        tmp = tmp >> 1;

        if ( CARRY_SET )
        {
            // some code
        }
    }
}

使用 C 代码 and/or 内联汇编程序存档的最佳方法是什么？理想情况下，为了简单和更好的可读性，我想在 C 中保留 // come code。

编辑 1：我已经在 ~~GCC 5.4~~ GCC 6.3 上使用 -O1、-O2 和 -03 测试了这段代码。对于每个设置，它都会生成以下汇编代码（请注意我尝试获取的专用 tst 指令）：

        if (data & 0b1)             
00000218   movs r3, #1       
0000021A   tst  r3, r6       
0000021C   beq  #4

编辑 2：最小可重现示例。我正在 Atmel Studio 7 中编写代码（因为它适用于 MCU）并检查内置调试器的值。如果你使用不同的环境，你可能需要添加som IO代码：

int main(void)
{
    uint32_t tmp = 0x12345678;
    volatile uint8_t bits = 0;  // volatile needed in this example to prevent compiler from optimizing away all code.

    // iterate over all bits
    for (uint8_t pos=0; pos<32; pos++)
    {
        if (tmp & 1)
        {
            bits++;    // the real code isn't popcount.  Some compilers may transform this example loop into a different popcount algorithm if bits wasn't volatile.
        }
        tmp = tmp >> 1;
    }

    // read bits here with debugger
    while(1);
}

Answer 1

如果您有特定的代码，那么...您只需编写即可。从编译后的代码开始，并根据需要进行手动优化。编译器无法读懂你的想法。

gcc 5.x.x 是 gnu 在代码输出方面达到顶峰的地方，从那以后它就走下坡路了。但这并不意味着该版本总是比新版本更好。如果您想让编译器为您完成工作，godbolt 或只是在您的计算机上安装各种软件会有所帮助。

unsigned int fun ( unsigned int tmp )
{
    unsigned int bits;
    bits=0;
    for (unsigned char pos=0; pos<32; pos++)
    {
        if (tmp & 1)
        {
            bits++;
        }
        tmp = tmp >> 1;
    }
    return(bits);
}

位为 32 位

.text 部分的反汇编：

00000000 <fun>:
   0:   0002        movs    r2, r0
   2:   b510        push    {r4, lr}
   4:   2320        movs    r3, #32
   6:   2000        movs    r0, #0
   8:   2401        movs    r4, #1
   a:   0021        movs    r1, r4
   c:   3b01        subs    r3, #1
   e:   4011        ands    r1, r2
  10:   b2db        uxtb    r3, r3
  12:   1840        adds    r0, r0, r1
  14:   0852        lsrs    r2, r2, #1
  16:   2b00        cmp r3, #0
  18:   d1f7        bne.n   a <fun+0xa>
  1a:   bd10        pop {r4, pc}

r4 在循环外设置一次

将位作为 8 位值

Disassembly of section .text:

00000000 <fun>:
   0:   0002        movs    r2, r0
   2:   2320        movs    r3, #32
   4:   2000        movs    r0, #0
   6:   2101        movs    r1, #1
   8:   4211        tst r1, r2
   a:   d001        beq.n   10 <fun+0x10>
   c:   3001        adds    r0, #1
   e:   b2c0        uxtb    r0, r0
  10:   3b01        subs    r3, #1
  12:   b2db        uxtb    r3, r3
  14:   0852        lsrs    r2, r2, #1
  16:   2b00        cmp r3, #0
  18:   d1f6        bne.n   8 <fun+0x8>
  1a:   4770        bx  lr

r1 在循环外设置为 1。这个效率较低，因为它必须在每个循环中执行 utxb。

自然地，您永远不会希望将 char 用于这样的循环变量（也不会用于该计数器），您需要一个寄存器大小的变量，除非您需要一个大于寄存器大小的变量并且您必须承担成本.

unsigned int fun ( unsigned int tmp )
{
    unsigned int bits;
    bits=0;
    for (unsigned int pos=0; pos<32; pos++)
    {
        if (tmp & 1)
        {
            bits++;
        }
        tmp = tmp >> 1;
    }
    return(bits);
}

00000000 <fun>:
   0:   0003        movs    r3, r0
   2:   b510        push    {r4, lr}
   4:   2220        movs    r2, #32
   6:   2000        movs    r0, #0
   8:   2401        movs    r4, #1
   a:   0021        movs    r1, r4
   c:   3a01        subs    r2, #1
   e:   4019        ands    r1, r3
  10:   1840        adds    r0, r0, r1
  12:   085b        lsrs    r3, r3, #1
  14:   2a00        cmp r2, #0
  16:   d1f8        bne.n   a <fun+0xa>
  18:   bd10        pop {r4, pc}

那个好一点

unsigned int fun ( unsigned int tmp )
{
    unsigned int bits;
    bits=0;
    for (unsigned int pos=0x80000000; pos; pos>>=1)
    {
        if (tmp & pos)
        {
            bits++;
        }
    }
    return(bits);
}

更有趣的是

unsigned int fun ( unsigned int tmp )
{
    unsigned int bits;
    bits=0;
    for (unsigned int pos=0x1; pos; pos<<=1)
    {
        if (tmp & pos)
        {
            bits++;
        }
    }
    return(bits);
}

这个编译器并没有更好。

也许你正在寻找这样的东西

push {r4,lr}
mov r1,#0
mov r2,#1
mov r3,#32
top:
movs r4,r0
ands r4,r2
adds r1,r4
lsrs r0,r0,#1
subs r3,#1
bne top
mov r0,r1
pop {r4,pc}

对于位计数，但位计数会导致一些优化（不需要分支）

unsigned int fun ( unsigned int tmp, unsigned int bits )
{
    for (unsigned int pos=0; pos<32; pos++)
    {
        if (tmp & 1)
        {
            bits<<=2;
        }
        tmp >>= 1;
    }
    return(bits);
}

00000000 <fun>:
   0:   0003        movs    r3, r0
   2:   2220        movs    r2, #32
   4:   0008        movs    r0, r1
   6:   2101        movs    r1, #1
   8:   4219        tst r1, r3
   a:   d000        beq.n   e <fun+0xe>
   c:   0080        lsls    r0, r0, #2
   e:   3a01        subs    r2, #1
  10:   085b        lsrs    r3, r3, #1
  12:   2a00        cmp r2, #0
  14:   d1f8        bne.n   8 <fun+0x8>
  16:   4770        bx  lr

mov r1,#1 仍在循环之外。编译器被告知做一个 and 它正在做一个 and 并且可能没有编码优化和 1 的角落情况，稍后会右移。

unsigned int fun ( unsigned int tmp, unsigned int bits )
{
    for (unsigned int pos=0; pos<32; pos++)
    {
        tmp >>= 1;
        if (tmp & 1)
        {
            bits<<=2;
        }
    }
    return(bits);
}

这显然在功能上不一样，但是编译器在这里仍然使用 and (tst)。

需要查看 gcc 源代码，看看它何时会生成 bcc 或 bcs，并非指令集中的每条指令都被编译器使用，作者有他们最喜欢的做事方式和编译器的第一份工作是功能等价物。优化器同样必须首先在功能上等效，然后可能更高效。

天哪，好吧，所以我从来没有使用过 godbolt，而且我没有看到正确的组合 (cortex-m)，但是我尝试了 armv6m 的 clang 并且......好吧......他们展开循环以提高速度。与 -O3

-O2 的叮当声

Disassembly of section .text:

00000000 <fun>:
   0:   2220        movs    r2, #32
   2:   e003        b.n c <fun+0xc>
   4:   1e52        subs    r2, r2, #1
   6:   0840        lsrs    r0, r0, #1
   8:   2a00        cmp r2, #0
   a:   d003        beq.n   14 <fun+0x14>
   c:   07c3        lsls    r3, r0, #31
   e:   d0f9        beq.n   4 <fun+0x4>
  10:   0089        lsls    r1, r1, #2
  12:   e7f7        b.n 4 <fun+0x4>
  14:   4608        mov r0, r1
  16:   4770        bx  lr

这是另一种方法，你会产生很多分支和它的副作用（虽然 cortex-m0+ 管道很小）。这可能会表现更差，不仅仅是因为管道的东西，而且因为获取，你需要一个三层深的分支预测器缓存，但你会招致额外的获取。假定这是闪存外的 MCU 运行，闪存往往很慢，这在很大程度上取决于芯片供应商以及您运行 mcu 等的速度。更多指令可能会更快少于具有更多分支的指令。

对于这些高性能架构（arm，risc），你还需要考虑对齐，将相同的机器代码向上或向下调整一个或两个或三个半字，它可以执行百分之几十（或更快）仅仅是因为抓取。运行这段来自 ram 而不是 flash 的代码一般来说应该有帮助，但这取决于芯片供应商（arm 不是芯片供应商）以及你如何计时。

Answer 2

我没有找到一个“简单”的解决方案，所以我不得不在 assembler 中编写我的简短算法。这是演示代码的样子：

// assume these values as initialized
uint32_t data[len];     // input data bit stream
uint32_t out;           // algorithm input + output
uint32_t in;            // algorithm input (value never written in asm)

for (uint32_t idx=0; idx<len; idx++)
{
    uint32_t tmp = data[idx];
    
    // iterate over all bits
    for (uint8_t pos=0; pos<32; pos++)
    {
        // use optimized code only on supported devices
        #if defined(__CORTEX_M) && (__CORTEX_M <= 4)
        asm volatile  // doesn't need to be volatile if you use the result
        (
            "LSR    %[tmp], %[tmp], #1" "\n\t"  // shift data by one. LSB is now in carry
            "BCC    END_%="             "\n\t"  // branch if carry clear (LSB was not set)
            
            /* your code here */        "\n\t"
        
            "END_%=:"                   "\n\t"  // label only, doesn't generate any instructions
        
            : [tmp]"+l"(tmp), [out]"+l"(out)    // out; l = register 0..7 = general purpose registers
            : [in]"l"(in)                       // in;
            : "cc"                              // clobbers: "cc" = CPU status flags have changed
                               // Add any other registers you use as temporaries, or use dummy output operands to let the compiler pick registers.
        );
        #else
        if (tmp & 0b1)
        {
            // some code
        }
        tmp = tmp >> 1;
        #endif
    }
}

对于您的应用程序，在标记的位置添加您的汇编代码，并使用寄存器从 C 函数中输入数据。请记住，在 Thumb 模式下，许多指令只能使用 16 个通用寄存器中的 8 个，因此您不能传递比这更多的值。

内联汇编很容易以微妙的方式出错，这些方式看似有效，但在内联到不同的周围代码后可能会中断。（例如，忘记声明一个 clobber。）https://gcc.gnu.org/wiki/DontUseInlineAsm unless you need to (including for performance), but if so make sure you check the docs (https://whosebug.com/tags/inline-assembly/info).

请注意，技术上正确的移位指令是 LSRS（带有 s 后缀来设置标志）。但是在GCC 6.3 + GAS上写lsrs在asm代码中会导致在thumb模式下汇编错误，但是如果你写lsr它assemble s 成功进入 lsrs 指令。（在 ARM 模式下，Cortex-M 不支持，lsr 和 lsrs 都 assemble 以按预期分隔指令。）

虽然我不能分享我的应用程序代码，但我可以告诉你这个变化有多少加速：

	-O1	-O2	-O3
original	812us	780us	780us
w/ asm	748us	686us	716us
w/ asm + some loop unrolling	732us	606us	648us

因此，使用我的 ASM 代码和 -O2 而不是 -O1，我获得了 15% 的加速，并且通过额外的循环展开，我获得了 25% 的加速。

使用 __attribute__ ((section(".ramfunc"))) 将函数放在 RAM 中会产生另外 1% 的改进。（确保在您的设备上对此进行测试，某些 MCU 的闪存缓存未命中惩罚非常糟糕。）

请参阅下面 old_timer 的回答以获得更通用的优化。

ARM Cortex M0+：如何在 C 代码中使用 "Branch if Carry" 指令？

ARM Cortex M0+: How to use "Branch if Carry" instructions in C-code?

c

assembly

arm

micro-optimization

compiler-optimization