ARM 和 Intel 内部函数是否对 AES 使用相同的子密钥计划?

Do ARM and Intel intrinsics use the same subkey schedule for AES?

我正在尝试使用内部函数为 ARMv8 加入一个 AES 实现。我已经有一个 C++ 实现,还有一个 Intel 内部函数实现。

这些实现应该是等效的,所以我尝试使用 Intel 作为 ARMv8 的蓝图。有一些差异,但它们已被考虑在内。问题是,我得到了不同的结果。

// Encrypts one 16-byte block from `in` into `out` using hardware AES
// intrinsics (ARMv8 Crypto or Intel AES-NI, selected at compile time).
//   in     - 16 bytes of plaintext
//   out    - receives 16 bytes of ciphertext
//   rdkeys - round keys; the Intel branch reads rdkeys[0..rounds]
//   rounds - number of AES rounds (the text uses 10)
//
// NOTE(review): this is the listing that produces the mismatching results
// discussed in the text. The ARM branch below iterates `rounds` times where
// the answer further down concludes it should iterate `rounds-1` times.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)

    uint8x16_t data = vld1q_u8(in);

    // AES encryption with ARM intrinsics:
    // rnds-1 (9 for AES128) cycles of AES:
    // (Add, Shift, Sub) plus Mix Columns
    // NOTE(review): the loop condition is i<rounds, so it actually performs
    // `rounds` cycles (keys 0..rounds-1), not the rounds-1 described above.
    unsigned int i;
    for (i=0; i<rounds; ++i)
    {
        // AES single round encryption
        data = vaeseq_u8(data, rdkeys[i]);
        // AES mix columns
        data = vaesmcq_u8(data);
    }
    // One round of encryption: AES, no Mix Columns
    // Here i == rounds, so this uses rdkeys[rounds] and then bumps i.
    data = vaeseq_u8(data, rdkeys[i++]);
    // Final Add (bitwise Xor)
    // Here i == rounds+1: with an array of rounds+1 keys this index is one
    // past the last element.
    data = veorq_u8(data, rdkeys[i]);
    vst1q_u8(out, data);

#elif defined(__AES__)

    // Initial key addition with round key 0.
    __m128i data = _mm_loadu_si128((const __m128i*)in);
    data = _mm_xor_si128(data, rdkeys[0]);
    // Keys 1..rounds-2 here, key rounds-1 just below: rounds-1 calls to
    // _mm_aesenc_si128 in total.
    for (unsigned int i=1; i<rounds-1; ++i)
    {
        data = _mm_aesenc_si128(data, rdkeys[i]);
    }
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]);
    // Last round uses rdkeys[rounds].
    data = _mm_aesenclast_si128(data, rdkeys[rounds]);
    _mm_storeu_si128((__m128i*)out, data);

#endif
}

目前,我试图先绕过子密钥的计算。我对这两种实现使用了同一组轮密钥:

// Per-architecture aliases: Byte is a raw octet, RoundKey is the native
// 128-bit vector type that holds one AES round key.
#ifdef __ARM_FEATURE_CRYPTO
typedef uint8_t    Byte;
typedef uint8x16_t RoundKey;
#elif defined(__AES__)
typedef uint8_t    Byte;
typedef __m128i    RoundKey;
#endif

// Avoid subkey scheduling at this point
RoundKey rdkeys[ROUNDS+1];
for (size_t i=0; i<COUNTOF(rdkeys); ++i)
    memset(&rdkeys[i], (i<<4)|i, sizeof(RoundKey));

但是,我得到了不同的结果。以下是转储产生的内容:

英特尔 AES-NI:

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
...
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
Data: 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07 07
...
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
Data: 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 33
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
Data: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69
...

Out: 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69 69

ARMv8 AES:

In: FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF FF
...
Key: 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
Data: C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5 C5
...
Key: 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99 99
Data: C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3 C3
Key: AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA AA
Data: F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9 F9
...
Out: F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9

我一直对结果摸不着头脑。添加更多 printf 也无助于定位问题。我开始怀疑 Intel 和 ARM 内部函数使用了不同的子密钥计划。

ARM 和 Intel 内部函数是否对 AES 使用相同的子密钥计划?


下图来自 Cynthia Crutchfield 的论文。它考察了 Intel 内部函数与 ARM 内部函数之间的映射。


下面是完整的程序。还列出了构建它们的命令行。

英特尔:

g++ -Wall -maes aes-test.cxx -o aes-test.exe

ARMv8:

 g++ -Wall -march=armv8-a+crc+crypto -mtune=cortex-a53 aes-test.cxx -o aes-test.exe

程序:

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>

#if defined(__ARM_FEATURE_CRYPTO)
# include <arm_neon.h>
# include <arm_acle.h>
#elif defined(__AES__)
# include <wmmintrin.h>
# include <emmintrin.h>
#endif

// Per-architecture aliases: Byte is a raw octet, RoundKey is the native
// 128-bit vector type that holds one AES round key.
#ifdef __ARM_FEATURE_CRYPTO
typedef uint8_t    Byte;
typedef uint8x16_t RoundKey;
#elif defined(__AES__)
typedef uint8_t    Byte;
typedef __m128i    RoundKey;
#endif

// Element count of a true array (not a pointer): total size / element size.
#define COUNTOF(x) (sizeof(x) / sizeof((x)[0]))

// Round count used by the test driver (the text treats 10 as AES-128).
static const unsigned int ROUNDS = 10;

// Single-block routines. `rdkeys` supplies the round keys and `rounds` the
// round count. AES_decrypt is only declared in this listing.
void AES_encrypt(const Byte *in, Byte *out,
                 const RoundKey *rdkeys, unsigned int rounds);
void AES_decrypt(const Byte *in, Byte *out,
                 const RoundKey *rdkeys, unsigned int rounds);

void Print(const char* label, const Byte *in, size_t len, bool lf=false)
{
    if (label)
        printf("%s: ", label);

    for (size_t i=0; in && i<len; ++i)
        printf("%02X ", in[i]);    
    printf("\n");

    if (lf)
        printf("\n");
}

// Test driver: encrypts one all-0xFF block with synthetic round keys so the
// two intrinsic back ends can be compared without a real key schedule.
int main(int argc, char* argv[])
{
    const Byte plain[16] = {
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF
    };
    // cipher receives the result; recover is declared but not used here.
    Byte cipher[16], recover[16];

    // Round key idx is the byte (idx<<4)|idx repeated (0x00, 0x11, 0x22, ...).
    RoundKey rdkeys[ROUNDS+1];
    size_t idx = 0;
    while (idx < COUNTOF(rdkeys))
    {
        memset(&rdkeys[idx], (int)((idx<<4)|idx), sizeof(rdkeys[idx]));
        ++idx;
    }

    AES_encrypt(plain, cipher, rdkeys, ROUNDS);

    return 0;
}

// Instrumented single-block AES encryption: same computation as the short
// listing, with Print() calls that dump each round key and the intermediate
// state so the ARM and Intel traces can be compared.
//   in     - 16 bytes of plaintext
//   out    - receives 16 bytes of ciphertext
//   rdkeys - round keys; the Intel branch reads rdkeys[0..rounds]
//   rounds - number of AES rounds (main passes ROUNDS)
//
// NOTE(review): this is the reproducer for the mismatch discussed in the
// text. The ARM branch loops `rounds` times; the answer further down
// concludes the loop bound should be `rounds-1`.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
    Print("In", in, 16);

#if defined(__ARM_FEATURE_CRYPTO)

    // Load the block
    uint8x16_t data = vld1q_u8(in);

    Print("Data (in)", (Byte*)&data, 16, true);

    // AES encryption with ARM intrinsics:
    // rnds-1 (9 for AES128) cycles of AES:
    // (Add, Shift, Sub) plus Mix Columns
    // NOTE(review): the condition i<rounds makes this run `rounds` cycles
    // (keys 0..rounds-1), one more than the comment above describes.
    unsigned int i;
    for (i=0; i<rounds; ++i)
    {
        // AES single round encryption
        data = vaeseq_u8(data, rdkeys[i]);
        // AES mix columns
        data = vaesmcq_u8(data);

        // Dump the key just consumed and the state after this cycle.
        Print("Key", (Byte*)&rdkeys[i], 16);
        Print("Data", (Byte*)&data, 16, true);
    }

    // i == rounds here; the key is printed before it is applied.
    Print("Key", (Byte*)&rdkeys[i], 16);

    // One round of encryption: AES, no Mix Columns
    data = vaeseq_u8(data, rdkeys[i++]);

    Print("Data", (Byte*)&data, 16, true);

    // Final Add (bitwise Xor)
    // i == rounds+1 here: with the rounds+1 keys that main allocates, this
    // index is one past the last element of the array.
    data = veorq_u8(data, rdkeys[i]);

    Print("Data (xor)", (Byte*)&data, 16);

    // Store the output data
    vst1q_u8(out, data);

#elif defined(__AES__)

    __m128i data = _mm_loadu_si128((const __m128i*)in);

    Print("Data (in)", (Byte*)&data, 16);

    // Initial key addition with round key 0.
    data = _mm_xor_si128(data, rdkeys[0]);

    Print("Key", (Byte*)&rdkeys[0], 16);
    Print("Data (xor)", (Byte*)&data, 16, true);

    // Keys 1..rounds-2 in the loop, key rounds-1 right after it.
    for (unsigned int i=1; i<rounds-1; ++i)
    {
        data = _mm_aesenc_si128(data, rdkeys[i]);

        Print("Key", (Byte*)&rdkeys[i], 16);
        Print("Data", (Byte*)&data, 16, true);
    }
    data = _mm_aesenc_si128(data, rdkeys[rounds-1]);

    Print("Key", (Byte*)&rdkeys[rounds-1], 16);
    Print("Data", (Byte*)&data, 16, true);

    // Last round uses rdkeys[rounds].
    data = _mm_aesenclast_si128(data, rdkeys[rounds]);

    Print("Key", (Byte*)&rdkeys[rounds], 16);
    Print("Data", (Byte*)&data, 16, true);

    _mm_storeu_si128((__m128i*)out, data);

#endif

    Print("Out", out, 16);
}

Do ARM and Intel intrinsics use the same subkey schedule for AES?

看来答案是肯定的。我仍然需要针对真正的密钥调度进行测试,但我能够使用相同的密钥调度对 Intel 和 ARMv8 内部函数产生相同的结果。

Crutchfield 的参考实现中似乎有一个错误:循环控制条件应该使用 rounds-1,而不是 rounds。这意味着我在 ARMv8 上测试的是 11 轮,而不是 10 轮。当 ARMv8 代码生成 F9 F9 F9 F9 F9 F9 F9 F9 B1 FF B9 F9 F9 F9 F9 F9 而不是 F9 F9 ... F9 F9 时,我本该对此起疑。

这是更新后的代码:

// Encrypts one 16-byte block from `in` into `out` with hardware AES
// intrinsics (ARMv8 Crypto or Intel AES-NI, selected at compile time).
//   in     - 16 bytes of plaintext
//   out    - receives 16 bytes of ciphertext
//   rdkeys - round keys; indices 0..rounds are read
//   rounds - number of AES rounds (10 in the accompanying test)
// Both branches consume the same key array and are meant to produce the same
// output. They differ only in where the key addition sits relative to the
// round instruction.
void AES_encrypt(const Byte *in, Byte *out, const RoundKey *rdkeys, unsigned int rounds)
{
#if defined(__ARM_FEATURE_CRYPTO)

    uint8x16_t block = vld1q_u8(in);

    // rounds-1 full rounds: key add + sub/shift (vaeseq), then mix columns.
    unsigned int k = 0;
    while (k < rounds-1)
    {
        block = vaesmcq_u8(vaeseq_u8(block, rdkeys[k]));
        ++k;
    }

    // Last round has no mix columns; the final key is a plain XOR.
    block = vaeseq_u8(block, rdkeys[k]);
    block = veorq_u8(block, rdkeys[k+1]);

    vst1q_u8(out, block);

#elif defined(__AES__)

    // Initial key addition with round key 0.
    __m128i block = _mm_xor_si128(_mm_loadu_si128((const __m128i*)in), rdkeys[0]);

    // Keys 1..rounds-2 in the loop, key rounds-1 right after it.
    unsigned int k = 1;
    while (k < rounds-1)
    {
        block = _mm_aesenc_si128(block, rdkeys[k]);
        ++k;
    }
    block = _mm_aesenc_si128(block, rdkeys[k]);

    // Last round uses the key at index k+1.
    block = _mm_aesenclast_si128(block, rdkeys[k+1]);
    _mm_storeu_si128((__m128i*)out, block);

#endif
}