void * 到运行时 std::tuple 的第 n 个元素

void * to the nth element of std::tuple at runtime

假设我有一个 std::tuple<T...>,我想有效地访问它的第 n 个元素,其中 n 仅在运行时已知。由于类型 T... 是异构的,所以我只能得到一个 void * 并且我可以接受。这是我得出的结果:

template <size_t ... Indexes, class Tuple>
void * get_element_pointer(std::index_sequence<Indexes...>, Tuple & t, size_t idx) {
    static size_t offsets[] = {(size_t)(void *)&std::get<Indexes>(t) - (size_t)(void *)(&t)...};
    return (void *)((size_t)(void *)(&t) + offsets[idx]);
}

然后你可以这样称呼它:

get_element_pointer(std::index_sequence_for<T...>{}, some_tuple, some_index);

要点是静态创建一个 size_t 数组 offsets,其中包含每个元组元素的偏移量列表。然后,在运行时,我可以查找偏移量并将其添加到传递的元组中。

我的解决方案有两个问题困扰着我:

  1. offsets是在第一次调用这个函数时创建的,它是根据当时传递的元组实例创建的。我觉得这有点奇怪。我可以创建一个类型为 Tuple 的虚假临时文件,但它可能不是默认可构造的。或者,我可以将 nullptr 转换为 Tuple *,但随后 std::get<Indexes>(*(Tuple *)(nullptr)) 尖叫 UB。
  2. (size_t)(void *)(&t)(void *)((size_t)(void *)(&t) + offsets[idx]) 指针操作是我能找到的阻止编译器向我发出警告的唯一方法。我知道当你有虚函数等时,指针转换可能会很棘手而且很重要。所以我担心我可能会遗漏一些东西。

你觉得我的方案可以接受吗?你能想到一个更简单的解决方案吗?

  1. 就正确性而言,使用第一个通过的实例对我来说似乎不是问题。如果您尝试提前创建一个元组,您指出默认可构造性是一个问题是正确的,但是您可以再次将 nullptr 转换为 tuple* 并使用它。

  2. 也许 (void *)((size_t)(void *)(&t) + offsets[idx]) 可以更简单地写成 reinterpret_cast<char*>(&t) + offsets[idx].

为什么不只是:

template <size_t ... Indexes, class Tuple>
void* get_element_pointer(std::index_sequence<Indexes...>, Tuple & t, size_t idx) {
    void* ptrs[] = { static_cast<void *>(std::addressof(std::get<Indexes>(t)))... };
    return ptrs[idx];
}

请注意,我使用 std::addressof 来处理重载 operator & 的邪恶 class。

对于您的警告,您应该将 std::size_t 替换为 std::intptr_t and/or char*:

static std::intptr_t offsets[] = {
    reinterpret_cast<char *>(std::addresof(std::get<Indexes>(t)))
    - reinterpret_cast<char *>(&t)...
};
static_cast<void *>(reinterpret_cast<char *>(&t) + offsets[idx]);

在查看了解决方案后,我将您对性能的担忧牢记在心,并决定看看我们是否可以做得更好。

有趣的是,我尝试使用 constexpr 进行优化的结果因编译器而异。

我将在这里比较 gcc 5.3 和 apple clang 的输出:

这是我的解决方案:

#include <utility>
#include <tuple>
#include <iostream>


template<class Tuple, size_t Index> 
  void* get_address(Tuple& t)
{
  return std::addressof(std::get<Index>(t));
}

template <size_t ... Indexes, class Tuple>
constexpr void* get_element_pointer(Tuple & t, 
                          size_t idx, 
                          std::index_sequence<Indexes...>) 
{
  using function_type = void* (*)(Tuple&); 
  function_type constexpr ptrs[] = 
  {
    &get_address<Tuple, Indexes>...
  };
    return ptrs[idx](t);
}


template<class Tuple>
__attribute__((noinline))
constexpr 
  void * get_element_pointer(Tuple& t, size_t index)
{
  return get_element_pointer(t, 
                             index, 
                             std::make_index_sequence<std::tuple_size<Tuple>::value>());
}

int main()
{
  std::tuple<int, int, int, int, int, int, int , int, int, int> x;
  x = std::make_tuple(4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
  std::cout << *reinterpret_cast<int*>(get_element_pointer(x, 1)) << std::endl;
}

(为清晰起见,使用 -O2 -fomit-frame-pointer 编译)

clang 的解决方案是这样的:

__Z19get_element_pointerINSt3__15tupleIJiiiiiiiiiiEEEEPvRT_m:
    .align  4, 0x90
    leaq    __ZZ19get_element_pointerIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9EENSt3__15tupleIJiiiiiiiiiiEEEEPvRT0_mNS0_16integer_sequenceImJXspT_EEEEE4ptrs(%rip), %rax
    jmpq    *(%rax,%rsi,8)          ## TAILCALL

正如预期的那样,它指的是编译时生成的跳转 table:

__ZZ19get_element_pointerIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9EENSt3__15tupleIJiiiiiiiiiiEEEEPvRT0_mNS0_16integer_sequenceImJXspT_EEEEE4ptrs:
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm0EEPvRT_
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm1EEPvRT_
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm2EEPvRT_
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm3EEPvRT_
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm4EEPvRT_
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm5EEPvRT_
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm6EEPvRT_
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm7EEPvRT_
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm8EEPvRT_
    .quad   __Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm9EEPvRT_

其中每个访问器函数都很简单(提供了一个示例):

__Z11get_addressINSt3__15tupleIJiiiiiiiiiiEEELm2EEPvRT_:
    leaq    8(%rdi), %rax
    retq

这是我假设编译器会做的,"what I would do if I were writing machine code"

然而 gcc 似乎错过了优化跳转的机会 table 并在使用之前在内存中构建它!

void* get_element_pointer<std::tuple<int, int, int, int, int, int, int, int, int, int> >(std::tuple<int, int, int, int, int, int, int, int, int, int>&, unsigned long):
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 0ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -88(%rsp)
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 1ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -80(%rsp)
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 2ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -72(%rsp)
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 3ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -64(%rsp)
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 4ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -56(%rsp)
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 5ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -48(%rsp)
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 6ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -40(%rsp)
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 7ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -32(%rsp)
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 8ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -24(%rsp)
        movq    void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 9ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&), -16(%rsp)
        movq    -88(%rsp,%rsi,8), %rax
        jmp     *%rax

在调用类似的普通访问器之前:

void* get_address<std::tuple<int, int, int, int, int, int, int, int, int, int>, 3ul>(std::tuple<int, int, int, int, int, int, int, int, int, int>&):
        leaq    24(%rdi), %rax
        ret

所以没有被吓倒,我想知道在非 constexpr 实现中常量折叠是否可能做得更好:

template <size_t ... Indexes, class Tuple>
void* get_element_pointer(Tuple & t, 
                          size_t idx, 
                          std::index_sequence<Indexes...>) 
{
  using function_type = void* (*)(Tuple&); 
  function_type static const ptrs[] = 
  {
    &get_address<Tuple, Indexes>...
  };
    return ptrs[idx](t);
}

事实证明确实如此 - 我现在在 gcc 上获得了与使用 constexpr 解决方案生成的 clang 相同的代码:

void* get_element_pointer<std::tuple<int, int, int, int, int, int, int, int, int, int> >(std::tuple<int, int, int, int, int, int, int, int, int, int>&, unsigned long):
        movq    void* get_element_pointer<0ul, 1ul, 2ul, 3ul, 4ul, 5ul, 6ul, 7ul, 8ul, 9ul, std::tuple<int, int, int, int, int, int, int, int, int, int> >(std::tuple<int, int, int, int, int, int, int, int, int, int>&, unsigned long, std::integer_sequence<unsigned long, 0ul, 1ul, 2ul, 3ul, 4ul, 5ul, 6ul, 7ul, 8ul, 9ul>)::ptrs(,%rsi,8), %rax
        jmp     *%rax

clang 做了什么?

__Z19get_element_pointerINSt3__15tupleIJiiiiiiiiiiEEEEPvRT_m:
    movq    __ZZ19get_element_pointerIJLm0ELm1ELm2ELm3ELm4ELm5ELm6ELm7ELm8ELm9EENSt3__15tupleIJiiiiiiiiiiEEEEPvRT0_mNS0_16integer_sequenceImJXspT_EEEEE4ptrs@GOTPCREL(%rip), %rax
    jmpq    *(%rax,%rsi,8)          ## TAILCALL

令人高兴的是相同的结果。

所以这是最终的、可证明的最佳解决方案:

template<class Tuple, size_t Index>
void* get_address(Tuple& t)
{
    return std::addressof(std::get<Index>(t));
}

template <size_t ... Indexes, class Tuple>
void* get_element_pointer(Tuple & t,
                                    size_t idx,
                                    std::index_sequence<Indexes...>)
{
    using function_type = void* (*)(Tuple&);
    function_type static const ptrs[] =
    {
        &get_address<Tuple, Indexes>...
    };
    return ptrs[idx](t);
}


template<class Tuple>
__attribute__((noinline))
constexpr
void * get_element_pointer(Tuple& t, size_t index)
{
    return get_element_pointer(t,
                               index,
                               std::make_index_sequence<std::tuple_size<Tuple>::value>());
}