找不到访问 percpu 变量时所用的、看似位于堆栈中的抢占计数是在哪里声明的。(Linux 内核)

Can't find where preempt count in the stack is declared for a percpu variable access. (linux kernel)

我正在调试 Linux 的启动过程,并试图了解 percpu 变量在 arm64 上是如何工作的。为了测试,我添加了一个名为 read_pkcontext1 的函数,它返回 percpu 变量 printk_context(此值由 printk 使用)。在这个过程中,我发现了一些自己无法理解的东西。
(这是来自 linux 5.4.21)

==== kernel/printk/printk_safe.c ====

/*
 * Test helper added for this experiment: returns the value of the per-CPU
 * variable printk_context as read through this_cpu_read().
 */
int read_pkcontext1(void)   /* function I added for test */
{
    return this_cpu_read(printk_context);
}

==== include/linux/percpu-defs.h ====
/*
 * Operations with implied preemption/interrupt protection.  These
 * operations can be used without worrying about preemption or interrupt.
 *
 * this_cpu_read(pcp) hands the "this_cpu_read_" stem and the variable to
 * __pcpu_size_call_return(), which picks the size-specific reader.
 */
#define this_cpu_read(pcp)      __pcpu_size_call_return(this_cpu_read_, pcp)

==== include/linux/percpu-defs.h ====
/*
 * Size dispatcher for value-returning per-CPU operations.
 *
 * Declares a result of the same type as 'variable', runs
 * __verify_pcpu_ptr() on its address, then switches on sizeof(variable) and
 * calls stem##1, stem##2, stem##4 or stem##8 accordingly.  Any other size
 * falls through to __bad_size_call_parameter().  The statement expression
 * evaluates to the value returned by the selected operation.
 */
#define __pcpu_size_call_return(stem, variable)             \
({                                  \
    typeof(variable) pscr_ret__;                    \
    __verify_pcpu_ptr(&(variable));                 \
    switch(sizeof(variable)) {                  \
    case 1: pscr_ret__ = stem##1(variable); break;          \
    case 2: pscr_ret__ = stem##2(variable); break;          \
    case 4: pscr_ret__ = stem##4(variable); break;          \
    case 8: pscr_ret__ = stem##8(variable); break;          \
    default:                            \
        __bad_size_call_parameter(); break;         \
    }                               \
    pscr_ret__;                         \
})

下面是 aarch64-none-elf-objdump -S vmlinux 输出中 read_pkcontext1 函数及其内部调用的函数的部分(已关闭优化)。

ffffffc0100f0dc0 <read_pkcontext1>:
void write_pkcontext(void);

#pragma GCC push_options
#pragma GCC optimize ("O0")
int read_pkcontext1(void)
{
ffffffc0100f0dc0:   a9bd7bfd    stp x29, x30, [sp, #-48]!
ffffffc0100f0dc4:   910003fd    mov x29, sp
    return this_cpu_read(printk_context);
ffffffc0100f0dc8:   f9000fff    str xzr, [sp, #24]
ffffffc0100f0dcc:   52800020    mov w0, #0x1                    // #1
ffffffc0100f0dd0:   94000018    bl  ffffffc0100f0e30 <__preempt_count_add>

... skip ... 

ffffffc0100f0e30 <__preempt_count_add>:
ffffffc0100f0e30:   d5384101    mrs x1, sp_el0
ffffffc0100f0e34:   b9401022    ldr w2, [x1, #16]
    pc += val;
ffffffc0100f0e38:   0b020000    add w0, w0, w2
    case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
ffffffc0100f0e3c:   b9001020    str w0, [x1, #16]
}
ffffffc0100f0e40:   d65f03c0    ret

在上面的代码中,它以 w0 = #1(增加抢占计数)调用 __preempt_count_add,而 __preempt_count_add 函数把该值 (w0) 加到 sp + #16 处的变量上并写回。所以堆栈中的这个变量看起来就是抢占计数(我想这是用来防止抢占的)。我的问题是:堆栈中的这个值是在什么时候定义和初始化的?我在 Linux 源码中找不到它。(使用 qemu 时,我看到这个值是 1,在 __preempt_count_add 之后递增到 2;当然,在访问 percpu 变量之后又递减回 1。)

this_cpu_read(printk_context) 扩展为:
__pcpu_size_call_return(this_cpu_read_, printk_context)

({
    typeof(printk_context) pscr_ret__;
    __verify_pcpu_ptr(&(printk_context));
    switch(sizeof(printk_context)) {
    case 1: pscr_ret__ = this_cpu_read_1(printk_context); break;
    case 2: pscr_ret__ = this_cpu_read_2(printk_context); break;
    case 4: pscr_ret__ = this_cpu_read_4(printk_context); break;
    case 8: pscr_ret__ = this_cpu_read_8(printk_context); break;
    default:
        __bad_size_call_parameter(); break;
    }
    pscr_ret__;
})

sizeof(printk_context) 是 4,所以执行的是 pscr_ret__ = this_cpu_read_4(printk_context);

this_cpu_read_4() 宏由 #include <asm/percpu.h> 定义:

==== arch/arm64/include/asm/percpu.h ====

/*
 * 4-byte per-CPU read: applies __percpu_read_32 to 'pcp' through
 * _pcp_protect_return().
 */
#define this_cpu_read_4(pcp)        \
    _pcp_protect_return(__percpu_read_32, pcp)
/*
 * Run a value-returning per-CPU operation between
 * preempt_disable_notrace() and preempt_enable_notrace().
 *
 * 'op' is called with raw_cpu_ptr(&(pcp)) followed by any extra 'args'; its
 * result is cast to typeof(pcp) and becomes the value of the statement
 * expression.
 */
#define _pcp_protect_return(op, pcp, args...)               \
({                                  \
    typeof(pcp) __retval;                       \
    preempt_disable_notrace();                  \
    __retval = (typeof(pcp))op(raw_cpu_ptr(&(pcp)), ##args);    \
    preempt_enable_notrace();                   \
    __retval;                           \
})

这就是发生抢占计数操作的地方。

preempt_disable_notrace() 和 preempt_enable_notrace() 宏由 #include <linux/preempt.h> 定义。

==== include/linux/preempt.h ====

/*
 * Issue barrier() and then decrement the preempt count via
 * __preempt_count_dec().
 */
#define preempt_enable_notrace() \
do { \
    barrier(); \
    __preempt_count_dec(); \
} while (0)
/*
 * Increment the preempt count via __preempt_count_inc() and then issue
 * barrier().
 */
#define preempt_disable_notrace() \
do { \
    __preempt_count_inc(); \
    barrier(); \
} while (0)
/* Increment / decrement helpers: add or subtract 1 from the preempt count. */
#define __preempt_count_inc() __preempt_count_add(1)
#define __preempt_count_dec() __preempt_count_sub(1)

__preempt_count_add() 和 __preempt_count_sub() 由 #include <asm/preempt.h> 定义。

==== arch/arm64/include/asm/preempt.h ====

/*
 * Add 'val' to the current task's preempt count.
 *
 * The 32-bit count is read from current_thread_info()->preempt.count with
 * READ_ONCE(), adjusted, and stored back with WRITE_ONCE().
 */
static inline void __preempt_count_add(int val)
{
    u32 pc = READ_ONCE(current_thread_info()->preempt.count);
    pc += val;
    WRITE_ONCE(current_thread_info()->preempt.count, pc);
}

/*
 * Subtract 'val' from the current task's preempt count.
 *
 * The 32-bit count is read from current_thread_info()->preempt.count with
 * READ_ONCE(), adjusted, and stored back with WRITE_ONCE().
 */
static inline void __preempt_count_sub(int val)
{
    u32 pc = READ_ONCE(current_thread_info()->preempt.count);
    pc -= val;
    WRITE_ONCE(current_thread_info()->preempt.count, pc);
}

对于 arm64,CONFIG_THREAD_INFO_IN_TASK 已启用,因此 current_thread_info() 由 #include <linux/thread_info.h> 定义为一个宏。

==== include/linux/thread_info.h ====

#ifdef CONFIG_THREAD_INFO_IN_TASK
/*
 * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the
 * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels,
 * including <asm/current.h> can cause a circular dependency on some platforms.
 */
#include <asm/current.h>
/*
 * Reinterpret the 'current' task pointer as a pointer to struct thread_info.
 * NOTE(review): this cast presumably relies on thread_info being the first
 * member of struct task_struct - confirm against <linux/sched.h>.
 */
#define current_thread_info() ((struct thread_info *)current)
#endif

current 宏由 #include <asm/current.h> 定义。

==== arch/arm64/include/asm/current.h ====

/* 'current' evaluates to the task pointer returned by get_current(). */
#define current get_current()
/*
 * We don't use read_sysreg() as we want the compiler to cache the value where
 * possible.
 */
/*
 * Read the SP_EL0 system register with "mrs" and return its value
 * reinterpreted as a struct task_struct pointer.
 */
static __always_inline struct task_struct *get_current(void)
{
    unsigned long sp_el0;

    /* Plain (non-volatile) asm, consistent with the caching note above. */
    asm ("mrs %0, sp_el0" : "=r" (sp_el0));

    return (struct task_struct *)sp_el0;
}

arch/arm64/kernel/entry.S 中有一些“魔法”,用 sp_el0 堆栈指针寄存器来指向当前的 thread_info / task_struct。抱歉,我没有时间深入研究其中繁琐的细节,但它是由 commit 6cdf9c7ca687 ("arm64: Store struct thread_info in sp_el0") 引入的。

关键在于 sp_el0 寄存器与 sp 是不同的寄存器。内核不在 EL0 模式下运行,因此 sp_el0 可以当作“临时”(scratch) 寄存器使用。内核用它来指向当前的 thread_info / task_struct。

struct task_struct 由 #include <linux/sched.h> 定义。

==== include/linux/sched.h ====

struct task_struct {
#ifdef CONFIG_THREAD_INFO_IN_TASK
    /*
     * For reasons of header soup (see current_thread_info()), this
     * must be the first element of task_struct.
     */
    struct thread_info      thread_info;
#endif
    /* -1 unrunnable, 0 runnable, >0 stopped: */
    volatile long           state;

由于选择了 CONFIG_THREAD_INFO_IN_TASK,第一个成员是 struct thread_info thread_info,而 current_thread_info() 指向的正是当前任务中的这个成员。

struct thread_info 由 #include <asm/thread_info.h> 定义。

==== arch/arm64/include/asm/thread_info.h ====

/*
 * low level task data that entry.S needs immediate access to.
 *
 * The anonymous union overlays the 64-bit preempt_count with two 32-bit
 * halves, preempt.count and preempt.need_resched.  The order of the halves
 * is swapped under CONFIG_CPU_BIG_ENDIAN so that 'count' always maps onto
 * the low 32 bits of preempt_count.
 *
 * Without CONFIG_ARM64_SW_TTBR0_PAN and on a little-endian CPU,
 * preempt.count is at offset 16 from the start of the structure (assuming
 * flags and addr_limit are 8 bytes each - TODO confirm).
 */
struct thread_info {
    unsigned long       flags;      /* low level flags */
    mm_segment_t        addr_limit; /* address limit */
#ifdef CONFIG_ARM64_SW_TTBR0_PAN
    u64         ttbr0;      /* saved TTBR0_EL1 */
#endif
    union {
        u64     preempt_count;  /* 0 => preemptible, <0 => bug */
        struct {
#ifdef CONFIG_CPU_BIG_ENDIAN
            u32 need_resched;
            u32 count;
#else
            u32 count;
            u32 need_resched;
#endif
        } preempt;
    };
};

当未选择 CONFIG_ARM64_SW_TTBR0_PAN 且 CPU 为 little-endian 时,preempt.count 成员将从结构的开头偏移 16。