为什么我使用不稳定的裸函数(naked function)汇编特性时,会在 movaps 指令上出现段错误?
Why does my usage of the unstable naked assembly feature segfault on a movaps instruction?
我知道我的代码使用了很多不安全的内联汇编,但我仍然想知道为什么它只在发布(release)模式下出现段错误。我尝试过降低 opt-level,但只有当 opt-level 为 1 时它才能运行。
//! green-threads is a toy implementation on user-space threads in non-preemptive multitasking.
//! This implementation is mostly guided by cfsamson's tutorial:
//! https://cfsamson.gitbook.io/green-threads-explained-in-200-lines-of-rust/green-threads.
#![deny(missing_docs)]
#![feature(llvm_asm)]
#![feature(naked_functions)]
use std::collections::VecDeque;
use std::ptr;
// Size in bytes (2 MiB) of the stack buffer allocated for every `Task`.
const DEFAULT_STACK_SIZE: usize = 1024 * 1024 * 2;
// Address of the active `Runtime`, stored as an integer by `Runtime::init` and
// cast back to a pointer by `guard` and `yield_thread`. It is 0 until `init` runs.
static mut RUNTIME: usize = 0;
/// Runtime schedules tasks and switches between them.
///
/// `current` is the task that is executing right now; `queue` holds the tasks
/// waiting for their turn, in the order they will be resumed.
pub struct Runtime {
// Tasks waiting to run; `t_yield` and `t_return` resume the one at the front.
queue: VecDeque<Task>,
// The task whose context is currently loaded on the CPU.
current: Task,
}
/// ThreadContext contains the registers marked as "callee-saved" (preserved across calls)
/// in the specification of x86-64 architecture. They contain all the information
/// we need to resume a thread.
///
/// The struct is `#[repr(C)]` because `switch` addresses the fields by fixed byte
/// offsets from the struct pointer; do not reorder or resize the fields without
/// updating the assembly in `switch`.
#[derive(Debug, Default)]
#[repr(C)]
struct ThreadContext {
// offset 0x00 in `switch`
rsp: u64,
// offset 0x08 in `switch`
r15: u64,
// offset 0x10 in `switch`
r14: u64,
// offset 0x18 in `switch`
r13: u64,
// offset 0x20 in `switch`
r12: u64,
// offset 0x28 in `switch`
rbx: u64,
// offset 0x30 in `switch`
rbp: u64,
}
// A schedulable unit: a private stack buffer plus the register context saved
// the last time the task was switched out.
struct Task {
// Heap buffer used as the task's stack; `spawn` points the initial `rsp` near its end.
stack: Vec<u8>,
// Registers saved and restored by `switch`.
ctx: ThreadContext,
}
impl Task {
    /// Builds a task with a zero-filled stack of `DEFAULT_STACK_SIZE` bytes and an
    /// all-zero register context.
    fn new() -> Self {
        let stack = vec![0_u8; DEFAULT_STACK_SIZE];
        let ctx = ThreadContext::default();
        Self { stack, ctx }
    }
}
impl Runtime {
/// Initialize with a base thread.
pub fn new() -> Self {
let base_thread = Task::new();
Runtime {
queue: VecDeque::new(),
current: base_thread,
}
}
/// This is cheating a bit, but we need a pointer to our Runtime
/// stored so we can call yield on it even if we don't have a
/// reference to it.
pub fn init(&self) {
unsafe {
let r_ptr: *const Runtime = self;
RUNTIME = r_ptr as usize;
}
}
/// start the runtime
pub fn run(&mut self) {
while self.t_yield() {}
}
fn t_return(&mut self) -> bool {
if self.queue.len() == 0 {
return false;
}
let mut next = self.queue.pop_front().unwrap();
std::mem::swap(&mut next, &mut self.current);
unsafe {
switch(&mut next.ctx, &self.current.ctx);
}
self.queue.len() > 0
}
fn t_yield(&mut self) -> bool {
if self.queue.len() == 0 {
return false;
}
let mut next = self.queue.pop_front().unwrap();
std::mem::swap(&mut next, &mut self.current);
self.queue.push_back(next);
unsafe {
let last = self.queue.len() - 1;
switch(&mut self.queue[last].ctx, &self.current.ctx);
}
// Prevents compiler from optimizing our code away on Windows.
self.queue.len() > 0
}
/// spawn a function to be executed by runtime
pub fn spawn(&mut self, f: fn()) {
let mut available = Task::new();
let size = available.stack.len();
let s_ptr = available.stack.as_mut_ptr();
unsafe {
// put the f to the 16 bytes aligned position.
ptr::write(s_ptr.offset((size - 32) as isize) as *mut u64, f as u64);
// put the guard 1 byte next to the f for being executed after f returned.
ptr::write(s_ptr.offset((size - 24) as isize) as *mut u64, guard as u64);
available.ctx.rsp = s_ptr.offset((size - 32) as isize) as u64;
}
self.queue.push_back(available);
}
}
fn guard() {
unsafe {
let rt_ptr = RUNTIME as *mut Runtime;
(*rt_ptr).t_return();
}
}
/// Yields the current task from any place in the code, using the runtime whose
/// address `Runtime::init` stored in the global `RUNTIME`.
pub fn yield_thread() {
    // SAFETY: relies on `Runtime::init` having stored a valid runtime address.
    let runtime = unsafe { &mut *(RUNTIME as *mut Runtime) };
    runtime.t_yield();
}
/// Saves the callee-saved registers of the running task into `*old`, loads the
/// ones stored in `*new`, and `ret`s on the new stack.
///
/// The byte offsets in the assembly mirror the field order of the `#[repr(C)]`
/// `ThreadContext`. `$0` is the `old` operand and `$1` is the `new` operand.
///
/// # Safety
///
/// `old` must be valid for writes and `new` valid for reads of a
/// `ThreadContext`. `new.rsp` must point at a stack slot holding the address
/// execution should continue at, because the final `ret` pops it.
// NOTE(review): a naked function has no prologue, so this relies on the
// compiler handing the two "r" operands over in the registers the arguments
// arrived in — confirm on the toolchain in use.
#[naked]
#[inline(never)]
unsafe fn switch(old: *mut ThreadContext, new: *const ThreadContext) {
    llvm_asm!("
        mov     %rsp, 0x00($0)
        mov     %r15, 0x08($0)
        mov     %r14, 0x10($0)
        mov     %r13, 0x18($0)
        mov     %r12, 0x20($0)
        mov     %rbx, 0x28($0)
        mov     %rbp, 0x30($0)
        mov     0x00($1), %rsp
        mov     0x08($1), %r15
        mov     0x10($1), %r14
        mov     0x18($1), %r13
        mov     0x20($1), %r12
        mov     0x28($1), %rbx
        mov     0x30($1), %rbp
        ret
        "
    :
    :"r"(old), "r"(new)
    :
    : "volatile", "alignstack"
    );
}
// Demo entry point: registers the runtime globally, queues one empty task and
// runs the scheduler until `run` returns.
fn main() {
    let mut rt = Runtime::new();
    rt.init();
    let noop: fn() = || {};
    rt.spawn(noop);
    rt.run();
}
确切的段错误点在
mov qword ptr [rsp + 144], rdi
movups xmm0, xmmword ptr [rdx + rsi + 56]
; Here
movaps xmmword ptr [rsp + 128], xmm0
movups xmm0, xmmword ptr [rdx + rsi + 40]
movaps xmmword ptr [rsp + 112], xmm0
内联pop_front
内联t_return
update2:根据第一个答案并经过更多测试,问题在于 movaps 要求 [rsp + 128] 按 16 字节对齐,而在进入 guard(t_return 被内联在其中)之后,rsp 并没有按 16 字节对齐。
更多细节在我自己的回答中。
对栈内存进行 movaps 读写时出现段错误,通常说明你违反了 ABI,没有让栈指针保持对齐。调试模式下可能只是没有把这里的拷贝自动向量化,所以才没有暴露问题。
与 movups 不同,movaps 的内存操作数必须按 16 字节对齐。编译器使用 movaps 是因为它在旧 CPU 上效率更高。ABI 保证在任何函数入口处栈都按 16 字节对齐,因此编译器无需额外开销就能让局部变量获得 16 字节对齐。(这类 ABI 保证正是编译器为了效率而应当不加检查、直接假定成立的东西。)
注意,这里是从 [rdx+rsi+40] 向栈内存复制 32 个字节,所以在这些 store 指令执行之前,那块栈内存里原有的内容无关紧要。
经过更多测试,结果表明教程中的原始实现是有缺陷的,我的实现也一样。问题在于:它用紧接着的 8 个字节来存放指向 guard 的函数指针。
当被派生(spawn)的函数返回时,ret 会把栈顶弹出到 %rip,从而开始执行 guard。普通的函数调用由调用者负责栈对齐,而这里是通过 ret 直接进入 guard,于是 guard 中的栈不再按 16 字节对齐。之后一旦执行到 movaps,程序就会出现段错误。
我知道我的代码使用了很多不安全的内联汇编,但我仍然想知道为什么它只在发布(release)模式下出现段错误。我尝试过降低 opt-level,但只有当 opt-level 为 1 时它才能运行。
//! green-threads is a toy implementation on user-space threads in non-preemptive multitasking.
//! This implementation is mostly guided by cfsamson's tutorial:
//! https://cfsamson.gitbook.io/green-threads-explained-in-200-lines-of-rust/green-threads.
#![deny(missing_docs)]
#![feature(llvm_asm)]
#![feature(naked_functions)]
use std::collections::VecDeque;
use std::ptr;
// Size in bytes (2 MiB) of the stack buffer allocated for every `Task`.
const DEFAULT_STACK_SIZE: usize = 1024 * 1024 * 2;
// Address of the active `Runtime`, stored as an integer by `Runtime::init` and
// cast back to a pointer by `guard` and `yield_thread`. It is 0 until `init` runs.
static mut RUNTIME: usize = 0;
/// Runtime schedules tasks and switches between them.
///
/// `current` is the task that is executing right now; `queue` holds the tasks
/// waiting for their turn, in the order they will be resumed.
pub struct Runtime {
// Tasks waiting to run; `t_yield` and `t_return` resume the one at the front.
queue: VecDeque<Task>,
// The task whose context is currently loaded on the CPU.
current: Task,
}
/// ThreadContext contains the registers marked as "callee-saved" (preserved across calls)
/// in the specification of x86-64 architecture. They contain all the information
/// we need to resume a thread.
///
/// The struct is `#[repr(C)]` because `switch` addresses the fields by fixed byte
/// offsets from the struct pointer; do not reorder or resize the fields without
/// updating the assembly in `switch`.
#[derive(Debug, Default)]
#[repr(C)]
struct ThreadContext {
// offset 0x00 in `switch`
rsp: u64,
// offset 0x08 in `switch`
r15: u64,
// offset 0x10 in `switch`
r14: u64,
// offset 0x18 in `switch`
r13: u64,
// offset 0x20 in `switch`
r12: u64,
// offset 0x28 in `switch`
rbx: u64,
// offset 0x30 in `switch`
rbp: u64,
}
// A schedulable unit: a private stack buffer plus the register context saved
// the last time the task was switched out.
struct Task {
// Heap buffer used as the task's stack; `spawn` points the initial `rsp` near its end.
stack: Vec<u8>,
// Registers saved and restored by `switch`.
ctx: ThreadContext,
}
impl Task {
    /// Builds a task with a zero-filled stack of `DEFAULT_STACK_SIZE` bytes and an
    /// all-zero register context.
    fn new() -> Self {
        let stack = vec![0_u8; DEFAULT_STACK_SIZE];
        let ctx = ThreadContext::default();
        Self { stack, ctx }
    }
}
impl Runtime {
/// Initialize with a base thread.
pub fn new() -> Self {
let base_thread = Task::new();
Runtime {
queue: VecDeque::new(),
current: base_thread,
}
}
/// This is cheating a bit, but we need a pointer to our Runtime
/// stored so we can call yield on it even if we don't have a
/// reference to it.
pub fn init(&self) {
unsafe {
let r_ptr: *const Runtime = self;
RUNTIME = r_ptr as usize;
}
}
/// start the runtime
pub fn run(&mut self) {
while self.t_yield() {}
}
fn t_return(&mut self) -> bool {
if self.queue.len() == 0 {
return false;
}
let mut next = self.queue.pop_front().unwrap();
std::mem::swap(&mut next, &mut self.current);
unsafe {
switch(&mut next.ctx, &self.current.ctx);
}
self.queue.len() > 0
}
fn t_yield(&mut self) -> bool {
if self.queue.len() == 0 {
return false;
}
let mut next = self.queue.pop_front().unwrap();
std::mem::swap(&mut next, &mut self.current);
self.queue.push_back(next);
unsafe {
let last = self.queue.len() - 1;
switch(&mut self.queue[last].ctx, &self.current.ctx);
}
// Prevents compiler from optimizing our code away on Windows.
self.queue.len() > 0
}
/// spawn a function to be executed by runtime
pub fn spawn(&mut self, f: fn()) {
let mut available = Task::new();
let size = available.stack.len();
let s_ptr = available.stack.as_mut_ptr();
unsafe {
// put the f to the 16 bytes aligned position.
ptr::write(s_ptr.offset((size - 32) as isize) as *mut u64, f as u64);
// put the guard 1 byte next to the f for being executed after f returned.
ptr::write(s_ptr.offset((size - 24) as isize) as *mut u64, guard as u64);
available.ctx.rsp = s_ptr.offset((size - 32) as isize) as u64;
}
self.queue.push_back(available);
}
}
fn guard() {
unsafe {
let rt_ptr = RUNTIME as *mut Runtime;
(*rt_ptr).t_return();
}
}
/// Yields the current task from any place in the code, using the runtime whose
/// address `Runtime::init` stored in the global `RUNTIME`.
pub fn yield_thread() {
    // SAFETY: relies on `Runtime::init` having stored a valid runtime address.
    let runtime = unsafe { &mut *(RUNTIME as *mut Runtime) };
    runtime.t_yield();
}
/// Saves the callee-saved registers of the running task into `*old`, loads the
/// ones stored in `*new`, and `ret`s on the new stack.
///
/// The byte offsets in the assembly mirror the field order of the `#[repr(C)]`
/// `ThreadContext`. `$0` is the `old` operand and `$1` is the `new` operand.
///
/// # Safety
///
/// `old` must be valid for writes and `new` valid for reads of a
/// `ThreadContext`. `new.rsp` must point at a stack slot holding the address
/// execution should continue at, because the final `ret` pops it.
// NOTE(review): a naked function has no prologue, so this relies on the
// compiler handing the two "r" operands over in the registers the arguments
// arrived in — confirm on the toolchain in use.
#[naked]
#[inline(never)]
unsafe fn switch(old: *mut ThreadContext, new: *const ThreadContext) {
    llvm_asm!("
        mov     %rsp, 0x00($0)
        mov     %r15, 0x08($0)
        mov     %r14, 0x10($0)
        mov     %r13, 0x18($0)
        mov     %r12, 0x20($0)
        mov     %rbx, 0x28($0)
        mov     %rbp, 0x30($0)
        mov     0x00($1), %rsp
        mov     0x08($1), %r15
        mov     0x10($1), %r14
        mov     0x18($1), %r13
        mov     0x20($1), %r12
        mov     0x28($1), %rbx
        mov     0x30($1), %rbp
        ret
        "
    :
    :"r"(old), "r"(new)
    :
    : "volatile", "alignstack"
    );
}
// Demo entry point: registers the runtime globally, queues one empty task and
// runs the scheduler until `run` returns.
fn main() {
    let mut rt = Runtime::new();
    rt.init();
    let noop: fn() = || {};
    rt.spawn(noop);
    rt.run();
}
确切的段错误点在
mov qword ptr [rsp + 144], rdi
movups xmm0, xmmword ptr [rdx + rsi + 56]
; Here
movaps xmmword ptr [rsp + 128], xmm0
movups xmm0, xmmword ptr [rdx + rsi + 40]
movaps xmmword ptr [rsp + 112], xmm0
内联pop_front
内联t_return
update2:根据第一个答案并经过更多测试,问题在于 movaps 要求 [rsp + 128] 按 16 字节对齐,而在进入 guard(t_return 被内联在其中)时,rsp 并没有按 16 字节对齐。
更多细节在我自己的回答中。
对栈内存进行 movaps 读写时出现段错误,通常说明你违反了 ABI,没有让栈指针保持对齐。调试模式下可能只是没有把这里的拷贝自动向量化,所以才没有暴露问题。
与 movups 不同,movaps 的内存操作数必须按 16 字节对齐。编译器使用 movaps 是因为它在旧 CPU 上效率更高。ABI 保证在任何函数入口处栈都按 16 字节对齐,因此编译器无需额外开销就能让局部变量获得 16 字节对齐。(这类 ABI 保证正是编译器为了效率而应当不加检查、直接假定成立的东西。)
注意,这里是从 [rdx+rsi+40] 向栈内存复制 32 个字节,所以在这些 store 指令执行之前,那块栈内存里原有的内容无关紧要。
经过更多测试,结果表明教程中的原始实现是有缺陷的,我的实现也一样。问题在于:它用紧接着的 8 个字节来存放指向 guard 的函数指针。
当被派生(spawn)的函数返回时,ret 会把栈顶弹出到 %rip,从而开始执行 guard。普通的函数调用由调用者负责栈对齐,而这里是通过 ret 直接进入 guard,于是 guard 中的栈不再按 16 字节对齐。之后一旦执行到 movaps,程序就会出现段错误。