diff --git a/src/arch/x86.rs b/src/arch/x86.rs index f677c34..3e257bf 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -3,15 +3,32 @@ // whitequark // See the LICENSE file included in this distribution. -//! To understand the code in this file, keep in mind this fact: -//! * i686 SysV C ABI requires the stack to be aligned at function entry, -//! so that `%esp+4` is a multiple of 16. Aligned operands are a requirement -//! of SIMD instructions, and making this the responsibility of the caller -//! avoids having to maintain a frame pointer, which is necessary when -//! a function has to realign the stack from an unknown state. -//! * i686 SysV C ABI passes the first argument on the stack. This is -//! unfortunate, because unlike every other architecture we can't reuse -//! `swap` for the initial call, and so we use a trampoline. +// To understand the machine code in this file, keep in mind these facts: +// * i686 SysV C ABI requires the stack to be aligned at function entry, +// so that `%esp+4` is a multiple of 16. Aligned operands are a requirement +// of SIMD instructions, and making this the responsibility of the caller +// avoids having to maintain a frame pointer, which is necessary when +// a function has to realign the stack from an unknown state. +// * i686 SysV C ABI passes the first argument on the stack. This is +// unfortunate, because unlike every other architecture we can't reuse +// `swap` for the initial call, and so we use a trampoline. +// +// To understand the DWARF CFI code in this file, keep in mind these facts: +// * CFI is "call frame information"; a set of instructions to a debugger or +// an unwinder that allow it to simulate returning from functions. This implies +// restoring every register to its pre-call state, as well as the stack pointer. +// * CFA is "call frame address"; the value of stack pointer right before the call +// instruction in the caller. Everything strictly below CFA (and inclusive until +// the next CFA) is the call frame of the callee. This implies that the return +// address is the part of callee's call frame. +// * Logically, DWARF CFI is a table where rows are instruction pointer values and +// columns describe where registers are spilled (mostly using expressions that +// compute a memory location as CFA+n). A .cfi_offset pseudoinstruction changes +// the state of a column for all IP numerically larger than the one it's placed +// after. A .cfi_def_* pseudoinstruction changes the CFA value similarly. +// * Simulating return is as easy as restoring register values from the CFI table +// and then setting stack pointer to CFA. +use core::intrinsics; use stack::Stack; #[derive(Debug)] @@ -19,17 +36,50 @@ pub struct StackPointer(*mut usize); pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer { #[naked] - unsafe extern "C" fn trampoline() -> ! { + unsafe extern "C" fn init_trampoline_1() -> ! { asm!( r#" - # Pop function. - popl %ebx + # gdb has a hardcoded check that rejects backtraces where frame addresses + # do not monotonically decrease. It is turned off if the function is called + # "__morestack" and that is hardcoded. So, to make gdb backtraces match + # the actual unwinder behavior, we call ourselves "__morestack" and mark + # the symbol as local; it shouldn't interfere with anything. + __morestack: + .local __morestack + + # Set up the first part of our DWARF CFI linking stacks together. 
+ # When unwinding the frame corresponding to this function, a DWARF unwinder + # will use %ebx as the next call frame address, restore return address + # from CFA-4 and restore %ebp from CFA-8. This mirrors what the second half + # of `swap_trampoline` does. + .cfi_def_cfa %ebx, 0 + .cfi_offset %ebp, -8 + # Call the next trampoline. + call ${0:c} + + .Lend: + .size __morestack, .Lend-__morestack + "# + : : "s" (init_trampoline_2 as usize) : "memory" : "volatile"); + intrinsics::unreachable() + } + + #[naked] + unsafe extern "C" fn init_trampoline_2() -> ! { + asm!( + r#" + # Set up the second part of our DWARF CFI. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will restore %ebx (and thus CFA of the first trampoline) from the stack slot. + .cfi_offset %ebx, 4 # Push argument. + .cfi_def_cfa_offset 8 pushl %eax - # Call it. - call *%ebx - "# ::: "memory" : "volatile"); - ::core::intrinsics::unreachable() + # Call the provided function. + call *8(%esp) + "# + : : : "memory" : "volatile"); + intrinsics::unreachable() } unsafe fn push(sp: &mut StackPointer, val: usize) { @@ -38,49 +88,62 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP } let mut sp = StackPointer(stack.top() as *mut usize); - push(&mut sp, 0); // alignment - push(&mut sp, 0); // alignment - push(&mut sp, 0); // alignment + push(&mut sp, 0xdead0cfa); // CFA slot push(&mut sp, f as usize); // function - push(&mut sp, trampoline as usize); + push(&mut sp, init_trampoline_1 as usize); + push(&mut sp, 0xdeadbbbb); // saved %ebp sp } #[inline(always)] -pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) -> usize { +pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer, + new_stack: &Stack) -> usize { + // Address of the topmost CFA stack slot. + let new_cfa = (new_stack.top() as *mut usize).offset(-1); + + #[naked] + unsafe extern "C" fn swap_trampoline() -> ! { + asm!( + r#" + # Save frame pointer explicitly; the unwinder uses it to find CFA of + # the caller, and so it has to have the correct value immediately after + # the call instruction that invoked the trampoline. + pushl %ebp + + # Remember stack pointer of the old context, in case %edx==%esi. + movl %esp, %ebx + # Load stack pointer of the new context. + movl (%edx), %esp + # Save stack pointer of the old context. + movl %ebx, (%esi) + + # Restore frame pointer of the new context. + popl %ebp + + # Return into the new context. Use `pop` and `jmp` instead of a `ret` + # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge). + popl %ebx + jmpl *%ebx + "# + : : : "memory" : "volatile"); + intrinsics::unreachable(); + } + let ret: usize; asm!( r#" - # Save frame pointer explicitly; LLVM doesn't spill it even if it is - # marked as clobbered. - pushl %ebp + # Link the call stacks together. + movl %esp, (%edi) # Push instruction pointer of the old context and switch to # the new context. - call 1f - # Restore frame pointer. - popl %ebp - # Continue executing old context. - jmp 2f - - 1: - # Remember stack pointer of the old context, in case %rdx==%rsi. - movl %esp, %ebx - # Load stack pointer of the new context. - movl (%edx), %esp - # Save stack pointer of the old context. - movl %ebx, (%esi) - - # Pop instruction pointer of the new context (placed onto stack by - # the call above) and jump there; don't use `ret` to avoid return - # address mispredictions (~8ns on Ivy Bridge). 
- popl %ebx - jmpl *%ebx - 2: + call ${1:c} "# : "={eax}" (ret) - : "{eax}" (arg) + : "s" (swap_trampoline as usize) + "{eax}" (arg) "{esi}" (old_sp) "{edx}" (new_sp) + "{edi}" (new_cfa) : "eax", "ebx", "ecx", "edx", "esi", "edi", //"ebp", "esp", "mmx0", "mmx1", "mmx2", "mmx3", "mmx4", "mmx5", "mmx6", "mmx7", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", diff --git a/src/arch/x86_64.rs b/src/arch/x86_64.rs index 6afccb1..c62d00f 100644 --- a/src/arch/x86_64.rs +++ b/src/arch/x86_64.rs @@ -3,54 +3,115 @@ // whitequark // See the LICENSE file included in this distribution. -//! To understand the code in this file, keep in mind these two facts: -//! * x86_64 SysV C ABI has a "red zone": 128 bytes under the top of the stack -//! that is defined to be unmolested by signal handlers, interrupts, etc. -//! Leaf functions can use the red zone without adjusting rsp or rbp. -//! * x86_64 SysV C ABI requires the stack to be aligned at function entry, -//! so that (%rsp+8) is a multiple of 16. Aligned operands are a requirement -//! of SIMD instructions, and making this the responsibility of the caller -//! avoids having to maintain a frame pointer, which is necessary when -//! a function has to realign the stack from an unknown state. -//! * x86_64 SysV C ABI passes the first argument in %rdi. We also use %rdi -//! to pass a value while swapping context; this is an arbitrary choice -//! (we clobber all registers and could use any of them) but this allows us -//! to reuse the swap function to perform the initial call. - +// To understand the code in this file, keep in mind these two facts: +// * x86_64 SysV C ABI has a "red zone": 128 bytes under the top of the stack +// that is defined to be unmolested by signal handlers, interrupts, etc. +// Leaf functions can use the red zone without adjusting rsp or rbp. +// * x86_64 SysV C ABI requires the stack to be aligned at function entry, +// so that (%rsp+8) is a multiple of 16. Aligned operands are a requirement +// of SIMD instructions, and making this the responsibility of the caller +// avoids having to maintain a frame pointer, which is necessary when +// a function has to realign the stack from an unknown state. +// * x86_64 SysV C ABI passes the first argument in %rdi. We also use %rdi +// to pass a value while swapping context; this is an arbitrary choice +// (we clobber all registers and could use any of them) but this allows us +// to reuse the swap function to perform the initial call. +// +// To understand the DWARF CFI code in this file, keep in mind these facts: +// * CFI is "call frame information"; a set of instructions to a debugger or +// an unwinder that allow it to simulate returning from functions. This implies +// restoring every register to its pre-call state, as well as the stack pointer. +// * CFA is "call frame address"; the value of stack pointer right before the call +// instruction in the caller. Everything strictly below CFA (and inclusive until +// the next CFA) is the call frame of the callee. This implies that the return +// address is the part of callee's call frame. +// * Logically, DWARF CFI is a table where rows are instruction pointer values and +// columns describe where registers are spilled (mostly using expressions that +// compute a memory location as CFA+n). A .cfi_offset pseudoinstruction changes +// the state of a column for all IP numerically larger than the one it's placed +// after. A .cfi_def_* pseudoinstruction changes the CFA value similarly. 
+// * Simulating return is as easy as restoring register values from the CFI table +// and then setting stack pointer to CFA. +use core::intrinsics; use stack::Stack; #[derive(Debug)] pub struct StackPointer(*mut usize); pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer { + #[naked] + unsafe extern "C" fn init_trampoline_1() -> ! { + asm!( + r#" + # gdb has a hardcoded check that rejects backtraces where frame addresses + # do not monotonically decrease. It is turned off if the function is called + # "__morestack" and that is hardcoded. So, to make gdb backtraces match + # the actual unwinder behavior, we call ourselves "__morestack" and mark + # the symbol as local; it shouldn't interfere with anything. + __morestack: + .local __morestack + + # Set up the first part of our DWARF CFI linking stacks together. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will use %rbx as the next call frame address, restore return address + # from CFA-8 and restore %rbp from CFA-16. This mirrors what the second half + # of `swap_trampoline` does. + .cfi_def_cfa %rbx, 0 + .cfi_offset %rbp, -16 + # Call the next trampoline. + call ${0:c} + + .Lend: + .size __morestack, .Lend-__morestack + "# + : : "s" (init_trampoline_2 as usize) : "memory" : "volatile"); + intrinsics::unreachable() + } + + #[naked] + unsafe extern "C" fn init_trampoline_2() -> ! { + asm!( + r#" + # Set up the second part of our DWARF CFI. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will restore %rbx (and thus CFA of the first trampoline) from the stack slot. + .cfi_offset %rbx, 16 + # Call the provided function. + call *8(%rsp) + "# + : : : "memory" : "volatile"); + intrinsics::unreachable() + } + unsafe fn push(sp: &mut StackPointer, val: usize) { sp.0 = sp.0.offset(-1); *sp.0 = val } let mut sp = StackPointer(stack.top() as *mut usize); - push(&mut sp, 0); // alignment - push(&mut sp, f as usize); + push(&mut sp, 0xdeaddeaddead0cfa); // CFA slot + push(&mut sp, 0 as usize); // alignment + push(&mut sp, f as usize); // function + push(&mut sp, init_trampoline_1 as usize); + push(&mut sp, 0xdeaddeaddeadbbbb); // saved %rbp sp } #[inline(always)] -pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) -> usize { - macro_rules! swap_body { - () => { - r#" - # Save frame pointer explicitly; LLVM doesn't spill it even if it is - # marked as clobbered. - pushq %rbp - # Push instruction pointer of the old context and switch to - # the new context. - call 1f - # Restore frame pointer. - popq %rbp - # Continue executing old context. - jmp 2f +pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer, + new_stack: &Stack) -> usize { + // Address of the topmost CFA stack slot. + let new_cfa = (new_stack.top() as *mut usize).offset(-1); + + #[naked] + unsafe extern "C" fn swap_trampoline() -> ! { + asm!( + r#" + # Save frame pointer explicitly; the unwinder uses it to find CFA of + # the caller, and so it has to have the correct value immediately after + # the call instruction that invoked the trampoline. + pushq %rbp - 1: # Remember stack pointer of the old context, in case %rdx==%rsi. movq %rsp, %rbx # Load stack pointer of the new context. @@ -58,66 +119,45 @@ pub unsafe fn swap(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) # Save stack pointer of the old context. 
movq %rbx, (%rsi) - # Pop instruction pointer of the new context (placed onto stack by - # the call above) and jump there; don't use `ret` to avoid return - # address mispredictions (~8ns on Ivy Bridge). + # Restore frame pointer of the new context. + popq %rbp + + # Return into the new context. Use `pop` and `jmp` instead of a `ret` + # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge). popq %rbx jmpq *%rbx - 2: "# - } + : : : "memory" : "volatile"); + intrinsics::unreachable(); } - #[cfg(not(windows))] - #[inline(always)] - unsafe fn swap_impl(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) -> usize { - let ret: usize; - asm!(swap_body!() - : "={rdi}" (ret) - : "{rdi}" (arg) - "{rsi}" (old_sp) - "{rdx}" (new_sp) - : "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"rbp", "rsp", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", - "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31" - "cc", "fpsr", "flags", "memory" - // Ideally, we would set the LLVM "noredzone" attribute on this function - // (and it would be propagated to the call site). Unfortunately, rustc - // provides no such functionality. Fortunately, by a lucky coincidence, - // the "alignstack" LLVM inline assembly option does exactly the same - // thing on x86_64. - : "volatile", "alignstack"); - ret - } - - - #[cfg(windows)] - #[inline(always)] - unsafe fn swap_impl(arg: usize, old_sp: &mut StackPointer, new_sp: &StackPointer) -> usize { - let ret: usize; - asm!(swap_body!() - : "={rcx}" (ret) - : "{rcx}" (arg) - "{rsi}" (old_sp) - "{rdx}" (new_sp) - : "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"rbp", "rsp", - "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", - "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", - "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31" - "cc", "fpsr", "flags", "memory" - // Ideally, we would set the LLVM "noredzone" attribute on this function - // (and it would be propagated to the call site). Unfortunately, rustc - // provides no such functionality. Fortunately, by a lucky coincidence, - // the "alignstack" LLVM inline assembly option does exactly the same - // thing on x86_64. - : "volatile", "alignstack"); - ret - } - - swap_impl(arg, old_sp, new_sp) + let ret: usize; + asm!( + r#" + # Link the call stacks together. + movq %rsp, (%rcx) + # Push instruction pointer of the old context and switch to + # the new context. + call ${1:c} + "# + : "={rdi}" (ret) + : "s" (swap_trampoline as usize) + "{rdi}" (arg) + "{rsi}" (old_sp) + "{rdx}" (new_sp) + "{rcx}" (new_cfa) + : "rax", "rbx", "rcx", "rdx", "rsi", "rdi", //"rbp", "rsp", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15", + "xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23", + "xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31" + "cc", "fpsr", "flags", "memory" + // Ideally, we would set the LLVM "noredzone" attribute on this function + // (and it would be propagated to the call site). 
Unfortunately, rustc + // provides no such functionality. Fortunately, by a lucky coincidence, + // the "alignstack" LLVM inline assembly option does exactly the same + // thing on x86_64. + : "volatile", "alignstack"); + ret } diff --git a/src/context.rs b/src/context.rs index 6c78241..a936a5c 100644 --- a/src/context.rs +++ b/src/context.rs @@ -49,6 +49,6 @@ impl Context where OldStack: stack::Stack { new_ctx: *const Context, arg: usize) -> usize where NewStack: stack::Stack { - arch::swap(arg, &mut (*old_ctx).stack_ptr, &(*new_ctx).stack_ptr) + arch::swap(arg, &mut (*old_ctx).stack_ptr, &(*new_ctx).stack_ptr, &(*new_ctx).stack) } } diff --git a/src/lib.rs b/src/lib.rs index 101745d..5dd83af 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,7 +2,8 @@ // Copyright (c) edef // See the LICENSE file included in this distribution. #![feature(asm)] -#![cfg_attr(target_arch = "x86", feature(naked_functions, core_intrinsics))] +#![cfg_attr(target_arch = "x86", feature(naked_functions, core_intrinsics))] +#![cfg_attr(target_arch = "x86_64", feature(naked_functions, core_intrinsics))] #![no_std] //! libfringe is a library implementing lightweight context switches, diff --git a/src/os/mod.rs b/src/os/mod.rs index dd29536..05160a4 100644 --- a/src/os/mod.rs +++ b/src/os/mod.rs @@ -50,12 +50,14 @@ impl Stack { } impl stack::Stack for Stack { + #[inline(always)] fn top(&self) -> *mut u8 { unsafe { self.ptr.offset(self.len as isize) } } + #[inline(always)] fn limit(&self) -> *mut u8 { unsafe { self.ptr.offset(sys::page_size() as isize) diff --git a/tests/panic.rs b/tests/panic.rs new file mode 100644 index 0000000..80c21cc --- /dev/null +++ b/tests/panic.rs @@ -0,0 +1,45 @@ +// This file is part of libfringe, a low-level green threading library. +// Copyright (c) whitequark +// See the LICENSE file included in this distribution. +#![feature(thread_local)] +extern crate fringe; + +use fringe::Context; + +#[thread_local] +static mut ctx_slot: *mut Context = 0 as *mut Context<_>; + +unsafe extern "C" fn do_panic(arg: usize) -> ! { + match arg { + 0 => panic!("arg=0"), + 1 => { + Context::swap(ctx_slot, ctx_slot, 0); + panic!("arg=1"); + } + _ => unreachable!() + } +} + +#[test] +#[should_panic="arg=0"] +fn panic_after_start() { + unsafe { + let stack = fringe::OsStack::new(4 << 20).unwrap(); + let mut ctx = Context::new(stack, do_panic); + + Context::swap(&mut ctx, &ctx, 0); + } +} + +#[test] +#[should_panic="arg=1"] +fn panic_after_swap() { + unsafe { + let stack = fringe::OsStack::new(4 << 20).unwrap(); + let mut ctx = Context::new(stack, do_panic); + ctx_slot = &mut ctx; + + Context::swap(&mut ctx, &ctx, 1); + Context::swap(&mut ctx, &ctx, 0); + } +}
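For reference, the DWARF CFI notes added at the top of src/arch/x86.rs and src/arch/x86_64.rs describe the CFA, .cfi_def_cfa* and .cfi_offset in the abstract. The sketch below (plain GAS, AT&T syntax; a standalone illustration that is not part of the patch, with an invented label `ordinary_prologue`) shows how the same directives annotate a conventional frame-pointer prologue on x86_64:

    ordinary_prologue:
        .cfi_startproc
        # At entry the default rule applies: CFA = %rsp + 8, and the return
        # address pushed by `call` sits at CFA - 8.
        pushq   %rbp
        .cfi_def_cfa_offset 16        # %rsp went down by 8, so CFA = %rsp + 16 now
        .cfi_offset %rbp, -16         # the caller's %rbp is saved at CFA - 16
        movq    %rsp, %rbp
        .cfi_def_cfa_register %rbp    # compute the CFA from %rbp instead of %rsp
        # ... body; %rsp may move freely without invalidating the CFA rule ...
        popq    %rbp
        .cfi_def_cfa %rsp, 8          # back to the entry-state rule
        ret
        .cfi_endproc

Each directive updates one column of the CFI table for every instruction address that follows it, which is exactly how the init trampolines describe their frames; the difference is that their CFA deliberately points into the stack of the context that called `swap`.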
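The frame that `init` prepares on a fresh stack is also easier to follow as a picture. Below is a sketch of the x86_64 layout, highest address first, written as data directives for concreteness (illustrative only, not part of the patch; `f` and `init_trampoline_1` stand for the values the patch pushes):

        # stack.top() points just past this first slot
        .quad 0xdeaddeaddead0cfa    # CFA slot; swap() overwrites it with the old %rsp
        .quad 0                     # alignment padding
        .quad f                     # function passed to init(), called by init_trampoline_2
        .quad init_trampoline_1     # "return address" popped and jumped to by swap_trampoline
        .quad 0xdeaddeaddeadbbbb    # dummy saved %rbp, popped by swap_trampoline
        # init() returns a StackPointer that points at the dummy %rbp slot

When the new context is entered for the first time, `swap` stores the old stack pointer into the CFA slot, `swap_trampoline` pops the dummy %rbp and jumps to `init_trampoline_1`, and the trampolines' CFI lets an unwinder (or gdb, thanks to the `__morestack` trick) walk straight from `f` back onto the caller's stack.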