diff --git a/src/arch/or1k.rs b/src/arch/or1k.rs index 910d38a..c23b2ed 100644 --- a/src/arch/or1k.rs +++ b/src/arch/or1k.rs @@ -1,6 +1,7 @@ // This file is part of libfringe, a low-level green threading library. // Copyright (c) edef , // whitequark +// Amanieu d'Antras // Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be @@ -14,6 +15,31 @@ // while swapping context; this is an arbitrary choice // (we clobber all registers and could use any of them) but this allows us // to reuse the swap function to perform the initial call. +// +// To understand the DWARF CFI code in this file, keep in mind these facts: +// * CFI is "call frame information"; a set of instructions to a debugger or +// an unwinder that allow it to simulate returning from functions. This implies +// restoring every register to its pre-call state, as well as the stack pointer. +// * CFA is "call frame address"; the value of stack pointer right before the call +// instruction in the caller. Everything strictly below CFA (and inclusive until +// the next CFA) is the call frame of the callee. This implies that the return +// address is the part of callee's call frame. +// * Logically, DWARF CFI is a table where rows are instruction pointer values and +// columns describe where registers are spilled (mostly using expressions that +// compute a memory location as CFA+n). A .cfi_offset pseudoinstruction changes +// the state of a column for all IP numerically larger than the one it's placed +// after. A .cfi_def_* pseudoinstruction changes the CFA value similarly. +// * Simulating return is as easy as restoring register values from the CFI table +// and then setting stack pointer to CFA. +// +// A high-level overview of the function of the trampolines when unwinding is: +// * The 2nd init trampoline puts a controlled value (written in swap to `new_cfa`) +// into r2. This is then used as the CFA for the 1st trampoline. 
+// * This controlled value points to the bottom of the stack of the parent context, +// which holds the saved r2 and r9 from the call to swap(). +// * The 1st init trampoline tells the unwinder to restore r2 and r9 +// from the stack frame at r2 (in the parent stack), thus continuing +// unwinding at the swap call site instead of falling off the end of context stack. use stack::Stack; pub const STACK_ALIGNMENT: usize = 4; @@ -23,7 +49,7 @@ pub struct StackPointer(*mut usize); pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer { #[naked] - unsafe extern "C" fn trampoline() { + unsafe extern "C" fn trampoline_1() { asm!( r#" # gdb has a hardcoded check that rejects backtraces where frame addresses @@ -34,20 +60,21 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP __morestack: .local __morestack - # When a normal function is entered, the return address is pushed onto the stack, - # and the first thing it does is pushing the frame pointer. The init trampoline - # is not a normal function; on entry the stack pointer is one word above the place - # where the return address should be, and right under it the return address as - # well as the stack pointer are already pre-filled. So, simply move the stack - # pointer where it belongs; and add CFI just like in any other function prologue. - l.addi r1, r1, -8 - .cfi_def_cfa_offset 8 - .cfi_offset r2, -8 - l.or r2, r1, r0 - .cfi_def_cfa_register r2 - # Call f. - l.lwz r9, 8(r1) - l.jr r9 + # Set up the first part of our DWARF CFI linking stacks together. When + # we reach this function from unwinding, r2 will be pointing at the bottom + # of the parent linked stack. This link is set each time swap() is called. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will use r2+8 as the next call frame address, restore r2 from CFA-4 and + # restore return address (r9) from CFA-8. 
This mirrors what the second half + # of `swap::trampoline` does. + .cfi_def_cfa r2, 8 + .cfi_offset r2, -4 + .cfi_offset r9, -8 + + # This nop is here so that the initial swap doesn't return to the start + # of the trampoline, which confuses the unwinder since it will look for + # frame information in the previous symbol rather than this one. It is + # never actually executed. l.nop .Lend: @@ -56,59 +83,98 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP : : : : "volatile") } + #[naked] + unsafe extern "C" fn trampoline_2() { + asm!( + r#" + # Set up the second part of our DWARF CFI. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will restore r2 (and thus CFA of the first trampoline) from the stack slot. + # This stack slot is updated every time swap() is called to point to the bottom + # of the stack of the context just switched from. + .cfi_def_cfa r2, 8 + .cfi_offset r2, -4 + .cfi_offset r9, -8 + + # Call the provided function. + l.lwz r4, 8(r1) + l.jalr r4 + l.nop + "# + : : : : "volatile") + } + unsafe fn push(sp: &mut StackPointer, val: usize) { sp.0 = sp.0.offset(-1); *sp.0 = val } + // We set up the stack in a somewhat special way so that to the unwinder it + // looks like trampoline_1 has called trampoline_2, which has in turn called + // swap::trampoline. + // + // There are 2 call frames in this setup, each containing the return address + // followed by the r2 value for that frame. This setup supports unwinding + // using DWARF CFI as well as the frame pointer-based unwinding used by tools + // such as perf or dtrace. let mut sp = StackPointer(stack.base() as *mut usize); - push(&mut sp, f as usize); // function - let rsp = sp; - push(&mut sp, trampoline as usize); // trampoline / linked return address - push(&mut sp, 0xdead0bbb); // initial %ebp / linked %ebp - rsp + + push(&mut sp, f as usize); // Function that trampoline_2 should call + + // Call frame for trampoline_2. 
The CFA slot is updated by swap::trampoline + // each time a context switch is performed. + push(&mut sp, 0xdead0cfa); // CFA slot + push(&mut sp, trampoline_1 as usize + 4); // Return after the nop + + // Call frame for swap::trampoline. We set up the r2 value to point to the + // parent call frame. + let frame = sp; + push(&mut sp, frame.0 as usize); // Pointer to parent call frame + push(&mut sp, trampoline_2 as usize); // Entry point + + // The call frame for swap::trampoline lives in the red zone, just below the + // stack pointer returned here, so we return `frame` rather than `sp`. + frame } #[inline(always)] pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer, new_stack: &Stack) -> usize { // Address of the topmost CFA stack slot. - let new_cfa = (new_stack.base() as *mut usize).offset(-3); + let new_cfa = (new_stack.base() as *mut usize).offset(-2); #[naked] unsafe extern "C" fn trampoline() { asm!( r#" - # Remember the frame and instruction pointers in the callee, to link - # the stacks together later. - l.or r18, r2, r0 - l.or r19, r9, r0 + # Save the frame pointer and link register; the unwinder uses them to find + # the CFA of the caller, and so they have to have the correct value immediately + # after the call instruction that invoked the trampoline. + l.sw -4(r1), r2 + l.sw -8(r1), r9 + .cfi_offset r2, -4 + .cfi_offset r9, -8 - # Save instruction pointer of the old context. - l.sw -4(r1), r9 + # Link the call stacks together by writing the current stack bottom + # address to the CFA slot in the new stack. + l.addi r7, r1, -8 + l.sw 0(r6), r7 - # Save frame pointer explicitly; the unwinder uses it to find CFA of - # the caller, and so it has to have the correct value immediately after - # the call instruction that invoked the trampoline. - l.sw -8(r1), r2 + # Switch to the new stack for unwinding purposes. The old stack may no + # longer be valid now that we have modified the link. + .cfi_def_cfa_register r5 # Save stack pointer of the old context. 
l.sw 0(r4), r1 # Load stack pointer of the new context. l.or r1, r0, r5 + .cfi_def_cfa_register r1 - # Load frame and instruction pointers of the new context. - l.lwz r2, -8(r1) - l.lwz r9, -4(r1) + # Restore frame pointer and link register of the new context. + l.lwz r2, -4(r1) + l.lwz r9, -8(r1) - # Put the frame and instruction pointers into the trampoline stack frame, - # making it appear to return right after the call instruction that invoked - # this trampoline. This is done after the loads above, since on the very first - # swap, the saved r2/r9 intentionally alias 0(r6)/4(r6). - l.sw 0(r6), r18 - l.sw 4(r6), r19 - - # Return into new context. + # Return into the new context. l.jr r9 l.nop "# @@ -118,8 +184,7 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer, let ret: usize; asm!( r#" - # Push instruction pointer of the old context and switch to - # the new context. + # Call the trampoline to switch to the new context. l.jal ${1} l.nop "# @@ -133,7 +198,7 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer, "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31", - "flags", "memory" + "cc", "memory" : "volatile"); ret } diff --git a/src/arch/x86.rs b/src/arch/x86.rs index fbe47b2..462ee50 100644 --- a/src/arch/x86.rs +++ b/src/arch/x86.rs @@ -1,6 +1,7 @@ // This file is part of libfringe, a low-level green threading library. // Copyright (c) edef , // whitequark +// Amanieu d'Antras // Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be @@ -15,6 +16,31 @@ // * i686 SysV C ABI passes the first argument on the stack. This is // unfortunate, because unlike every other architecture we can't reuse // `swap` for the initial call, and so we use a trampoline. 
+// +// To understand the DWARF CFI code in this file, keep in mind these facts: +// * CFI is "call frame information"; a set of instructions to a debugger or +// an unwinder that allow it to simulate returning from functions. This implies +// restoring every register to its pre-call state, as well as the stack pointer. +// * CFA is "call frame address"; the value of stack pointer right before the call +// instruction in the caller. Everything strictly below CFA (and inclusive until +// the next CFA) is the call frame of the callee. This implies that the return +// address is the part of callee's call frame. +// * Logically, DWARF CFI is a table where rows are instruction pointer values and +// columns describe where registers are spilled (mostly using expressions that +// compute a memory location as CFA+n). A .cfi_offset pseudoinstruction changes +// the state of a column for all IP numerically larger than the one it's placed +// after. A .cfi_def_* pseudoinstruction changes the CFA value similarly. +// * Simulating return is as easy as restoring register values from the CFI table +// and then setting stack pointer to CFA. +// +// A high-level overview of the function of the trampolines when unwinding is: +// * The 2nd init trampoline puts a controlled value (written in swap to `new_cfa`) +// into %ebp. This is then used as the CFA for the 1st trampoline. +// * This controlled value points to the bottom of the stack of the parent context, +// which holds the saved %ebp and return address from the call to swap(). +// * The 1st init trampoline tells the unwinder to restore %ebp and its return +// address from the stack frame at %ebp (in the parent stack), thus continuing +// unwinding at the swap call site instead of falling off the end of context stack. use stack::Stack; pub const STACK_ALIGNMENT: usize = 16; @@ -25,7 +51,7 @@ pub struct StackPointer(*mut usize); pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) 
-> StackPointer { #[cfg(not(target_vendor = "apple"))] #[naked] - unsafe extern "C" fn trampoline() { + unsafe extern "C" fn trampoline_1() { asm!( r#" # gdb has a hardcoded check that rejects backtraces where frame addresses @@ -36,20 +62,26 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP __morestack: .local __morestack - # When a normal function is entered, the return address is pushed onto the stack, - # and the first thing it does is pushing the frame pointer. The init trampoline - # is not a normal function; on entry the stack pointer is one word above the place - # where the return address should be, and right under it the return address as - # well as the stack pointer are already pre-filled. So, simply move the stack - # pointer where it belongs; and add CFI just like in any other function prologue. - subl $$8, %esp - .cfi_def_cfa_offset 8 - .cfi_offset %ebp, -8 - movl %esp, %ebp - .cfi_def_cfa_register %ebp - # Call f. - pushl %eax - calll *12(%esp) + # Set up the first part of our DWARF CFI linking stacks together. When + # we reach this function from unwinding, %ebp will be pointing at the bottom + # of the parent linked stack. This link is set each time swap() is called. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will use %ebp+8 as the next call frame address, restore return address + # from CFA-4 and restore %ebp from CFA-8. This mirrors what the second half + # of `swap_trampoline` does. + .cfi_def_cfa ebp, 8 + .cfi_offset ebp, -8 + + # This nop is here so that the initial swap doesn't return to the start + # of the trampoline, which confuses the unwinder since it will look for + # frame information in the previous symbol rather than this one. It is + # never actually executed. + nop + + # Stack unwinding in some versions of libunwind doesn't seem to like + # 1-byte symbols, so we add a second nop here. 
This instruction isn't + # executed either, it is only here to pad the symbol size. + nop .Lend: .size __morestack, .Lend-__morestack @@ -59,21 +91,36 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP #[cfg(target_vendor = "apple")] #[naked] - unsafe extern "C" fn trampoline() { + unsafe extern "C" fn trampoline_1() { asm!( r#" # Identical to the above, except avoids .local/.size that aren't available on Mach-O. __morestack: .private_extern __morestack + .cfi_def_cfa ebp, 8 + .cfi_offset ebp, -8 + nop + nop + "# + : : : : "volatile") + } - subl $$8, %esp - .cfi_def_cfa_offset 8 - .cfi_offset %ebp, -8 - movl %esp, %ebp - .cfi_def_cfa_register %ebp - # Call f. - pushl %eax - calll *12(%esp) + #[naked] + unsafe extern "C" fn trampoline_2() { + asm!( + r#" + # Set up the second part of our DWARF CFI. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will restore %ebp (and thus CFA of the first trampoline) from the stack slot. + # This stack slot is updated every time swap() is called to point to the bottom + # of the stack of the context switch just switched from. + .cfi_def_cfa ebp, 8 + .cfi_offset ebp, -8 + + # Push argument. + pushl %eax + # Call the provided function. + call *12(%esp) "# : : : : "volatile") } @@ -83,10 +130,29 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP *sp.0 = val } + // We set up the stack in a somewhat special way so that to the unwinder it + // looks like trampoline_1 has called trampoline_2, which has in turn called + // swap::trampoline. + // + // There are 2 call frames in this setup, each containing the return address + // followed by the %ebp value for that frame. This setup supports unwinding + // using DWARF CFI as well as the frame pointer-based unwinding used by tools + // such as perf or dtrace. 
let mut sp = StackPointer(stack.base() as *mut usize); - push(&mut sp, f as usize); // function - push(&mut sp, trampoline as usize); // trampoline / linked return address - push(&mut sp, 0xdead0bbb); // initial %ebp / linked %ebp + + push(&mut sp, f as usize); // Function that trampoline_2 should call + + // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline + // each time a context switch is performed. + push(&mut sp, trampoline_1 as usize + 2); // Return after the 2 nops + push(&mut sp, 0xdead0cfa); // CFA slot + + // Call frame for swap::trampoline. We set up the %ebp value to point to the + // parent call frame. + let frame = sp; + push(&mut sp, trampoline_2 as usize); // Entry point + push(&mut sp, frame.0 as usize); // Pointer to parent call frame + sp } @@ -100,38 +166,38 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer, unsafe extern "C" fn trampoline() { asm!( r#" - # Remember the frame and instruction pointers in the callee, to link - # the stacks together later. We put them on stack because x86 doesn't - # have enough registers. - movl %ebp, -8(%edx) - movl (%esp), %ebx - movl %ebx, -12(%edx) - # Save frame pointer explicitly; the unwinder uses it to find CFA of # the caller, and so it has to have the correct value immediately after # the call instruction that invoked the trampoline. pushl %ebp + .cfi_adjust_cfa_offset 4 + .cfi_rel_offset ebp, 0 + + # Link the call stacks together by writing the current stack bottom + # address to the CFA slot in the new stack. + movl %esp, (%edi) + + # Switch to the new stack for unwinding purposes. The old stack may no + # longer be valid now that we have modified the link. + .cfi_def_cfa_register edx # Save stack pointer of the old context. movl %esp, (%esi) # Load stack pointer of the new context. movl %edx, %esp + .cfi_def_cfa_register esp - # Load frame and instruction pointers of the new context. + # Restore frame pointer of the new context. 
popl %ebp - popl %ebx + .cfi_adjust_cfa_offset -4 + .cfi_restore ebp - # Put the frame and instruction pointers into the trampoline stack frame, - # making it appear to return right after the call instruction that invoked - # this trampoline. This is done after the loads above, since on the very first - # swap, the saved %ebp/%ebx intentionally alias 0(%edi)/4(%edi). - movl -8(%edx), %esi - movl %esi, 0(%edi) - movl -12(%edx), %esi - movl %esi, 4(%edi) - - # Return into new context. - jmpl *%ebx + # Return into the new context. Use `pop` and `jmp` instead of a `ret` + # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge). + popl %ecx + .cfi_adjust_cfa_offset -4 + .cfi_register eip, ecx + jmpl *%ecx "# : : : : "volatile") } diff --git a/src/arch/x86_64.rs b/src/arch/x86_64.rs index ad1015a..6ed2630 100644 --- a/src/arch/x86_64.rs +++ b/src/arch/x86_64.rs @@ -1,6 +1,7 @@ // This file is part of libfringe, a low-level green threading library. // Copyright (c) edef , // whitequark +// Amanieu d'Antras // Licensed under the Apache License, Version 2.0, or the MIT license , at your option. This file may not be @@ -19,6 +20,31 @@ // to pass a value while swapping context; this is an arbitrary choice // (we clobber all registers and could use any of them) but this allows us // to reuse the swap function to perform the initial call. +// +// To understand the DWARF CFI code in this file, keep in mind these facts: +// * CFI is "call frame information"; a set of instructions to a debugger or +// an unwinder that allow it to simulate returning from functions. This implies +// restoring every register to its pre-call state, as well as the stack pointer. +// * CFA is "call frame address"; the value of stack pointer right before the call +// instruction in the caller. Everything strictly below CFA (and inclusive until +// the next CFA) is the call frame of the callee. This implies that the return +// address is the part of callee's call frame. 
+// * Logically, DWARF CFI is a table where rows are instruction pointer values and +// columns describe where registers are spilled (mostly using expressions that +// compute a memory location as CFA+n). A .cfi_offset pseudoinstruction changes +// the state of a column for all IP numerically larger than the one it's placed +// after. A .cfi_def_* pseudoinstruction changes the CFA value similarly. +// * Simulating return is as easy as restoring register values from the CFI table +// and then setting stack pointer to CFA. +// +// A high-level overview of the function of the trampolines when unwinding is: +// * The 2nd init trampoline puts a controlled value (written in swap to `new_cfa`) +// into %rbp. This is then used as the CFA for the 1st trampoline. +// * This controlled value points to the bottom of the stack of the parent context, +// which holds the saved %rbp and return address from the call to swap(). +// * The 1st init trampoline tells the unwinder to restore %rbp and its return +// address from the stack frame at %rbp (in the parent stack), thus continuing +// unwinding at the swap call site instead of falling off the end of context stack. use stack::Stack; pub const STACK_ALIGNMENT: usize = 16; @@ -29,7 +55,7 @@ pub struct StackPointer(*mut usize); pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer { #[cfg(not(target_vendor = "apple"))] #[naked] - unsafe extern "C" fn trampoline() { + unsafe extern "C" fn trampoline_1() { asm!( r#" # gdb has a hardcoded check that rejects backtraces where frame addresses @@ -40,19 +66,26 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP __morestack: .local __morestack - # When a normal function is entered, the return address is pushed onto the stack, - # and the first thing it does is pushing the frame pointer. 
The init trampoline - # is not a normal function; on entry the stack pointer is one word above the place - # where the return address should be, and right under it the return address as - # well as the stack pointer are already pre-filled. So, simply move the stack - # pointer where it belongs; and add CFI just like in any other function prologue. - subq $$16, %rsp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp - # Call f. - callq *16(%rsp) + # Set up the first part of our DWARF CFI linking stacks together. When + # we reach this function from unwinding, %rbp will be pointing at the bottom + # of the parent linked stack. This link is set each time swap() is called. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will use %rbp+16 as the next call frame address, restore return address + # from CFA-8 and restore %rbp from CFA-16. This mirrors what the second half + # of `swap_trampoline` does. + .cfi_def_cfa rbp, 16 + .cfi_offset rbp, -16 + + # This nop is here so that the initial swap doesn't return to the start + # of the trampoline, which confuses the unwinder since it will look for + # frame information in the previous symbol rather than this one. It is + # never actually executed. + nop + + # Stack unwinding in some versions of libunwind doesn't seem to like + # 1-byte symbols, so we add a second nop here. This instruction isn't + # executed either, it is only here to pad the symbol size. + nop .Lend: .size __morestack, .Lend-__morestack @@ -62,32 +95,67 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP #[cfg(target_vendor = "apple")] #[naked] - unsafe extern "C" fn trampoline() { + unsafe extern "C" fn trampoline_1() { asm!( r#" # Identical to the above, except avoids .local/.size that aren't available on Mach-O. 
__morestack: .private_extern __morestack - - subq $$16, %rsp - .cfi_def_cfa_offset 16 - .cfi_offset %rbp, -16 - movq %rsp, %rbp - .cfi_def_cfa_register %rbp - callq *16(%rsp) + .cfi_def_cfa rbp, 16 + .cfi_offset rbp, -16 + nop + nop "# : : : : "volatile") } + + #[naked] + unsafe extern "C" fn trampoline_2() { + asm!( + r#" + # Set up the second part of our DWARF CFI. + # When unwinding the frame corresponding to this function, a DWARF unwinder + # will restore %rbp (and thus CFA of the first trampoline) from the stack slot. + # This stack slot is updated every time swap() is called to point to the bottom + # of the stack of the context switch just switched from. + .cfi_def_cfa rbp, 16 + .cfi_offset rbp, -16 + + # Call the provided function. + call *16(%rsp) + "# + : : : : "volatile") + } + unsafe fn push(sp: &mut StackPointer, val: usize) { sp.0 = sp.0.offset(-1); *sp.0 = val } + // We set up the stack in a somewhat special way so that to the unwinder it + // looks like trampoline_1 has called trampoline_2, which has in turn called + // swap::trampoline. + // + // There are 2 call frames in this setup, each containing the return address + // followed by the %rbp value for that frame. This setup supports unwinding + // using DWARF CFI as well as the frame pointer-based unwinding used by tools + // such as perf or dtrace. let mut sp = StackPointer(stack.base() as *mut usize); - push(&mut sp, 0 as usize); // alignment - push(&mut sp, f as usize); // function - push(&mut sp, trampoline as usize); // trampoline / linked return address - push(&mut sp, 0xdeaddeaddead0bbb); // initial %rbp / linked %rbp + + push(&mut sp, 0 as usize); // Padding to ensure the stack is properly aligned + push(&mut sp, f as usize); // Function that trampoline_2 should call + + // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline + // each time a context switch is performed. 
+ push(&mut sp, trampoline_1 as usize + 2); // Return after the 2 nops + push(&mut sp, 0xdeaddeaddead0cfa); // CFA slot + + // Call frame for swap::trampoline. We set up the %rbp value to point to the + // parent call frame. + let frame = sp; + push(&mut sp, trampoline_2 as usize); // Entry point + push(&mut sp, frame.0 as usize); // Pointer to parent call frame + sp } @@ -101,34 +169,38 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer, unsafe extern "C" fn trampoline() { asm!( r#" - # Remember the frame and instruction pointers in the callee, to link - # the stacks together later. - movq %rbp, %r8 - movq (%rsp), %r9 - # Save frame pointer explicitly; the unwinder uses it to find CFA of # the caller, and so it has to have the correct value immediately after # the call instruction that invoked the trampoline. pushq %rbp + .cfi_adjust_cfa_offset 8 + .cfi_rel_offset rbp, 0 + + # Link the call stacks together by writing the current stack bottom + # address to the CFA slot in the new stack. + movq %rsp, (%rcx) + + # Switch to the new stack for unwinding purposes. The old stack may no + # longer be valid now that we have modified the link. + .cfi_def_cfa_register rdx # Save stack pointer of the old context. movq %rsp, (%rsi) # Load stack pointer of the new context. movq %rdx, %rsp + .cfi_def_cfa_register rsp - # Load frame and instruction pointers of the new context. + # Restore frame pointer of the new context. popq %rbp - popq %rbx + .cfi_adjust_cfa_offset -8 + .cfi_restore rbp - # Put the frame and instruction pointers into the trampoline stack frame, - # making it appear to return right after the call instruction that invoked - # this trampoline. This is done after the loads above, since on the very first - # swap, the saved %rbp/%rbx intentionally alias 0(%rcx)/8(%rcx). - movq %r8, 0(%rcx) - movq %r9, 8(%rcx) - - # Return into new context. - jmpq *%rbx + # Return into the new context. 
Use `pop` and `jmp` instead of a `ret` + # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge). + popq %rax + .cfi_adjust_cfa_offset -8 + .cfi_register rip, rax + jmpq *%rax "# : : : : "volatile") }